git.wincent.com - wikitext.git/commitdiff
Replace ANTLR lexer with Ragel scanner
authorWincent Colaiuta <win@wincent.com>
Sun, 3 Feb 2008 17:03:10 +0000 (18:03 +0100)
committerWincent Colaiuta <win@wincent.com>
Sun, 3 Feb 2008 17:03:10 +0000 (18:03 +0100)
This change involves a rather massive refactoring so I'm doing it in
several steps: this first step is basically the result of ripping out
the old lexer and implanting the new scanner, up to the point where we
get an error- and warning-free build.

From here on it's just a case of making adjustments to get all the specs
passing again.

Signed-off-by: Wincent Colaiuta <win@wincent.com>
12 files changed:
ext/parser.c [new file with mode: 0644]
ext/parser.h [new file with mode: 0644]
ext/spec/wikitext_spec.rb
ext/token.c [new file with mode: 0644]
ext/token.h [new file with mode: 0644]
ext/wikitext.c
ext/wikitext.h
ext/wikitext_ragel.c
ext/wikitext_ragel.h
ext/wikitext_ragel.rl
spec/external_link_spec.rb
spec/internal_link_spec.rb

diff --git a/ext/parser.c b/ext/parser.c
new file mode 100644 (file)
index 0000000..d6aceab
--- /dev/null
@@ -0,0 +1,2004 @@
+// Copyright 2007-2008 Wincent Colaiuta
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+#include "parser.h"
+#include "wikitext_ragel.h"
+
+// inline functions for inserting literals
+// (we have the option of caching these in global variables, frozen with rb_obj_freeze)
+
+inline VALUE escaped_no_wiki_start(void)
+{
+    return rb_str_new2("&lt;nowiki&gt;");
+}
+
+inline VALUE escaped_no_wiki_end(void)
+{
+    return rb_str_new2("&lt;/nowiki&gt;");
+}
+
+inline VALUE escaped_strong_em_start(void)
+{
+    return rb_str_new2("&lt;strong&gt;&lt;em&gt;");
+}
+
+inline VALUE escaped_strong_start(void)
+{
+    return rb_str_new2("&lt;strong&gt;");
+}
+
+inline VALUE escaped_em_start(void)
+{
+    return rb_str_new2("&lt;em&gt;");
+}
+
+inline VALUE escaped_tt_start(void)
+{
+    return rb_str_new2("&lt;tt&gt;");
+}
+
+inline VALUE escaped_tt_end(void)
+{
+    return rb_str_new2("&lt;/tt&gt;");
+}
+
+inline VALUE escaped_h6_start(void)
+{
+    return rb_str_new2("&lt;h6&gt;");
+}
+
+inline VALUE escaped_h6_end(void)
+{
+    return rb_str_new2("&lt;/h6&gt;");
+}
+
+inline VALUE escaped_h5_start(void)
+{
+    return rb_str_new2("&lt;h5&gt;");
+}
+
+inline VALUE escaped_h5_end(void)
+{
+    return rb_str_new2("&lt;/h5&gt;");
+}
+
+inline VALUE escaped_h4_start(void)
+{
+    return rb_str_new2("&lt;h4&gt;");
+}
+
+inline VALUE escaped_h4_end(void)
+{
+    return rb_str_new2("&lt;/h4&gt;");
+}
+
+inline VALUE escaped_h3_start(void)
+{
+    return rb_str_new2("&lt;h3&gt;");
+}
+
+inline VALUE escaped_h3_end(void)
+{
+    return rb_str_new2("&lt;/h3&gt;");
+}
+
+inline VALUE escaped_h2_start(void)
+{
+    return rb_str_new2("&lt;h2&gt;");
+}
+
+inline VALUE escaped_h2_end(void)
+{
+    return rb_str_new2("&lt;/h2&gt;");
+}
+
+inline VALUE escaped_h1_start(void)
+{
+    return rb_str_new2("&lt;h1&gt;");
+}
+
+inline VALUE escaped_h1_end(void)
+{
+    return rb_str_new2("&lt;/h1&gt;");
+}
+
+inline VALUE pre_start(void)
+{
+    return rb_str_new2("<pre>");
+}
+
+inline VALUE pre_end(void)
+{
+    return rb_str_new2("</pre>");
+}
+
+inline VALUE blockquote_start(void)
+{
+    return rb_str_new2("<blockquote>");
+}
+
+inline VALUE blockquote_end(void)
+{
+    return rb_str_new2("</blockquote>");
+}
+
+inline VALUE strong_em_start(void)
+{
+    return rb_str_new2("<strong><em>");
+}
+
+inline VALUE strong_start(void)
+{
+    return rb_str_new2("<strong>");
+}
+
+inline VALUE strong_end(void)
+{
+    return rb_str_new2("</strong>");
+}
+
+inline VALUE em_start(void)
+{
+    return rb_str_new2("<em>");
+}
+
+inline VALUE em_end(void)
+{
+    return rb_str_new2("</em>");
+}
+
+inline VALUE tt_start(void)
+{
+    return rb_str_new2("<tt>");
+}
+
+inline VALUE tt_end(void)
+{
+    return rb_str_new2("</tt>");
+}
+
+inline VALUE ol_start(void)
+{
+    return rb_str_new2("<ol>");
+}
+
+inline VALUE ol_end(void)
+{
+    return rb_str_new2("</ol>");
+}
+
+inline VALUE ul_start(void)
+{
+    return rb_str_new2("<ul>");
+}
+
+inline VALUE ul_end(void)
+{
+    return rb_str_new2("</ul>");
+}
+
+inline VALUE li_start(void)
+{
+    return rb_str_new2("<li>");
+}
+
+inline VALUE li_end(void)
+{
+    return rb_str_new2("</li>");
+}
+
+inline VALUE h6_start(void)
+{
+    return rb_str_new2("<h6>");
+}
+
+inline VALUE h6_end(void)
+{
+    return rb_str_new2("</h6>");
+}
+
+inline VALUE h5_start(void)
+{
+    return rb_str_new2("<h5>");
+}
+
+inline VALUE h5_end(void)
+{
+    return rb_str_new2("</h5>");
+}
+
+inline VALUE h4_start(void)
+{
+    return rb_str_new2("<h4>");
+}
+
+inline VALUE h4_end(void)
+{
+    return rb_str_new2("</h4>");
+}
+
+inline VALUE h3_start(void)
+{
+    return rb_str_new2("<h3>");
+}
+
+inline VALUE h3_end(void)
+{
+    return rb_str_new2("</h3>");
+}
+
+inline VALUE h2_start(void)
+{
+    return rb_str_new2("<h2>");
+}
+
+inline VALUE h2_end(void)
+{
+    return rb_str_new2("</h2>");
+}
+
+inline VALUE h1_start(void)
+{
+    return rb_str_new2("<h1>");
+}
+
+inline VALUE h1_end(void)
+{
+    return rb_str_new2("</h1>");
+}
+
+inline VALUE p_start(void)
+{
+    return rb_str_new2("<p>");
+}
+
+inline VALUE p_end(void)
+{
+    return rb_str_new2("</p>");
+}
+
+inline VALUE space(void)
+{
+    return rb_str_new2(" ");
+}
+
+inline VALUE a_start(void)
+{
+    return rb_str_new2("<a href=\"");
+}
+
+inline VALUE a_class(void)
+{
+    return rb_str_new2("\" class=\"");
+}
+
+inline VALUE a_start_close(void)
+{
+    return rb_str_new2("\">");
+}
+
+inline VALUE a_end(void)
+{
+    return rb_str_new2("</a>");
+}
+
+inline VALUE link_start(void)
+{
+    return rb_str_new2("[[");
+}
+
+inline VALUE link_end(void)
+{
+    return rb_str_new2("]]");
+}
+
+inline VALUE separator(void)
+{
+    return rb_str_new2("|");
+}
+
+inline VALUE ext_link_start(void)
+{
+    return rb_str_new2("[");
+}
+
+inline VALUE backtick(void)
+{
+    return rb_str_new2("`");
+}
+
+inline VALUE quote(void)
+{
+    return rb_str_new2("\"");
+}
+
+inline VALUE ampersand(void)
+{
+    return rb_str_new2("&");
+}
+
+inline VALUE quot_entity(void)
+{
+    return rb_str_new2("&quot;");
+}
+
+inline VALUE amp_entity(void)
+{
+    return rb_str_new2("&amp;");
+}
+
+inline VALUE lt_entity(void)
+{
+    return rb_str_new2("&lt;");
+}
+
+inline VALUE gt_entity(void)
+{
+    return rb_str_new2("&gt;");
+}
+
+inline VALUE ext_link_end(void)
+{
+    return rb_str_new2("]");
+}
+
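+// Note on the scanner interface (next_token, declared in wikitext_ragel.h and generated from
+// ext/wikitext_ragel.rl): the first call passes NULL for the previous token along with the buffer
+// pointer p; subsequent calls pass the previous token and NULL, so that scanning can resume from
+// where that token left off.
+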
+// for testing and debugging only
+VALUE Wikitext_parser_tokenize(VALUE self, VALUE string)
+{
+    if (NIL_P(string))
+        return Qnil;
+    string = StringValue(string);
+    VALUE tokens = rb_ary_new();
+    char *p = RSTRING_PTR(string);
+    long len = RSTRING_LEN(string);
+    char *pe = p + len;
+    token_t token;
+    next_token(&token, NULL, p, pe);
+    rb_ary_push(tokens, _Wikitext_token(&token));
+    while (token.type != END_OF_FILE)
+    {
+        next_token(&token, &token, NULL, pe);
+        rb_ary_push(tokens, _Wikitext_token(&token));
+    }
+    return tokens;
+}
+
+// for benchmarking raw tokenization speed only
+VALUE Wikitext_parser_benchmarking_tokenize(VALUE self, VALUE string)
+{
+    if (NIL_P(string))
+        return Qnil;
+    string = StringValue(string);
+    char *p = RSTRING_PTR(string);
+    long len = RSTRING_LEN(string);
+    char *pe = p + len;
+    token_t token;
+    next_token(&token, NULL, p, pe);
+    while (token.type != END_OF_FILE)
+        next_token(&token, &token, NULL, pe);
+    return Qnil;
+}
+
+// we downcase "in place", overwriting the original contents of the buffer and returning the same string
+inline VALUE _Wikitext_downcase(VALUE string)
+{
+    char *ptr   = RSTRING_PTR(string);
+    long len    = RSTRING_LEN(string);
+    for (long i = 0; i < len; i++)
+    {
+        if (ptr[i] >= 'A' && ptr[i] <= 'Z')
+            ptr[i] += 32;
+    }
+    return string;
+}
+
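+// Builds an anchor tag from its parts; for example, with link_prefix "/wiki/", link_target "foo",
+// link_text "foo" and a nil link_class, this produces: <a href="/wiki/foo">foo</a>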
+inline VALUE _Wikitext_hyperlink(VALUE link_prefix, VALUE link_target, VALUE link_text, VALUE link_class)
+{
+    VALUE string = a_start();               // <a href="
+    if (!NIL_P(link_prefix))
+        rb_str_append(string, link_prefix);
+    rb_str_append(string, link_target);
+    if (link_class != Qnil)
+    {
+        rb_str_append(string, a_class());   // " class="
+        rb_str_append(string, link_class);
+    }
+    rb_str_append(string, a_start_close()); // ">
+    rb_str_append(string, link_text);
+    rb_str_append(string, a_end());
+    return string;
+}
+
+// Returns a count indicating the number of times the token appears in the collection.
+inline long _Wikitext_count(VALUE token, VALUE collection)
+{
+    long count = 0;
+    for (long i = 0, max = RARRAY_LEN(collection); i < max; i++)
+    {
+        if (FIX2INT(rb_ary_entry(collection, i)) == FIX2INT(token))
+            count++;
+    }
+    return count;
+}
+
+// Pops a single item off the stack.
+// A corresponding closing tag is written to the target string.
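+// For example, if the top of the stack is H3_START, this appends "</h3>" plus the line ending to
+// the target string and removes H3_START from the stack.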
+void _Wikitext_pop_from_stack(VALUE stack, VALUE target, VALUE line_ending)
+{
+    VALUE top = rb_ary_entry(stack, -1);
+    if (NIL_P(top))
+        return;
+    switch (FIX2INT(top))
+    {
+        case PRE:
+            rb_str_append(target, pre_end());
+            rb_str_append(target, line_ending);
+            break;
+
+        case BLOCKQUOTE:
+            rb_str_append(target, blockquote_end());
+            rb_str_append(target, line_ending);
+            break;
+
+        case NO_WIKI_START:
+            // not a real HTML tag; so nothing to emit
+            break;
+
+        case STRONG:
+            rb_str_append(target, strong_end());
+            break;
+
+        case EM:
+            rb_str_append(target, em_end());
+            break;
+
+        case TT:
+        case TT_START:
+            rb_str_append(target, tt_end());
+            break;
+
+        case OL:
+            rb_str_append(target, ol_end());
+            rb_str_append(target, line_ending);
+            break;
+
+        case UL:
+            rb_str_append(target, ul_end());
+            rb_str_append(target, line_ending);
+            break;
+
+        case LI:
+            rb_str_append(target, li_end());
+            rb_str_append(target, line_ending);
+            break;
+
+        case H6_START:
+            rb_str_append(target, h6_end());
+            rb_str_append(target, line_ending);
+            break;
+
+        case H5_START:
+            rb_str_append(target, h5_end());
+            rb_str_append(target, line_ending);
+            break;
+
+        case H4_START:
+            rb_str_append(target, h4_end());
+            rb_str_append(target, line_ending);
+            break;
+
+        case H3_START:
+            rb_str_append(target, h3_end());
+            rb_str_append(target, line_ending);
+            break;
+
+        case H2_START:
+            rb_str_append(target, h2_end());
+            rb_str_append(target, line_ending);
+            break;
+
+        case H1_START:
+            rb_str_append(target, h1_end());
+            rb_str_append(target, line_ending);
+            break;
+
+        case EXT_LINK_START:
+            // not an HTML tag; so nothing to emit
+            break;
+
+        case SPACE:
+            // not an HTML tag (only used to separate an external link target from the link text); so nothing to emit
+            break;
+
+        case SEPARATOR:
+            // not an HTML tag (only used to separate an internal link target from the link text); so nothing to emit
+            break;
+
+        case P:
+            rb_str_append(target, p_end());
+            rb_str_append(target, line_ending);
+            break;
+
+        default:
+            // should probably raise an exception here
+            break;
+    }
+    rb_ary_delete_at(stack, -1);
+}
+
+// Pops items off top of stack, accumulating closing tags for them into the target string, until item is reached.
+// If including is Qtrue then the item itself is also popped.
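+// For example, with a stack of [P, STRONG, EM] and item STRONG: passing Qfalse for including
+// leaves [P, STRONG] on the stack, while passing Qtrue leaves just [P].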
+void _Wikitext_pop_from_stack_up_to(VALUE stack, VALUE target, VALUE item, VALUE including, VALUE line_ending)
+{
+    int continue_looping = 1;
+    do
+    {
+        VALUE top = rb_ary_entry(stack, -1);
+        if (NIL_P(top))
+            return;
+        if (FIX2INT(top) == FIX2INT(item))
+        {
+            if (including != Qtrue)
+                return;
+            continue_looping = 0;
+        }
+        _Wikitext_pop_from_stack(stack, target, line_ending);
+    } while (continue_looping);
+}
+
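+// Opens a new paragraph (<p>) if no block-level element is open yet, or if the top of the scope
+// stack is a BLOCKQUOTE with nothing in it yet; otherwise, if a paragraph is already open and a
+// CRLF is pending, the pending CRLF is converted into a single space. Does nothing in capture mode.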
+inline void _Wikitext_start_para_if_necessary(VALUE capture, VALUE scope, VALUE line, VALUE output,
+    VALUE *pending_crlf)
+{
+    if (!NIL_P(capture)) // we don't do anything if in capturing mode
+        return;
+    // if no block open yet, or top of stack is BLOCKQUOTE (with nothing in it yet)
+    if ((RARRAY_LEN(scope) == 0) || (FIX2INT(rb_ary_entry(scope, -1)) == BLOCKQUOTE))
+    {
+        rb_str_append(output, p_start());
+        rb_ary_push(scope, INT2FIX(P));
+        rb_ary_push(line, INT2FIX(P));
+    }
+    else if (rb_ary_includes(scope, INT2FIX(P)) && *pending_crlf == Qtrue)
+        // already in a paragraph block; convert pending CRLF into a space
+        rb_str_append(output, space());
+    *pending_crlf = Qfalse;
+}
+
+// Helper function that pops any excess elements off scope (pushing is already handled in the respective rules).
+// For example, given input like:
+//      > > foo
+//      bar
+// Upon seeing "bar", we want to pop two BLOCKQUOTE elements from the scope.
+// The reverse case (shown below) is handled from inside the BLOCKQUOTE rule itself:
+//      foo
+//      > > bar
+void inline _Wikitext_pop_excess_elements(VALUE capture, VALUE scope, VALUE line, VALUE output, VALUE line_ending)
+{
+    if (!NIL_P(capture)) // we don't pop anything if in capturing mode
+        return;
+    for (long i = RARRAY_LEN(scope), j = RARRAY_LEN(line); i > j; i--)
+    {
+        // special case for last item on scope
+        if (i - j == 1)
+        {
+            // don't auto-pop P if it is only item on scope
+            long k = FIX2INT(rb_ary_entry(scope, -1));
+            if (k == P)
+                continue;
+            else if (k != FIX2INT(rb_ary_entry(line, -1)))
+            {
+                // pop off one more item in cases like this:
+                // * foo
+                //   pre
+                // seems necessary in the PRE case because there's something braindead with my PRE implementation
+                // other rules (e.g. BLOCKQUOTE, H6 etc.) seem to handle this fine
+                _Wikitext_pop_from_stack(scope, output, line_ending);
+            }
+        }
+        _Wikitext_pop_from_stack(scope, output, line_ending);
+    }
+}
+
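+// Converts a UTF-32 character into an (8-byte) hexadecimal numeric entity;
+// for example, 0x20ac (the Euro sign) becomes "&#x20ac;".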
+inline VALUE _Wikitext_utf32_char_to_entity(uint32_t character)
+{
+    // TODO: consider special casing some entities (ie. quot, amp, lt, gt etc)?
+    char hex_string[8]  = { '&', '#', 'x', 0, 0, 0, 0, ';' };
+    char scratch        = (character & 0xf000) >> 12;
+    hex_string[3]       = (scratch <= 9 ? scratch + 48 : scratch + 87);
+    scratch             = (character & 0x0f00) >> 8;
+    hex_string[4]       = (scratch <= 9 ? scratch + 48 : scratch + 87);
+    scratch             = (character & 0x00f0) >> 4;
+    hex_string[5]       = (scratch <= 9 ? scratch + 48 : scratch + 87);
+    scratch             = character & 0x000f;
+    hex_string[6]       = (scratch <= 9 ? scratch + 48 : scratch + 87);
+    return rb_str_new((const char *)hex_string, sizeof(hex_string));
+}
+
+// - non-printable (non-ASCII) characters converted to numeric entities
+// - QUOT and AMP characters converted to named entities
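+// For example, the link target: foo, "bar" & baz
+// is sanitized to: foo, &quot;bar&quot; &amp; baz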
+inline VALUE _Wikitext_sanitize_link_target(VALUE self, VALUE string)
+{
+    string              = StringValue(string);  // raises if string is nil or doesn't quack like a string
+    char    *src        = RSTRING_PTR(string);
+    long    len         = RSTRING_LEN(string);
+    char    *end        = src + len;
+
+    // start with a destination buffer twice the size of the source, will realloc if necessary
+    // slop = (len / 8) * 8 (ie. one in every 8 characters can be converted into an entity, each entity requires 8 bytes)
+    // this efficiently handles the most common case (where the size of the buffer doesn't change much)
+    char    *dest       = ALLOC_N(char, len * 2);
+    char    *dest_ptr   = dest; // hang on to this so we can pass it to free() later
+
+    while (src < end)
+    {
+        // need at most 8 characters (8 bytes) to display each character
+        if (dest + 8 > dest_ptr + len)                      // outgrowing buffer, must reallocate
+        {
+            char *old_dest      = dest;
+            char *old_dest_ptr  = dest_ptr;
+            len                 = len + (end - src) * 8;    // allocate enough for worst case
+            dest                = realloc(dest_ptr, len);   // will never have to realloc more than once
+            if (dest == NULL)
+            {
+                // would have used reallocf, but this has to run on Linux too, not just Darwin
+                free(dest_ptr);
+                rb_raise(rb_eNoMemError, "failed to re-allocate temporary storage (memory allocation error)");
+            }
+            dest_ptr    = dest;
+            dest        = dest_ptr + (old_dest - old_dest_ptr);
+        }
+
+        if (*src == '"')                 // QUOT
+        {
+            // use an array (not a pointer) so that sizeof yields the length of the literal rather than the pointer size
+            const char quot_entity_literal[] = "&quot;";
+            memcpy(dest, quot_entity_literal, sizeof(quot_entity_literal) - 1);    // -1: don't copy the trailing NUL
+            dest += sizeof(quot_entity_literal) - 1;
+        }
+        else if (*src == '&')            // AMP
+        {
+            const char amp_entity_literal[] = "&amp;";
+            memcpy(dest, amp_entity_literal, sizeof(amp_entity_literal) - 1);      // -1: don't copy the trailing NUL
+            dest += sizeof(amp_entity_literal) - 1;
+        }
+        else if (*src == '<')           // LESS_THAN
+        {
+            free(dest_ptr);
+            rb_raise(rb_eRangeError, "invalid link text (\"<\" may not appear in link text)");
+        }
+        else if (*src == '>')           // GREATER_THAN
+        {
+            free(dest_ptr);
+            rb_raise(rb_eRangeError, "invalid link text (\">\" may not appear in link text)");
+        }
+        else if (*src >= 0x20 && *src <= 0x7e)    // printable ASCII
+        {
+            *dest = *src;
+            dest++;
+        }
+        else    // all others: must convert to entities
+        {
+            VALUE       entity      = _Wikitext_utf32_char_to_entity(*src);
+            char        *entity_src = RSTRING_PTR(entity);
+            long        entity_len  = RSTRING_LEN(entity); // should always be 8 characters (8 bytes)
+            memcpy(dest, entity_src, entity_len);
+            dest += entity_len;
+        }
+        src++;
+    }
+    VALUE out = rb_str_new(dest_ptr, dest - dest_ptr);
+    free(dest_ptr);
+    return out;
+}
+
+VALUE Wikitext_sanitize_link_target(VALUE self, VALUE string)
+{
+    return (_Wikitext_sanitize_link_target(self, string));
+}
+
+// encodes the input string according to RFCs 2396 and 2718
+// input is the pointer to the string, and len is its length in characters (not in bytes)
+// note that the first character of the link target is not case-sensitive
+// (this is a recommended application-level constraint; it is not imposed at this level)
+// this is to allow links like:
+//         ...the [[foo]] is...
+// to be equivalent to:
+//         thing. [[Foo]] was...
+// TODO: this is probably the right place to check if treat_slash_as_special is true and act accordingly
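+// For example, "foo bar" is encoded as "foo%20bar" (the space, 0x20, is not in the unreserved set).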
+inline static VALUE _Wikitext_encode_link_target(VALUE self, VALUE in)
+{
+    char        *input  = RSTRING_PTR(in);
+    long        len     = RSTRING_LEN(in);
+    static char hex[]   = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
+
+    // to avoid most reallocations start with a destination buffer twice the size of the source
+    // this handles the most common case (where most chars are in the ASCII range and don't require more storage, but there are
+    // often quite a few spaces, which are encoded as "%20" and occupy 3 bytes)
+    // the worst case is where _every_ byte must be written out using 3 bytes
+    long dest_len   = len * 2;
+    char *dest      = ALLOC_N(char, dest_len);
+    char *dest_ptr  = dest; // hang on to this so we can pass it to free() later
+
+    for (long i = 0; i < len; i++, input++)
+    {
+        if ((dest + 3) > (dest_ptr + dest_len))     // worst case: a single character may grow to 3 characters once encoded
+        {
+            // outgrowing buffer, must reallocate
+            char *old_dest      = dest;
+            char *old_dest_ptr  = dest_ptr;
+            dest_len            += len;
+            dest                = realloc(dest_ptr, dest_len);
+            if (dest == NULL)
+            {
+                // would have used reallocf, but this has to run on Linux too, not just Darwin
+                free(dest_ptr);
+                rb_raise(rb_eNoMemError, "failed to re-allocate temporary storage (memory allocation error)");
+            }
+            dest_ptr    = dest;
+            dest        = dest_ptr + (old_dest - old_dest_ptr);
+        }
+
+        // pass through unreserved characters
+        if (((*input >= 'a') && (*input <= 'z')) ||
+            ((*input >= 'A') && (*input <= 'Z')) ||
+            ((*input >= '0') && (*input <= '9')) ||
+            (*input == '-') ||
+            (*input == '_') ||
+            (*input == '.') ||
+            (*input == '~'))
+            *dest++ = *input;
+        else    // everything else gets URL-encoded
+        {
+            *dest++ = '%';
+            *dest++ = hex[(*input) / 16];   // left
+            *dest++ = hex[(*input) % 16];   // right
+        }
+    }
+    VALUE out = rb_str_new(dest_ptr, dest - dest_ptr);
+    free(dest_ptr);
+    return out;
+}
+
+VALUE Wikitext_encode_link_target(VALUE self, VALUE in)
+{
+    return _Wikitext_encode_link_target(self, in);
+}
+
+// not sure whether these rollback functions should be inline: could refactor them into a single non-inlined function
+inline void _Wikitext_rollback_failed_link(VALUE output, VALUE scope, VALUE line, VALUE link_target, VALUE link_text,
+    VALUE link_class, VALUE line_ending)
+{
+    // I'd like to remove this paragraph creation from here and instead put it where the scope is first entered: would be cleaner
+    // same for the method below
+    // basically we can create a paragraph at that point because we know we'll either be emitting a valid link or the residue
+    // left behind by an invalid one
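+    // For example, if link_target is "foo" and no SEPARATOR was seen, this ends up emitting the
+    // literal text "[[foo" rather than a hyperlink.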
+    int scope_includes_separator = rb_ary_includes(scope, INT2FIX(SEPARATOR));
+    _Wikitext_pop_from_stack_up_to(scope, output, INT2FIX(LINK_START), Qtrue, line_ending);
+    if (!rb_ary_includes(scope, INT2FIX(P)) &&
+        !rb_ary_includes(scope, INT2FIX(H6_START)) &&
+        !rb_ary_includes(scope, INT2FIX(H5_START)) &&
+        !rb_ary_includes(scope, INT2FIX(H4_START)) &&
+        !rb_ary_includes(scope, INT2FIX(H3_START)) &&
+        !rb_ary_includes(scope, INT2FIX(H2_START)) &&
+        !rb_ary_includes(scope, INT2FIX(H1_START)))
+    {
+        // create a paragraph if necessary
+        rb_str_append(output, p_start());
+        rb_ary_push(scope, INT2FIX(P));
+        rb_ary_push(line, INT2FIX(P));
+    }
+    rb_str_append(output, link_start());
+    if (!NIL_P(link_target))
+    {
+        VALUE sanitized = Wikitext_sanitize_link_target(Qnil, link_target);
+        rb_str_append(output, sanitized);
+        if (scope_includes_separator)
+        {
+            rb_str_append(output, separator());
+            if (!NIL_P(link_text))
+                rb_str_append(output, link_text);
+        }
+    }
+}
+
+inline void _Wikitext_rollback_failed_external_link(VALUE output, VALUE scope, VALUE line, VALUE link_target,
+    VALUE link_text, VALUE link_class, VALUE autolink, VALUE line_ending)
+{
+    int scope_includes_space = rb_ary_includes(scope, INT2FIX(SPACE));
+    _Wikitext_pop_from_stack_up_to(scope, output, INT2FIX(EXT_LINK_START), Qtrue, line_ending);
+    if (!rb_ary_includes(scope, INT2FIX(P)) &&
+        !rb_ary_includes(scope, INT2FIX(H6_START)) &&
+        !rb_ary_includes(scope, INT2FIX(H5_START)) &&
+        !rb_ary_includes(scope, INT2FIX(H4_START)) &&
+        !rb_ary_includes(scope, INT2FIX(H3_START)) &&
+        !rb_ary_includes(scope, INT2FIX(H2_START)) &&
+        !rb_ary_includes(scope, INT2FIX(H1_START)))
+    {
+        // create a paragraph if necessary
+        rb_str_append(output, p_start());
+        rb_ary_push(scope, INT2FIX(P));
+        rb_ary_push(line, INT2FIX(P));
+    }
+    rb_str_append(output, ext_link_start());
+    if (!NIL_P(link_target))
+    {
+        if (autolink == Qtrue)
+            link_target = _Wikitext_hyperlink(Qnil, link_target, link_target, link_class); // link target, link text, link class
+        rb_str_append(output, link_target);
+        if (scope_includes_space)
+        {
+            rb_str_append(output, space());
+            if (!NIL_P(link_text))
+                rb_str_append(output, link_text);
+        }
+    }
+}
+
+VALUE Wikitext_parser_initialize(VALUE self)
+{
+    // no need to call super here; if we did, it would be via rb_call_super()
+    rb_iv_set(self, "@autolink",                Qtrue);
+    rb_iv_set(self, "@line_ending",             rb_str_new2("\n"));
+    rb_iv_set(self, "@external_link_class",     rb_str_new2("external"));
+    rb_iv_set(self, "@mailto_class",            rb_str_new2("mailto"));
+    rb_iv_set(self, "@internal_link_prefix",    rb_str_new2("/wiki/"));
+    return self;
+}
+
+VALUE Wikitext_parser_parse(VALUE self, VALUE string)
+{
+    // process arguments
+    if (NIL_P(string))
+        return Qnil;
+    string = StringValue(string);
+
+    // set up scanner
+    char *p = RSTRING_PTR(string);
+    long len = RSTRING_LEN(string);
+    char *pe = p + len;
+
+    // house-keeping
+    VALUE output        = rb_str_new2("");
+    VALUE capture       = Qnil;             // sometimes we want to capture output rather than send it to the output variable
+    VALUE scope         = rb_ary_new();     // stack for tracking scope
+    VALUE line          = rb_ary_new();     // stack for tracking scope as implied by current line
+    VALUE line_buffer   = rb_ary_new();     // stack for tracking raw tokens (not scope) on current line
+    VALUE pending_crlf  = Qfalse;
+    VALUE link_target   = Qnil;             // need some short term "memory" for parsing links
+    VALUE link_text     = Qnil;             // need some short term "memory" for parsing links
+
+    // access these once per parse
+    VALUE line_ending   = rb_iv_get(self, "@line_ending");
+    line_ending         = StringValue(line_ending);
+    VALUE autolink      = rb_iv_get(self, "@autolink");
+    VALUE link_class    = rb_iv_get(self, "@external_link_class");
+    link_class          = NIL_P(link_class) ? Qnil : StringValue(link_class);
+    VALUE mailto_class  = rb_iv_get(self, "@mailto_class");
+    mailto_class        = StringValue(mailto_class);
+    VALUE prefix        = rb_iv_get(self, "@internal_link_prefix");
+
+    token_t _token;
+    _token.type = NO_TOKEN;
+    token_t *token = NULL;
+    do
+    {
+        // check to see if we have a token hanging around from a previous iteration of this loop
+        if (token == NULL)
+        {
+            token = &_token;
+            if (_token.type == NO_TOKEN)
+                // first time here (haven't started scanning yet)
+                next_token(token, NULL, p, pe);
+            else
+                // already scanning
+#define NEXT_TOKEN()    next_token(token, token, NULL, pe)
+                NEXT_TOKEN();
+        }
+        int type = token->type;
+
+        // many restrictions depend on what is at the top of the stack
+        VALUE top = rb_ary_entry(scope, -1);
+
+        if (type != END_OF_FILE)
+        {
+            // push current token into line buffer (but not EOF as it won't fit inside a Fixnum)
+            // provides us with context-sensitive "memory" of what's been seen so far on this line
+            VALUE current = INT2FIX(type);
+
+            // for lines with *lots* of consecutive PRINTABLES this could be quite wasteful, so only store one
+            //if (type != PRINTABLE || NIL_P(top) || FIX2INT(top) != PRINTABLE)
+                rb_ary_push(line_buffer, current);
+        }
+
+        // can't declare new variables inside a switch statement, so predeclare them here
+        long remove_strong          = -1;
+        long remove_em              = -1;
+
+        // general purpose counters and flags
+        long i                      = 0;
+        long j                      = 0;
+        long k                      = 0;
+
+        // The following giant switch statement contains cases for all the possible token types.
+        // In the most basic sense we are emitting the HTML that corresponds to each token,
+        // but some tokens require context information in order to decide what to output.
+        // For example, does the STRONG token (''') translate to <strong> or </strong>?
+        // So when looking at any given token we have three state-maintaining variables which give us a notion of "where we are":
+        //
+        //  - the "scope" stack (indicates what HTML DOM structures we are currently nested inside, similar to a CSS selector)
+        //  - the line buffer (records tokens seen so far on the current line)
+        //  - the line "scope" stack (indicates what the scope should be based only on what is visible on the line so far)
+        //
+        // Although this is fairly complicated, there is one key simplifying factor:
+        // The translator continuously performs auto-correction, and this means that we always have a guarantee that the
+        // scope stack (up to the current token) is valid; our translator can take this as a given.
+        // Auto-correction basically consists of inserting missing tokens (preventing subsequent HTML from being messed up),
+        // or converting illegal (unexpected) tokens to their plain text equivalents (providing visual feedback to Wikitext author).
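+        //
+        // For example, while processing the line "> '''foo", by the time "foo" is reached the scope
+        // stack holds [BLOCKQUOTE, P, STRONG]; a subsequent ''' on the same line therefore closes
+        // the <strong> span rather than opening a new one.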
+        switch (type)
+        {
+            case PRE:
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)))
+                {
+                    // already in <nowiki> span (no need to check for <pre>; can never appear inside it)
+                    rb_str_append(output, space());
+                    break;
+                }
+
+                // count number of BLOCKQUOTE tokens in line buffer and in scope stack
+                rb_ary_push(line, INT2FIX(PRE));
+                i = _Wikitext_count(INT2FIX(BLOCKQUOTE), line);
+                j = _Wikitext_count(INT2FIX(BLOCKQUOTE), scope);
+
+                if (i < j)
+                {
+                    // must pop (reduce nesting level)
+                    for (i = j - i; i > 0; i--)
+                        _Wikitext_pop_from_stack_up_to(scope, output, INT2FIX(BLOCKQUOTE), Qtrue, line_ending);
+                }
+
+                if (!rb_ary_includes(scope, INT2FIX(PRE)))
+                {
+                    _Wikitext_pop_excess_elements(capture, scope, line, output, line_ending);
+                    rb_str_append(output, pre_start());
+                    rb_ary_push(scope, INT2FIX(PRE));
+                }
+                break;
+
+            case BLOCKQUOTE:
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)))
+                    // already in <nowiki> span (no need to check for <pre>; can never appear inside it)
+                    rb_str_append(output, TOKEN_TEXT(token));
+                else
+                {
+                    rb_ary_push(line, INT2FIX(BLOCKQUOTE));
+
+                    // count number of BLOCKQUOTE tokens in line buffer and in scope stack
+                    i = _Wikitext_count(INT2FIX(BLOCKQUOTE), line);
+                    j = _Wikitext_count(INT2FIX(BLOCKQUOTE), scope);
+
+                    // given that BLOCKQUOTE tokens can be nested, peek ahead and see if there are any more which might affect the decision to push or pop
+                    while (NEXT_TOKEN(), (token->type == BLOCKQUOTE))
+                    {
+                        rb_ary_push(line, INT2FIX(BLOCKQUOTE));
+                        i++;
+                    }
+
+                    // now decide whether to push, pop or do nothing
+                    if (i > j)
+                    {
+                        // must push (increase nesting level)
+                        _Wikitext_pop_from_stack_up_to(scope, output, INT2FIX(BLOCKQUOTE), Qfalse, line_ending);
+                        for (i = i - j; i > 0; i--)
+                        {
+                            rb_str_append(output, blockquote_start());
+                            rb_ary_push(scope, INT2FIX(BLOCKQUOTE));
+                        }
+                    }
+                    else if (i < j)
+                    {
+                        // must pop (reduce nesting level)
+                        for (i = j - i; i > 0; i--)
+                            _Wikitext_pop_from_stack_up_to(scope, output, INT2FIX(BLOCKQUOTE), Qtrue, line_ending);
+                    }
+
+                    // jump to top of the loop to process token we scanned during lookahead
+                    continue;
+                }
+                break;
+
+            case NO_WIKI_START:
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)) || rb_ary_includes(scope, INT2FIX(PRE)))
+                    // already in <nowiki> span or <pre> block
+                    rb_str_append(output, escaped_no_wiki_start());
+                else
+                {
+                    i = NIL_P(capture) ? output : capture;
+                    _Wikitext_pop_excess_elements(capture, scope, line, i, line_ending);
+                    _Wikitext_start_para_if_necessary(capture, scope, line, i, &pending_crlf);
+                    rb_ary_push(scope, INT2FIX(NO_WIKI_START));
+                    rb_ary_push(line, INT2FIX(NO_WIKI_START));
+                }
+                break;
+
+            case NO_WIKI_END:
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)))
+                    // <nowiki> should always only ever be the last item in the stack, but use the helper routine just in case
+                    _Wikitext_pop_from_stack_up_to(scope, output, INT2FIX(NO_WIKI_START), Qtrue, line_ending);
+                else
+                {
+                    i = NIL_P(capture) ? output : capture;
+                    _Wikitext_pop_excess_elements(capture, scope, line, i, line_ending);
+                    _Wikitext_start_para_if_necessary(capture, scope, line, i, &pending_crlf);
+                    rb_str_append(output, escaped_no_wiki_end());
+                }
+                break;
+
+            case STRONG_EM:
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)) || rb_ary_includes(scope, INT2FIX(PRE)))
+                {
+                    // already in <nowiki> span or <pre> block
+                    rb_str_append(output, escaped_strong_em_start());
+                    break;
+                }
+
+                i = NIL_P(capture) ? output : capture;
+                _Wikitext_pop_excess_elements(capture, scope, line, i, line_ending);
+
+                // if you've seen STRONG or EM, must close them in the reverse order that you saw them! otherwise, must open them
+                remove_strong  = -1;
+                remove_em      = -1;
+                j              = RARRAY_LEN(scope);
+                for (j = j - 1; j >= 0; j--)
+                {
+                    long val = FIX2INT(rb_ary_entry(scope, j));
+                    if (val == STRONG)
+                    {
+                        rb_str_append(i, strong_end());
+                        remove_strong = j;
+                    }
+                    else if (val == EM)
+                    {
+                        rb_str_append(i, em_end());
+                        remove_em = j;
+                    }
+                }
+
+                if (remove_strong > remove_em)      // must remove strong first
+                {
+                    rb_ary_delete_at(scope, remove_strong);
+                    if (remove_em > -1)
+                        rb_ary_delete_at(scope, remove_em);
+                    else    // there was no em to remove, so consider this an opening em tag
+                    {
+                        rb_str_append(i, em_start());
+                        rb_ary_push(scope, INT2FIX(EM));
+                        rb_ary_push(line, INT2FIX(EM));
+                    }
+                }
+                else if (remove_em > remove_strong) // must remove em first
+                {
+                    rb_ary_delete_at(scope, remove_em);
+                    if (remove_strong > -1)
+                        rb_ary_delete_at(scope, remove_strong);
+                    else    // there was no strong to remove, so consider this an opening strong tag
+                    {
+                        rb_str_append(i, strong_start());
+                        rb_ary_push(scope, INT2FIX(STRONG));
+                        rb_ary_push(line, INT2FIX(STRONG));
+                    }
+                }
+                else    // no strong or em to remove, so this must be a new opening of both
+                {
+                    _Wikitext_start_para_if_necessary(capture, scope, line, i, &pending_crlf);
+                    rb_str_append(i, strong_em_start());
+                    rb_ary_push(scope, INT2FIX(STRONG));
+                    rb_ary_push(line, INT2FIX(STRONG));
+                    rb_ary_push(scope, INT2FIX(EM));
+                    rb_ary_push(line, INT2FIX(EM));
+                }
+                break;
+
+            case STRONG:
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)) || rb_ary_includes(scope, INT2FIX(PRE)))
+                    // already in <nowiki> span or <pre> block
+                    rb_str_append(output, escaped_strong_start());
+                else
+                {
+                    i = NIL_P(capture) ? output : capture;
+                    if (rb_ary_includes(scope, INT2FIX(STRONG)))
+                        // STRONG already seen, this is a closing tag
+                        _Wikitext_pop_from_stack_up_to(scope, i, INT2FIX(STRONG), Qtrue, line_ending);
+                    else
+                    {
+                        // this is a new opening
+                        _Wikitext_pop_excess_elements(capture, scope, line, i, line_ending);
+                        _Wikitext_start_para_if_necessary(capture, scope, line, i, &pending_crlf);
+                        rb_str_append(i, strong_start());
+                        rb_ary_push(scope, INT2FIX(STRONG));
+                        rb_ary_push(line, INT2FIX(STRONG));
+                    }
+                }
+                break;
+
+            case EM:
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)) || rb_ary_includes(scope, INT2FIX(PRE)))
+                    // already in <nowiki> span or <pre> block
+                    rb_str_append(output, escaped_em_start());
+                else
+                {
+                    i = NIL_P(capture) ? output : capture;
+                    if (rb_ary_includes(scope, INT2FIX(EM)))
+                        // EM already seen, this is a closing tag
+                        _Wikitext_pop_from_stack_up_to(scope, i, INT2FIX(EM), Qtrue, line_ending);
+                    else
+                    {
+                        // this is a new opening
+                        _Wikitext_pop_excess_elements(capture, scope, line, i, line_ending);
+                        _Wikitext_start_para_if_necessary(capture, scope, line, i, &pending_crlf);
+                        rb_str_append(i, em_start());
+                        rb_ary_push(scope, INT2FIX(EM));
+                        rb_ary_push(line, INT2FIX(EM));
+                    }
+                }
+                break;
+
+            case TT:
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)) || rb_ary_includes(scope, INT2FIX(PRE)))
+                    // already in <nowiki> span or <pre> block
+                    rb_str_append(output, backtick());
+                else
+                {
+                    i = NIL_P(capture) ? output : capture;
+                    if (rb_ary_includes(scope, INT2FIX(TT_START)))
+                        // already in span started with <tt>, no choice but to emit this literally
+                        rb_str_append(output, backtick());
+                    else if (rb_ary_includes(scope, INT2FIX(TT)))
+                        // TT (`) already seen, this is a closing tag
+                        _Wikitext_pop_from_stack_up_to(scope, i, INT2FIX(TT), Qtrue, line_ending);
+                    else
+                    {
+                        // this is a new opening
+                        _Wikitext_pop_excess_elements(capture, scope, line, i, line_ending);
+                        _Wikitext_start_para_if_necessary(capture, scope, line, i, &pending_crlf);
+                        rb_str_append(i, tt_start());
+                        rb_ary_push(scope, INT2FIX(TT));
+                        rb_ary_push(line, INT2FIX(TT));
+                    }
+                }
+                break;
+
+            case TT_START:
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)) || rb_ary_includes(scope, INT2FIX(PRE)))
+                    // already in <nowiki> span or <pre> block
+                    rb_str_append(output, escaped_tt_start());
+                else
+                {
+                    i = NIL_P(capture) ? output : capture;
+                    if (rb_ary_includes(scope, INT2FIX(TT_START)) || rb_ary_includes(scope, INT2FIX(TT)))
+                        // already in TT_START (<tt>) or TT (`) span
+                        rb_str_append(output, escaped_tt_start());
+                    else
+                    {
+                        _Wikitext_pop_excess_elements(capture, scope, line, i, line_ending);
+                        _Wikitext_start_para_if_necessary(capture, scope, line, i, &pending_crlf);
+                        rb_str_append(i, tt_start());
+                        rb_ary_push(scope, INT2FIX(TT_START));
+                        rb_ary_push(line, INT2FIX(TT_START));
+                    }
+                }
+                break;
+
+            case TT_END:
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)) || rb_ary_includes(scope, INT2FIX(PRE)))
+                    // already in <nowiki> span or <pre> block
+                    rb_str_append(output, escaped_tt_end());
+                else
+                {
+                    i = NIL_P(capture) ? output : capture;
+                    if (rb_ary_includes(scope, INT2FIX(TT_START)))
+                        _Wikitext_pop_from_stack_up_to(scope, i, INT2FIX(TT_START), Qtrue, line_ending);
+                    else
+                    {
+                        // no TT_START in scope, so must interpret the TT_END without any special meaning
+                        _Wikitext_pop_excess_elements(capture, scope, line, i, line_ending);
+                        _Wikitext_start_para_if_necessary(capture, scope, line, i, &pending_crlf);
+                        rb_str_append(i, escaped_tt_end());
+                    }
+                }
+                break;
+
+            case OL:
+            case UL:
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)))
+                {
+                    // already in <nowiki> span (no need to check for <pre>; can never appear inside it)
+                    rb_str_append(output, TOKEN_TEXT(token));
+                    break;
+                }
+
+                // count number of tokens in line and scope stacks
+                i = RARRAY_LEN(line);
+                j = RARRAY_LEN(scope);
+
+                // list tokens can be nested so look ahead for any more which might affect the decision to push or pop
+                for (;;)
+                {
+                    NEXT_TOKEN();
+                    type = token->type;
+                    if (type == OL || type == UL)
+                    {
+                        token = NULL;
+                        rb_ary_push(line, INT2FIX(type));
+                        rb_ary_push(line, INT2FIX(LI));
+                        i += 2;
+
+                        // want to compare line with scope but can only do so if scope has enough items on it
+                        if (j >= i)
+                        {
+                            if ((FIX2INT(rb_ary_entry(scope, i - 2)) == type) && (FIX2INT(rb_ary_entry(scope, i - 1)) == LI))
+                            {
+                                // line and scope match at this point: do nothing yet
+                            }
+                            else
+                            {
+                                // item just pushed onto line does not match corresponding slot of scope!
+                                for (; j >= i - 2; j--)
+                                    // must pop back before emitting
+                                    _Wikitext_pop_from_stack(scope, output, line_ending);
+
+                                // will emit UL or OL, then LI
+                                break;
+                            }
+                        }
+                        else        // line stack size now exceeds scope stack size: must increase nesting level
+                            break;  // will emit UL or OL, then LI
+                    }
+                    else
+                    {
+                        // not an OL or UL token!
+                        if (j == i)
+                            // must close existing LI and re-open new one
+                            _Wikitext_pop_from_stack(scope, output, line_ending);
+                        else if (j > i)
+                        {
+                            // item just pushed onto line does not match corresponding slot of scope!
+                            for (; j >= i; j--)
+                                // must pop back before emitting
+                                _Wikitext_pop_from_stack(scope, output, line_ending);
+                        }
+                        break;
+                    }
+                }
+
+                // TODO: consider adding indentation here... wouldn't be too hard...
+                if (type == OL || type == UL)
+                {
+                    // if LI is at the top of the scope stack this is the start of a nested list
+                    if (FIX2INT(rb_ary_entry(scope, -1)) == LI)
+                        // so we should precede it with a CRLF
+                        rb_str_append(output, line_ending);
+                }
+
+                // emit
+                if (type == OL)
+                    rb_str_append(output, ol_start());
+                else if (type == UL)
+                    rb_str_append(output, ul_start());
+
+                if (type == OL || type == UL)
+                {
+                    rb_ary_push(scope, INT2FIX(type));
+                    rb_str_append(output, line_ending);
+                }
+                else if (type == SPACE)
+                    // silently throw away the optional SPACE token after final list marker
+                    token = NULL;
+
+                rb_str_append(output, li_start());
+                rb_ary_push(scope, INT2FIX(LI));
+
+                // any subsequent UL or OL tokens on this line are syntax errors and must be emitted literally
+                if (type == OL || type == UL)
+                {
+                    k = 0;
+                    while (NEXT_TOKEN(), (type = token->type))
+                    {
+                        if (k == 0 && type == SPACE)
+                            // silently throw away the optional SPACE token after final list marker
+                            token = NULL;
+                        k++;
+                        if (type == OL || type == UL)
+                        {
+                            rb_str_append(output, TOKEN_TEXT(token));
+                            token = NULL;
+                        }
+                        else
+                            break;
+                    }
+                }
+
+                // jump to top of the loop to process token we scanned during lookahead
+                continue;
+
+            case H6_START:
+            case H5_START:
+            case H4_START:
+            case H3_START:
+            case H2_START:
+            case H1_START:
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)))
+                {
+                    // already in <nowiki> span (no need to check for <pre>; can never appear inside it)
+                    rb_str_append(output, TOKEN_TEXT(token));
+                    break;
+                }
+
+                // pop up to but not including the last BLOCKQUOTE on the scope stack
+                _Wikitext_pop_from_stack_up_to(scope, output, INT2FIX(BLOCKQUOTE), Qfalse, line_ending);
+
+                // count number of BLOCKQUOTE tokens in line buffer and in scope stack
+                rb_ary_push(line, INT2FIX(type));
+                i = _Wikitext_count(INT2FIX(BLOCKQUOTE), line);
+                j = _Wikitext_count(INT2FIX(BLOCKQUOTE), scope);
+
+                // decide whether we need to pop off excess BLOCKQUOTE tokens (will never need to push; that is handled above in the BLOCKQUOTE case itself)
+                if (i < j)
+                {
+                    // must pop (reduce nesting level)
+                    for (i = j - i; i > 0; i--)
+                        _Wikitext_pop_from_stack_up_to(scope, output, INT2FIX(BLOCKQUOTE), Qtrue, line_ending);
+                }
+
+                // discard any whitespace here (so that "== foo ==" will be translated to "<h2>foo</h2>" rather than "<h2> foo </h2>")
+                while (NEXT_TOKEN(), (token->type == SPACE))
+                    ; // discard
+
+                rb_ary_push(scope, INT2FIX(type));
+
+                // rather than repeat all that code for each kind of heading, share it and use a conditional here
+                if (type == H6_START)
+                    rb_str_append(output, h6_start());
+                else if (type == H5_START)
+                    rb_str_append(output, h5_start());
+                else if (type == H4_START)
+                    rb_str_append(output, h4_start());
+                else if (type == H3_START)
+                    rb_str_append(output, h3_start());
+                else if (type == H2_START)
+                    rb_str_append(output, h2_start());
+                else if (type == H1_START)
+                    rb_str_append(output, h1_start());
+
+                // jump to top of the loop to process token we scanned during lookahead
+                continue;
+
+            case H6_END:
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)) || rb_ary_includes(scope, INT2FIX(PRE)))
+                    // already in <nowiki> span or <pre> block
+                    rb_str_append(output, escaped_h6_end());
+                else
+                {
+                    if (rb_ary_includes(scope, INT2FIX(EXT_LINK_START)))
+                    {
+                        // this is a syntax error; an unclosed external link
+                        _Wikitext_rollback_failed_external_link(output, scope, line, link_target, link_text, link_class, autolink,
+                            line_ending);
+                        link_target = Qnil;
+                        link_text   = Qnil;
+                        capture     = Qnil;
+                    }
+
+                    if (!rb_ary_includes(scope, INT2FIX(H6_START)))
+                    {
+                        // literal output only if not in h6 scope (we stay silent in that case)
+                        _Wikitext_start_para_if_necessary(capture, scope, line, output, &pending_crlf);
+                        rb_str_append(output, escaped_h6_end());
+                    }
+                }
+                break;
+
+            case H5_END:
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)) || rb_ary_includes(scope, INT2FIX(PRE)))
+                    // already in <nowiki> span or <pre> block
+                    rb_str_append(output, escaped_h5_end());
+                else
+                {
+                    if (rb_ary_includes(scope, INT2FIX(EXT_LINK_START)))
+                    {
+                        // this is a syntax error; an unclosed external link
+                        _Wikitext_rollback_failed_external_link(output, scope, line, link_target, link_text, link_class, autolink,
+                            line_ending);
+                        link_target = Qnil;
+                        link_text   = Qnil;
+                        capture     = Qnil;
+                    }
+
+                    if (!rb_ary_includes(scope, INT2FIX(H5_START)))
+                    {
+                        // literal output only if not in h5 scope (we stay silent in that case)
+                        _Wikitext_start_para_if_necessary(capture, scope, line, output, &pending_crlf);
+                        rb_str_append(output, escaped_h5_end());
+                    }
+                }
+                break;
+
+            case H4_END:
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)) || rb_ary_includes(scope, INT2FIX(PRE)))
+                    // already in <nowiki> span or <pre> block
+                    rb_str_append(output, escaped_h4_end());
+                else
+                {
+                    if (rb_ary_includes(scope, INT2FIX(EXT_LINK_START)))
+                    {
+                        // this is a syntax error; an unclosed external link
+                        _Wikitext_rollback_failed_external_link(output, scope, line, link_target, link_text, link_class, autolink,
+                            line_ending);
+                        link_target = Qnil;
+                        link_text   = Qnil;
+                        capture     = Qnil;
+                    }
+
+                    if (!rb_ary_includes(scope, INT2FIX(H4_START)))
+                    {
+                        // literal output only if not in h4 scope (we stay silent in that case)
+                        _Wikitext_start_para_if_necessary(capture, scope, line, output, &pending_crlf);
+                        rb_str_append(output, escaped_h4_end());
+                    }
+                }
+                break;
+
+            case H3_END:
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)) || rb_ary_includes(scope, INT2FIX(PRE)))
+                    // already in <nowiki> span or <pre> block
+                    rb_str_append(output, escaped_h3_end());
+                else
+                {
+                    if (rb_ary_includes(scope, INT2FIX(EXT_LINK_START)))
+                    {
+                        // this is a syntax error; an unclosed external link
+                        _Wikitext_rollback_failed_external_link(output, scope, line, link_target, link_text, link_class, autolink,
+                            line_ending);
+                        link_target = Qnil;
+                        link_text   = Qnil;
+                        capture     = Qnil;
+                    }
+
+                    if (!rb_ary_includes(scope, INT2FIX(H3_START)))
+                    {
+                        // literal output only if not in h3 scope (we stay silent in that case)
+                        _Wikitext_start_para_if_necessary(capture, scope, line, output, &pending_crlf);
+                        rb_str_append(output, escaped_h3_end());
+                    }
+                }
+                break;
+
+            case H2_END:
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)) || rb_ary_includes(scope, INT2FIX(PRE)))
+                    // already in <nowiki> span or <pre> block
+                    rb_str_append(output, escaped_h2_end());
+                else
+                {
+                    if (rb_ary_includes(scope, INT2FIX(EXT_LINK_START)))
+                    {
+                        // this is a syntax error; an unclosed external link
+                        _Wikitext_rollback_failed_external_link(output, scope, line, link_target, link_text, link_class, autolink,
+                            line_ending);
+                        link_target = Qnil;
+                        link_text   = Qnil;
+                        capture     = Qnil;
+                    }
+
+                    if (!rb_ary_includes(scope, INT2FIX(H2_START)))
+                    {
+                        // literal output only if not in h2 scope (we stay silent in that case)
+                        _Wikitext_start_para_if_necessary(capture, scope, line, output, &pending_crlf);
+                        rb_str_append(output, escaped_h2_end());
+                    }
+                }
+                break;
+
+            case H1_END:
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)) || rb_ary_includes(scope, INT2FIX(PRE)))
+                    // already in <nowiki> span or <pre> block
+                    rb_str_append(output, escaped_h1_end());
+                else
+                {
+                    if (rb_ary_includes(scope, INT2FIX(EXT_LINK_START)))
+                    {
+                        // this is a syntax error; an unclosed external link
+                        _Wikitext_rollback_failed_external_link(output, scope, line, link_target, link_text, link_class, autolink,
+                            line_ending);
+                        link_target = Qnil;
+                        link_text   = Qnil;
+                        capture     = Qnil;
+                    }
+
+                    if (!rb_ary_includes(scope, INT2FIX(H1_START)))
+                    {
+                        // literal output only if not in h1 scope (we stay silent in that case)
+                        _Wikitext_start_para_if_necessary(capture, scope, line, output, &pending_crlf);
+                        rb_str_append(output, escaped_h1_end());
+                    }
+                }
+                break;
+
+            case URI:
+                i = TOKEN_TEXT(token); // the URI
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)))
+                    // user can temporarily suppress autolinking by using <nowiki></nowiki>
+                    // note that unlike MediaWiki, we do allow autolinking inside PRE blocks
+                    rb_str_append(output, i);
+                else if (rb_ary_includes(scope, INT2FIX(LINK_START)))
+                {
+                    // not yet implemented
+                    // TODO: implement
+                }
+                else if (rb_ary_includes(scope, INT2FIX(EXT_LINK_START)))
+                {
+                    if (NIL_P(link_target))
+                    {
+                        // this must be our link target: look ahead to make sure we see the space we're expecting to see
+                        NEXT_TOKEN();
+                        if (token->type == SPACE)
+                        {
+                            rb_ary_push(scope, INT2FIX(SPACE));
+                            link_target = i;
+                            link_text   = rb_str_new2("");
+                            capture     = link_text;
+                            token       = NULL; // silently consume space
+                        }
+                        else
+                        {
+                            // didn't see the space! this must be an error
+                            _Wikitext_pop_from_stack(scope, output, line_ending);
+                            _Wikitext_pop_excess_elements(Qnil, scope, line, output, line_ending);
+                            _Wikitext_start_para_if_necessary(Qnil, scope, line, output, &pending_crlf);
+                            rb_str_append(output, ext_link_start());
+                            if (autolink == Qtrue)
+                                i = _Wikitext_hyperlink(Qnil, i, i, link_class); // link target, link text, link class
+                            rb_str_append(output, i);
+                        }
+                    }
+                    else
+                    {
+                        if (NIL_P(link_text))
+                            // this must be the first part of our link text
+                            link_text = i;
+                        else
+                            // add to existing link text
+                            rb_str_append(link_text, i);
+                    }
+                }
+                else
+                {
+                    // in plain scope, will turn into autolink (with appropriate, user-configurable CSS)
+                    _Wikitext_pop_excess_elements(capture, scope, line, output, line_ending);
+                    _Wikitext_start_para_if_necessary(capture, scope, line, output, &pending_crlf);
+                    if (autolink == Qtrue)
+                        i = _Wikitext_hyperlink(Qnil, i, i, link_class); // link target, link text, link class
+                    rb_str_append(output, i);
+                }
+                break;
+
+            // internal links (links to other wiki articles) look like this:
+            //      [[another article]] (would point at, for example, "/wiki/another_article")
+            //      [[the other article|the link text we'll use for it]]
+            //      [[the other article | the link text we'll use for it]]
+            // note that the forward slash is a reserved character which changes the meaning of an internal link;
+            // this is a link that is external to the wiki but internal to the site as a whole:
+            //      [[bug/12]] (a relative link to "/bug/12")
+            // MediaWiki has strict requirements about what it will accept as a link target:
+            //      all wikitext markup is disallowed:
+            //          example [[foo ''bar'' baz]]
+            //          renders [[foo <em>bar</em> baz]]        (ie. not a link)
+            //          example [[foo <em>bar</em> baz]]
+            //          renders [[foo <em>bar</em> baz]]        (ie. not a link)
+            //          example [[foo <nowiki>''</nowiki> baz]]
+            //          renders [[foo '' baz]]                  (ie. not a link)
+            //          example [[foo <bar> baz]]
+            //          renders [[foo &lt;bar&gt; baz]]         (ie. not a link)
+            //      HTML entities and non-ASCII, however, make it through:
+            //          example [[foo &euro;]]
+            //          renders <a href="/wiki/Foo_%E2%82%AC">foo &euro;</a>
+            //          example [[foo €]]
+            //          renders <a href="/wiki/Foo_%E2%82%AC">foo €</a>
+            // we'll impose similar restrictions here for the link target; allowed tokens will be:
+            //      SPACE, PRINTABLE, DEFAULT, QUOT and AMP
+            //      (the QUOT_ENTITY and AMP_ENTITY forms are accepted too, but are converted into literal characters)
+            // everything else will be rejected
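+            // (rejection means _Wikitext_rollback_failed_link is called and the would-be link comes out as
+            // literal text, matching the MediaWiki examples above where the input is "not a link")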
+            case LINK_START:
+                i = NIL_P(capture) ? output : capture;
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)) || rb_ary_includes(scope, INT2FIX(PRE)))
+                    // already in <nowiki> span or <pre> block
+                    rb_str_append(i, link_start());
+                else if (rb_ary_includes(scope, INT2FIX(EXT_LINK_START)))
+                    // already in external link scope! (and in fact, must be capturing link_text right now)
+                    rb_str_append(i, link_start());
+                else if (rb_ary_includes(scope, INT2FIX(LINK_START)))
+                {
+                    // already in internal link scope! this is a syntax error
+                    _Wikitext_rollback_failed_link(output, scope, line, link_target, link_text, link_class, line_ending);
+                    link_target = Qnil;
+                    link_text   = Qnil;
+                    capture     = Qnil;
+                    rb_str_append(output, link_start());
+                }
+                else if (rb_ary_includes(scope, INT2FIX(SEPARATOR)))
+                {
+                    // scanning internal link text
+                }
+                else // not in internal link scope yet
+                {
+                    rb_ary_push(scope, INT2FIX(LINK_START));
+
+                    // look ahead and try to gobble up link target
+                    while (NEXT_TOKEN(), (type = token->type))
+                    {
+                        if (type == SPACE       ||
+                            type == PRINTABLE   ||
+                            type == DEFAULT     ||
+                            type == QUOT        ||
+                            type == QUOT_ENTITY ||
+                            type == AMP         ||
+                            type == AMP_ENTITY)
+                        {
+                            // accumulate these tokens into link_target
+                            if (NIL_P(link_target))
+                            {
+                                link_target = rb_str_new2("");
+                                capture     = link_target;
+                            }
+                            if (type == QUOT_ENTITY)
+                                // don't insert the entity, insert the literal quote
+                                rb_str_append(link_target, quote());
+                            else if (type == AMP_ENTITY)
+                                // don't insert the entity, insert the literal ampersand
+                                rb_str_append(link_target, ampersand());
+                            else
+                                rb_str_append(link_target, TOKEN_TEXT(token));
+                        }
+                        else if (type == LINK_END)
+                            break; // jump back to top of loop (will handle this in LINK_END case below)
+                        else if (type == SEPARATOR)
+                        {
+                            rb_ary_push(scope, INT2FIX(SEPARATOR));
+                            link_text   = rb_str_new2("");
+                            capture     = link_text;
+                            token       = NULL;
+                            break;
+                        }
+                        else // unexpected token (syntax error)
+                        {
+                            _Wikitext_rollback_failed_link(output, scope, line, link_target, link_text, link_class, line_ending);
+                            link_target = Qnil;
+                            link_text   = Qnil;
+                            capture     = Qnil;
+                            break; // jump back to top of loop to handle unexpected token
+                        }
+                    }
+
+                    // jump to top of the loop to process token we scanned during lookahead (if any)
+                    continue;
+                }
+                break;
+
+            case LINK_END:
+                i = NIL_P(capture) ? output : capture;
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)) || rb_ary_includes(scope, INT2FIX(PRE)))
+                    // already in <nowiki> span or <pre> block
+                    rb_str_append(i, link_end());
+                else if (rb_ary_includes(scope, INT2FIX(EXT_LINK_START)))
+                    // already in external link scope! (and in fact, must be capturing link_text right now)
+                    rb_str_append(i, link_end());
+                else if (rb_ary_includes(scope, INT2FIX(LINK_START)))
+                {
+                    // in internal link scope!
+                    if (NIL_P(link_text) || RSTRING_LEN(link_text) == 0)
+                        // use link target as link text
+                        link_text = Wikitext_sanitize_link_target(self, link_target);
+                    link_target = Wikitext_encode_link_target(self, link_target);
+                    _Wikitext_pop_from_stack_up_to(scope, i, INT2FIX(LINK_START), Qtrue, line_ending);
+                    _Wikitext_pop_excess_elements(Qnil, scope, line, output, line_ending);
+                    _Wikitext_start_para_if_necessary(Qnil, scope, line, output, &pending_crlf);
+                    i = _Wikitext_hyperlink(prefix, link_target, link_text, Qnil); // link target, link text, link class
+                    rb_str_append(output, i);
+                    link_target = Qnil;
+                    link_text   = Qnil;
+                    capture     = Qnil;
+                }
+                else // wasn't in internal link scope
+                {
+                    _Wikitext_pop_excess_elements(capture, scope, line, output, line_ending);
+                    _Wikitext_start_para_if_necessary(capture, scope, line, output, &pending_crlf);
+                    rb_str_append(i, link_end());
+                }
+                break;
+
+            // external links look like this:
+            //      [http://google.com/ the link text]
+            // strings in square brackets which don't match this syntax get passed through literally; eg:
+            //      he was very angery [sic] about the turn of events
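+            //      (mailto URIs, eg. [mailto:user@example.com user@example.com], use the same bracketed syntax;
+            //      the mailto_class attribute defined in wikitext.c supplies their CSS class)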
+            case EXT_LINK_START:
+                i = NIL_P(capture) ? output : capture;
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)) || rb_ary_includes(scope, INT2FIX(PRE)))
+                    // already in <nowiki> span or <pre> block
+                    rb_str_append(i, ext_link_start());
+                else if (rb_ary_includes(scope, INT2FIX(EXT_LINK_START)))
+                    // already in external link scope! (and in fact, must be capturing link_text right now)
+                    rb_str_append(i, ext_link_start());
+                else if (rb_ary_includes(scope, INT2FIX(LINK_START)))
+                {
+                    // already in internal link scope!
+                    i = ext_link_start();
+                    if (NIL_P(link_target))
+                        // this must be the first character of our link target
+                        link_target = i;
+                    else if (rb_ary_includes(scope, INT2FIX(SPACE)))
+                    {
+                        // link target has already been scanned
+                        if (NIL_P(link_text))
+                            // this must be the first character of our link text
+                            link_text = i;
+                        else
+                            // add to existing link text
+                            rb_str_append(link_text, i);
+                    }
+                    else
+                        // add to existing link target
+                        rb_str_append(link_target, i);
+                }
+                else // not in external link scope yet
+                {
+                    // look ahead: expect a URI
+                    NEXT_TOKEN();
+                    if (token->type == URI)
+                        rb_ary_push(scope, INT2FIX(EXT_LINK_START));    // so far so good, jump back to the top of the loop
+                    else
+                    {
+                        // only get here if there was a syntax error (missing URI)
+                        _Wikitext_pop_excess_elements(capture, scope, line, output, line_ending);
+                        _Wikitext_start_para_if_necessary(capture, scope, line, output, &pending_crlf);
+                        rb_str_append(output, ext_link_start());
+                    }
+                    continue; // jump back to top of loop to handle token (either URI or whatever it is)
+                }
+                break;
+
+            case EXT_LINK_END:
+                i = NIL_P(capture) ? output : capture;
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)) || rb_ary_includes(scope, INT2FIX(PRE)))
+                    // already in <nowiki> span or <pre> block
+                    rb_str_append(i, ext_link_end());
+                else if (rb_ary_includes(scope, INT2FIX(EXT_LINK_START)))
+                {
+                    if (NIL_P(link_text))
+                        // this is a syntax error; external link with no link text
+                        _Wikitext_rollback_failed_external_link(output, scope, line, link_target, link_text, link_class, autolink,
+                            line_ending);
+                    else
+                    {
+                        // success!
+                        _Wikitext_pop_from_stack_up_to(scope, i, INT2FIX(EXT_LINK_START), Qtrue, line_ending);
+                        _Wikitext_pop_excess_elements(Qnil, scope, line, output, line_ending);
+                        _Wikitext_start_para_if_necessary(Qnil, scope, line, output, &pending_crlf);
+                        i = _Wikitext_hyperlink(Qnil, link_target, link_text, link_class); // link target, link text, link class
+                        rb_str_append(output, i);
+                    }
+                    link_target = Qnil;
+                    link_text   = Qnil;
+                    capture     = Qnil;
+                }
+                else
+                {
+                    _Wikitext_pop_excess_elements(Qnil, scope, line, output, line_ending);
+                    _Wikitext_start_para_if_necessary(Qnil, scope, line, output, &pending_crlf);
+                    rb_str_append(output, ext_link_end());
+                }
+                break;
+
+            case SEPARATOR:
+                i = NIL_P(capture) ? output : capture;
+                _Wikitext_pop_excess_elements(capture, scope, line, i, line_ending);
+                _Wikitext_start_para_if_necessary(capture, scope, line, i, &pending_crlf);
+                rb_str_append(i, separator());
+                break;
+
+            case SPACE:
+                i = NIL_P(capture) ? output : capture;
+                j = TOKEN_TEXT(token); // SPACE token may actually be a run of spaces
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)) || rb_ary_includes(scope, INT2FIX(PRE)))
+                    // already in <nowiki> span or <pre> block
+                    rb_str_append(i, j);
+                else
+                {
+                    // peek ahead to see next token
+                    NEXT_TOKEN();
+                    type = token->type;
+                    if (((type == H6_END) && rb_ary_includes(scope, INT2FIX(H6_START))) ||
+                        ((type == H5_END) && rb_ary_includes(scope, INT2FIX(H5_START))) ||
+                        ((type == H4_END) && rb_ary_includes(scope, INT2FIX(H4_START))) ||
+                        ((type == H3_END) && rb_ary_includes(scope, INT2FIX(H3_START))) ||
+                        ((type == H2_END) && rb_ary_includes(scope, INT2FIX(H2_START))) ||
+                        ((type == H1_END) && rb_ary_includes(scope, INT2FIX(H1_START))))
+                    {
+                        // discard the space (suppress its emission) when the next token is an H6_END, H5_END etc. and we are in the corresponding scope
+                    }
+                    else
+                    {
+                        // emit the space
+                        _Wikitext_pop_excess_elements(capture, scope, line, i, line_ending);
+                        _Wikitext_start_para_if_necessary(capture, scope, line, i, &pending_crlf);
+                        rb_str_append(i, j);
+                    }
+
+                    // jump to top of the loop to process token we scanned during lookahead
+                    continue;
+                }
+                break;
+
+            case QUOT_ENTITY:
+            case AMP_ENTITY:
+            case NAMED_ENTITY:
+            case DECIMAL_ENTITY:
+                // pass these through unaltered as they are case sensitive
+                i = NIL_P(capture) ? output : capture;
+                _Wikitext_pop_excess_elements(capture, scope, line, i, line_ending);
+                _Wikitext_start_para_if_necessary(capture, scope, line, i, &pending_crlf);
+                rb_str_append(i, TOKEN_TEXT(token));
+                break;
+
+            case HEX_ENTITY:
+                // normalize hex entities (downcase them)
+                i = NIL_P(capture) ? output : capture;
+                _Wikitext_pop_excess_elements(capture, scope, line, i, line_ending);
+                _Wikitext_start_para_if_necessary(capture, scope, line, i, &pending_crlf);
+                rb_str_append(i, _Wikitext_downcase(TOKEN_TEXT(token)));
+                break;
+
+            case QUOT:
+                i = NIL_P(capture) ? output : capture;
+                _Wikitext_pop_excess_elements(capture, scope, line, i, line_ending);
+                _Wikitext_start_para_if_necessary(capture, scope, line, i, &pending_crlf);
+                rb_str_append(i, quot_entity());
+                break;
+
+            case AMP:
+                i = NIL_P(capture) ? output : capture;
+                _Wikitext_pop_excess_elements(capture, scope, line, i, line_ending);
+                _Wikitext_start_para_if_necessary(capture, scope, line, i, &pending_crlf);
+                rb_str_append(i, amp_entity());
+                break;
+
+            case LESS:
+                i = NIL_P(capture) ? output : capture;
+                _Wikitext_pop_excess_elements(capture, scope, line, i, line_ending);
+                _Wikitext_start_para_if_necessary(capture, scope, line, i, &pending_crlf);
+                rb_str_append(i, lt_entity());
+                break;
+
+            case GREATER:
+                i = NIL_P(capture) ? output : capture;
+                _Wikitext_pop_excess_elements(capture, scope, line, i, line_ending);
+                _Wikitext_start_para_if_necessary(capture, scope, line, i, &pending_crlf);
+                rb_str_append(i, gt_entity());
+                break;
+
+            case CRLF:
+                if (rb_ary_includes(scope, INT2FIX(LINK_START)))
+                {
+                    // this is a syntax error; an unclosed internal link
+                    _Wikitext_rollback_failed_link(output, scope, line, link_target, link_text, link_class, line_ending);
+                    link_target = Qnil;
+                    link_text   = Qnil;
+                    capture     = Qnil;
+                }
+                else if (rb_ary_includes(scope, INT2FIX(EXT_LINK_START)))
+                {
+                    // this is a syntax error; an unclosed external link
+                    _Wikitext_rollback_failed_external_link(output, scope, line, link_target, link_text, link_class, autolink,
+                        line_ending);
+                    link_target = Qnil;
+                    link_text   = Qnil;
+                    capture     = Qnil;
+                }
+
+                if (rb_ary_includes(scope, INT2FIX(NO_WIKI_START)))
+                {
+                    // <nowiki> spans are unique; CRLFs are blindly echoed
+                    while (!NIL_P(rb_ary_delete_at(line_buffer, -1)));
+                    rb_str_append(output, line_ending);
+                    pending_crlf = Qfalse;
+                    break;
+                }
+                else if (rb_ary_includes(scope, INT2FIX(PRE)))
+                {
+                    // if there is nothing (or only a BLOCKQUOTE) on the line buffer (not the line stack!) prior to the CRLF, this must be the end of the PRE block
+                    if (NIL_P(rb_ary_entry(line_buffer, -2)) || (FIX2INT(rb_ary_entry(line_buffer, -2)) == BLOCKQUOTE))
+                    {
+                        // don't emit in this case
+                    }
+                    else
+                        rb_str_append(output, line_ending);
+                    pending_crlf = Qfalse;
+                }
+                else
+                {
+                    pending_crlf = Qtrue;
+
+                    // count number of BLOCKQUOTE tokens in line buffer (can be zero) and pop back to that level
+                    // as a side effect, this handles any open span-level elements and unclosed blocks (with special handling for P blocks and LI elements)
+                    i = _Wikitext_count(INT2FIX(BLOCKQUOTE), line);
+                    for (j = RARRAY_LEN(scope); j > i; j--)
+                    {
+                        if (FIX2INT(rb_ary_entry(line, -1)) == LI)
+                        {
+                            pending_crlf = Qfalse;
+                            break;
+                        }
+
+                        // special handling on last iteration through the loop if the top item on the scope is a P block
+                        if ((j - i == 1) && (FIX2INT(rb_ary_entry(scope, -1)) == P))
+                        {
+                            // if nothing or BLOCKQUOTE on line buffer (not line stack!) prior to CRLF, this must be a paragraph break
+                            if (NIL_P(rb_ary_entry(line_buffer, -2)) || (FIX2INT(rb_ary_entry(line_buffer, -2)) == BLOCKQUOTE))
+                                // paragraph break
+                                pending_crlf = Qfalse;
+                            else
+                                // not a paragraph break!
+                                continue;
+                        }
+                        _Wikitext_pop_from_stack(scope, output, line_ending);
+                    }
+                }
+
+                // delete the entire contents of the line scope stack and buffer
+                while (!NIL_P(rb_ary_delete_at(line, -1)));
+                while (!NIL_P(rb_ary_delete_at(line_buffer, -1)));
+                break;
+
+            case PRINTABLE:
+                i = NIL_P(capture) ? output : capture;
+                _Wikitext_pop_excess_elements(capture, scope, line, i, line_ending);
+                _Wikitext_start_para_if_necessary(capture, scope, line, i, &pending_crlf);
+                rb_str_append(i, TOKEN_TEXT(token));
+                break;
+
+            case DEFAULT:
+                i = NIL_P(capture) ? output : capture;
+                _Wikitext_pop_excess_elements(capture, scope, line, i, line_ending);
+                _Wikitext_start_para_if_necessary(capture, scope, line, i, &pending_crlf);
+                rb_str_append(i, _Wikitext_utf32_char_to_entity(token->code_point));    // convert to entity
+                break;
+
+            case END_OF_FILE:
+                // close any open scopes on hitting EOF
+                if (rb_ary_includes(scope, INT2FIX(EXT_LINK_START)))
+                    // this is a syntax error; an unclosed external link
+                    _Wikitext_rollback_failed_external_link(output, scope, line, link_target, link_text, link_class, autolink,
+                        line_ending);
+                else if (rb_ary_includes(scope, INT2FIX(LINK_START)))
+                    // this is a syntax error; an unclosed internal link
+                    _Wikitext_rollback_failed_link(output, scope, line, link_target, link_text, link_class, line_ending);
+                for (i = 0, j = RARRAY_LEN(scope); i < j; i++)
+                    _Wikitext_pop_from_stack(scope, output, line_ending);
+                goto return_output; // break is not enough here (we want to break out of the outer while loop, not just the inner switch statement)
+
+            default:
+                break;
+        }
+
+        // reset the current token, forcing the scanner to return another token at the top of the loop
+        token = NULL;
+    } while (1);
+return_output:
+    return output;
+}
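A minimal usage sketch of the parse entry point implemented above, assuming the extension builds and loads as in the specs; the exact HTML emitted depends on the parser attributes wired up in wikitext.c (internal_link_prefix, external_link_class, autolink and friends), so no particular output is claimed here:

    require 'wikitext'
    parser = Wikitext::Parser.new
    # one internal link and one external link, exercising the LINK_START/LINK_END and
    # EXT_LINK_START/EXT_LINK_END cases in the switch statement above
    html = parser.parse("See [[another article]] or [http://example.com/ an external site].")
    puts html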
diff --git a/ext/parser.h b/ext/parser.h
new file mode 100644 (file)
index 0000000..d09b14f
--- /dev/null
@@ -0,0 +1,27 @@
+// Copyright 2008 Wincent Colaiuta
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+#include <ruby/ruby.h>
+
+VALUE Wikitext_parser_initialize(VALUE self);
+
+VALUE Wikitext_parser_tokenize(VALUE self, VALUE string);
+
+VALUE Wikitext_parser_benchmarking_tokenize(VALUE self, VALUE string);
+
+VALUE Wikitext_sanitize_link_target(VALUE self, VALUE string);
+
+VALUE Wikitext_encode_link_target(VALUE self, VALUE in);
+
+VALUE Wikitext_parser_parse(VALUE self, VALUE string);
index 77e82aca162b31b7022cccaa76c6760d0329720d..9c2eea0fcb715b83d6217692a3feb586b78ec247 100755 (executable)
@@ -16,9 +16,9 @@
 require File.join(File.dirname(__FILE__), 'spec_helper.rb')
 require 'wikitext'
 
-describe Wikitext, 'token_types method' do
+describe Wikitext::Parser::Token do
   before do
-    @tokens = Wikitext::token_types
+    @tokens = Wikitext::Parser::Token.types
   end
 
   it 'should report the available token types as a hash' do
@@ -27,8 +27,8 @@ describe Wikitext, 'token_types method' do
 
   it 'should report token names as symbols and values as numbers' do
     @tokens.each do |k, v|
-      k.should be_kind_of(Integer)
       v.should be_kind_of(Symbol)
+      k.should be_kind_of(Integer)
     end
   end
 
@@ -43,7 +43,6 @@ end
 describe Wikitext::Parser, 'tokenizing' do
   before do
     @parser = Wikitext::Parser.new
-    @types  = Wikitext::token_types
   end
 
   it 'should do nothing if passed nil' do
@@ -167,6 +166,6 @@ link. And [http://example.com/ is another.
 SLAB
 
     @tokens = @parser.tokenize(large_block_of_text)
-    @tokens
+    @tokens.length.should > 0
   end
 end
diff --git a/ext/token.c b/ext/token.c
new file mode 100644 (file)
index 0000000..f46a418
--- /dev/null
@@ -0,0 +1,99 @@
+// Copyright 2008 Wincent Colaiuta
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+#include "token.h"
+#include "wikitext.h"
+
+// return a hash of token types
+// we make this available for unit testing purposes
+
+VALUE Wikitext_parser_token_types(VALUE self)
+{
+    VALUE hash = rb_hash_new();
+
+#define SET_TOKEN_TYPE(identifier)  (void)rb_hash_aset(hash, INT2FIX(identifier), \
+    rb_funcall(rb_funcall(rb_str_new2(#identifier), rb_intern("downcase"), 0), rb_intern("to_sym"), 0))
+
+    SET_TOKEN_TYPE(NO_TOKEN);
+    SET_TOKEN_TYPE(P);
+    SET_TOKEN_TYPE(LI);
+    SET_TOKEN_TYPE(PRE);
+    SET_TOKEN_TYPE(NO_WIKI_START);
+    SET_TOKEN_TYPE(NO_WIKI_END);
+    SET_TOKEN_TYPE(BLOCKQUOTE);
+    SET_TOKEN_TYPE(STRONG_EM);
+    SET_TOKEN_TYPE(STRONG);
+    SET_TOKEN_TYPE(EM);
+    SET_TOKEN_TYPE(TT_START);
+    SET_TOKEN_TYPE(TT_END);
+    SET_TOKEN_TYPE(TT);
+    SET_TOKEN_TYPE(OL);
+    SET_TOKEN_TYPE(UL);
+    SET_TOKEN_TYPE(H6_START);
+    SET_TOKEN_TYPE(H5_START);
+    SET_TOKEN_TYPE(H4_START);
+    SET_TOKEN_TYPE(H3_START);
+    SET_TOKEN_TYPE(H2_START);
+    SET_TOKEN_TYPE(H1_START);
+    SET_TOKEN_TYPE(H6_END);
+    SET_TOKEN_TYPE(H5_END);
+    SET_TOKEN_TYPE(H4_END);
+    SET_TOKEN_TYPE(H3_END);
+    SET_TOKEN_TYPE(H2_END);
+    SET_TOKEN_TYPE(H1_END);
+    SET_TOKEN_TYPE(URI);
+    SET_TOKEN_TYPE(LINK_START);
+    SET_TOKEN_TYPE(LINK_END);
+    SET_TOKEN_TYPE(EXT_LINK_START);
+    SET_TOKEN_TYPE(EXT_LINK_END);
+    SET_TOKEN_TYPE(SEPARATOR);
+    SET_TOKEN_TYPE(SPACE);
+    SET_TOKEN_TYPE(QUOT_ENTITY);
+    SET_TOKEN_TYPE(AMP_ENTITY);
+    SET_TOKEN_TYPE(NAMED_ENTITY);
+    SET_TOKEN_TYPE(HEX_ENTITY);
+    SET_TOKEN_TYPE(DECIMAL_ENTITY);
+    SET_TOKEN_TYPE(QUOT);
+    SET_TOKEN_TYPE(AMP);
+    SET_TOKEN_TYPE(LESS);
+    SET_TOKEN_TYPE(GREATER);
+    SET_TOKEN_TYPE(CRLF);
+    SET_TOKEN_TYPE(PRINTABLE);
+    SET_TOKEN_TYPE(DEFAULT);
+    SET_TOKEN_TYPE(END_OF_FILE);
+
+#undef SET_TOKEN_TYPE
+
+    return hash;
+}
+
+// for testing and debugging only
+VALUE _Wikitext_token(token_t *token)
+{
+    VALUE object = rb_class_new_instance(0, NULL, cWikitextParserToken);
+    (void)rb_iv_set(object, "@start",           LONG2NUM((long)token->start));
+    (void)rb_iv_set(object, "@stop",            LONG2NUM((long)token->stop));
+    (void)rb_iv_set(object, "@line_start",      LONG2NUM(token->line_start));
+    (void)rb_iv_set(object, "@line_stop",       LONG2NUM(token->line_stop));
+    (void)rb_iv_set(object, "@column_start",    LONG2NUM(token->column_start));
+    (void)rb_iv_set(object, "@column_stop",     LONG2NUM(token->column_stop));
+    (void)rb_iv_set(object, "@code_point",      INT2NUM(token->code_point));
+
+    // look-up the token type
+    VALUE types = Wikitext_parser_token_types(Qnil);
+    VALUE type  = rb_hash_aref(types, INT2FIX(token->type));
+    (void)rb_iv_set(object, "@token_type",      type);
+    (void)rb_iv_set(object, "@string_value",    rb_str_new(token->start, token->stop - token->start));
+    return object;
+}
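A brief sketch of how these helpers surface in Ruby, using the names registered for them in wikitext.c below: Wikitext_parser_token_types is exposed as the token_types singleton method on Wikitext::Parser::Token, and the resulting hash maps the integer values of the token_types enum in token.h to downcased symbols.

    require 'wikitext'
    types  = Wikitext::Parser::Token.token_types   # eg. { 0 => :no_token, 1 => :p, 2 => :li, ... }
    tokens = Wikitext::Parser.new.tokenize('== a heading ==')
    # each element is a Wikitext::Parser::Token built by _Wikitext_token above,
    # with @token_type and @string_value set as instance variables
    p tokens.length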
diff --git a/ext/token.h b/ext/token.h
new file mode 100644 (file)
index 0000000..76a942b
--- /dev/null
@@ -0,0 +1,84 @@
+// Copyright 2008 Wincent Colaiuta
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+#include <ruby/ruby.h>
+#include <stdint.h>     /* uint32_t */
+
+#define TOKEN_TEXT(token)   rb_str_new((const char *)token->start, (token->stop + 1 - token->start))
+
+typedef struct
+{
+    char        *start;
+    char        *stop;
+    size_t      line_start;
+    size_t      line_stop;
+    size_t      column_start;
+    size_t      column_stop;
+    uint32_t    code_point;
+    int         type;
+} token_t;
+
+enum token_types {
+    NO_TOKEN,
+    P,              // imaginary token (never explicitly marked up)
+    LI,             // imaginary token (never explicitly marked up)
+    PRE,
+    NO_WIKI_START,
+    NO_WIKI_END,
+    BLOCKQUOTE,
+    STRONG_EM,
+    STRONG,
+    EM,
+    TT_START,
+    TT_END,
+    TT,
+    OL,
+    UL,
+    H6_START,
+    H5_START,
+    H4_START,
+    H3_START,
+    H2_START,
+    H1_START,
+    H6_END,
+    H5_END,
+    H4_END,
+    H3_END,
+    H2_END,
+    H1_END,
+    URI,
+    LINK_START,
+    LINK_END,
+    EXT_LINK_START,
+    EXT_LINK_END,
+    SEPARATOR,
+    SPACE,
+    QUOT_ENTITY,
+    AMP_ENTITY,
+    NAMED_ENTITY,
+    HEX_ENTITY,
+    DECIMAL_ENTITY,
+    QUOT,
+    AMP,
+    LESS,
+    GREATER,
+    CRLF,
+    PRINTABLE,
+    DEFAULT,
+    END_OF_FILE
+};
+
+VALUE Wikitext_parser_token_types(VALUE self);
+
+VALUE _Wikitext_token(token_t *token);
index 5cd2d1f7fc312bf1f97727dee9537da7f8dfe747..1ea233a12ba302167fa3147a356cd55029f06f7f 100644 (file)
 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 #include "wikitext_ragel.h"
+#include "parser.h"
 
-static VALUE mWikitext              = 0;    // Wikitext
-static VALUE cWikitextParser        = 0;    // Wikitext::Parser
-static VALUE cWikitextParserToken   = 0;    // Wikitext::Parser::Token
-VALUE eWikitextError                = 0;    // Wikitext::Error
-
-// return a hash of token types
-// we make this available for unit testing purposes
-VALUE Wikitext_token_types(VALUE self)
-{
-    VALUE hash = rb_hash_new();
-
-#define SET_TOKEN_TYPE(identifier)  (void)rb_hash_aset(hash, INT2FIX(identifier), \
-    rb_funcall(rb_funcall(rb_str_new2(#identifier), rb_intern("downcase"), 0), rb_intern("to_sym"), 0))
-
-    SET_TOKEN_TYPE(NO_TOKEN);
-    SET_TOKEN_TYPE(P);
-    SET_TOKEN_TYPE(LI);
-    SET_TOKEN_TYPE(PRE);
-    SET_TOKEN_TYPE(NO_WIKI_START);
-    SET_TOKEN_TYPE(NO_WIKI_END);
-    SET_TOKEN_TYPE(BLOCKQUOTE);
-    SET_TOKEN_TYPE(STRONG_EM);
-    SET_TOKEN_TYPE(STRONG);
-    SET_TOKEN_TYPE(EM);
-    SET_TOKEN_TYPE(TT_START);
-    SET_TOKEN_TYPE(TT_END);
-    SET_TOKEN_TYPE(TT);
-    SET_TOKEN_TYPE(OL);
-    SET_TOKEN_TYPE(UL);
-    SET_TOKEN_TYPE(H6_START);
-    SET_TOKEN_TYPE(H5_START);
-    SET_TOKEN_TYPE(H4_START);
-    SET_TOKEN_TYPE(H3_START);
-    SET_TOKEN_TYPE(H2_START);
-    SET_TOKEN_TYPE(H1_START);
-    SET_TOKEN_TYPE(H6_END);
-    SET_TOKEN_TYPE(H5_END);
-    SET_TOKEN_TYPE(H4_END);
-    SET_TOKEN_TYPE(H3_END);
-    SET_TOKEN_TYPE(H2_END);
-    SET_TOKEN_TYPE(H1_END);
-    SET_TOKEN_TYPE(URI);
-    SET_TOKEN_TYPE(LINK_START);
-    SET_TOKEN_TYPE(LINK_END);
-    SET_TOKEN_TYPE(EXT_LINK_START);
-    SET_TOKEN_TYPE(EXT_LINK_END);
-    SET_TOKEN_TYPE(SEPARATOR);
-    SET_TOKEN_TYPE(SPACE);
-    SET_TOKEN_TYPE(QUOT_ENTITY);
-    SET_TOKEN_TYPE(AMP_ENTITY);
-    SET_TOKEN_TYPE(NAMED_ENTITY);
-    SET_TOKEN_TYPE(HEX_ENTITY);
-    SET_TOKEN_TYPE(DECIMAL_ENTITY);
-    SET_TOKEN_TYPE(QUOT);
-    SET_TOKEN_TYPE(AMP);
-    SET_TOKEN_TYPE(LESS);
-    SET_TOKEN_TYPE(GREATER);
-    SET_TOKEN_TYPE(CRLF);
-    SET_TOKEN_TYPE(PRINTABLE);
-    SET_TOKEN_TYPE(DEFAULT);
-    SET_TOKEN_TYPE(END_OF_FILE);
-
-#undef SET_TOKEN_TYPE
-
-    return hash;
-}
-
-VALUE Wikitext_parser_initialize(VALUE self)
-{
-    // no need to call super here; rb_call_super()
-    return self;
-}
-
-// for testing and debugging only
-VALUE _Wikitext_token(token_t *token)
-{
-    VALUE object = rb_class_new_instance(0, NULL, cWikitextParserToken);
-    (void)rb_iv_set(object, "@start",           LONG2NUM((long)token->start));
-    (void)rb_iv_set(object, "@stop",            LONG2NUM((long)token->stop));
-    (void)rb_iv_set(object, "@line_start",      LONG2NUM(token->line_start));
-    (void)rb_iv_set(object, "@line_stop",       LONG2NUM(token->line_stop));
-    (void)rb_iv_set(object, "@column_start",    LONG2NUM(token->column_start));
-    (void)rb_iv_set(object, "@column_stop",     LONG2NUM(token->column_stop));
-    (void)rb_iv_set(object, "@code_point",      INT2NUM(token->code_point));
-
-    // look-up the token type
-    VALUE types = Wikitext_token_types(Qnil);
-    VALUE type  = rb_hash_aref(types, INT2FIX(token->type));
-    (void)rb_iv_set(object, "@token_type",      type);
-    (void)rb_iv_set(object, "@string_value",    rb_str_new(token->start, token->stop - token->start));
-    return object;
-}
-
-// for testing and debugging only
-VALUE Wikitext_parser_tokenize(VALUE self, VALUE string)
-{
-    if (NIL_P(string))
-        return Qnil;
-    string = StringValue(string);
-    VALUE tokens = rb_ary_new();
-    char *p = RSTRING_PTR(string);
-    long len = RSTRING_LEN(string);
-    char *pe = p + len;
-    token_t token;
-    next_token(&token, NULL, p, pe);
-    rb_ary_push(tokens, _Wikitext_token(&token));
-    while (token.type != END_OF_FILE)
-    {
-        next_token(&token, &token, NULL, pe);
-        rb_ary_push(tokens, _Wikitext_token(&token));
-    }
-    return tokens;
-}
-
-// for benchmarking raw tokenization speed only
-VALUE Wikitext_parser_benchmarking_tokenize(VALUE self, VALUE string)
-{
-    if (NIL_P(string))
-        return Qnil;
-    string = StringValue(string);
-    char *p = RSTRING_PTR(string);
-    long len = RSTRING_LEN(string);
-    char *pe = p + len;
-    token_t token;
-    next_token(&token, NULL, p, pe);
-    while (token.type != END_OF_FILE)
-        next_token(&token, &token, NULL, pe);
-    return Qnil;
-}
+VALUE mWikitext              = 0;   // Wikitext
+VALUE cWikitextParser        = 0;   // Wikitext::Parser
+VALUE eWikitextParserError   = 0;   // Wikitext::Parser::Error
+VALUE cWikitextParserToken   = 0;   // Wikitext::Parser::Token
 
 void Init_wikitext()
 {
     // Wikitext
     mWikitext = rb_define_module("Wikitext");
-    rb_define_singleton_method(mWikitext, "token_types", Wikitext_token_types, 0);
-
-    // Wikitext::Error
-    eWikitextError = rb_define_class_under(mWikitext, "Error", rb_eException);
 
     // Wikitext::Parser
     cWikitextParser = rb_define_class_under(mWikitext, "Parser", rb_cObject);
     rb_define_method(cWikitextParser, "initialize", Wikitext_parser_initialize, 0);
+    rb_define_method(cWikitextParser, "parse", Wikitext_parser_parse, 1);
     rb_define_method(cWikitextParser, "tokenize", Wikitext_parser_tokenize, 1);
     rb_define_method(cWikitextParser, "benchmarking_tokenize", Wikitext_parser_benchmarking_tokenize, 1);
 
+    // sanitizes an internal link target for inclusion in the HTML stream; for example, a link target for the article titled:
+    //      foo, "bar" & baz €
+    // would be sanitized as:
+    //      foo, &quot;bar&quot; &amp; baz &#x20ac;
+    rb_define_singleton_method(cWikitextParser, "sanitize_link_target", Wikitext_sanitize_link_target, 1);
+
+    // encodes an internal link target for use as an anchor href; for example, the link target:
+    //      foo, "bar" & baz €
+    // would be encoded as:
+    //      foo%2c%20%22bar%22%20%26%20baz%e2%82%ac
+    // and used as follows (combined with the output of sanitize_link_target):
+    //      <a href="foo%2c%20%22bar%22%20%26%20baz%e2%82%ac">foo, &quot;bar&quot; &amp; baz &#x20ac;</a>
+    rb_define_singleton_method(cWikitextParser, "encode_link_target", Wikitext_encode_link_target, 1);
+
+    // override default line_ending
+    // defaults to "\n"
+    rb_define_attr(cWikitextParser, "line_ending", Qtrue, Qtrue);
+
+    // the prefix to be prepended to internal links; defaults to "/wiki/"
+    // for example, given an internal_link_prefix of "/wiki/"
+    //      [[Apple]]
+    // would be transformed into:
+    //      <a href="/wiki/Apple">Apple</a>
+    rb_define_attr(cWikitextParser, "internal_link_prefix", Qtrue, Qtrue);
+
+    // CSS class to be applied to external links; defaults to "external"
+    // for example, given an external_link_class of "external":
+    //      [http://www.google.com/ the best search engine]
+    // would be transformed into:
+    //      <a class="external" href="http://www.google.com/">the best search engine</a>
+    rb_define_attr(cWikitextParser, "external_link_class", Qtrue, Qtrue);
+
+    // CSS class to be applied to external links; defaults to "mailto"
+    // for example:
+    //      [mailto:user@example.com user@example.com]
+    // or if autolinking of email addresses is turned on (not yet implemented) just
+    //      user@example.com
+    // would be transformed into:
+    //      <a class="mailto" href="mailto:user@example.com">user@example.com</a>
+    rb_define_attr(cWikitextParser, "mailto_class", Qtrue, Qtrue);
+
+    // whether to autolink URIs found in the plain scope
+    // when true:
+    //      http://apple.com/
+    // will be transformed to:
+    //      <a href="http://apple.com/">http://apple.com/</a>
+    // and if an external_link_class is set (to "external", for example) then the transformation will be:
+    //      <a class="external" href="http://apple.com/">http://apple.com/</a>
+    rb_define_attr(cWikitextParser, "autolink", Qtrue, Qtrue);
+
+    // whether "slash" in link text is treated specially
+    // when true, any link containing a slash is considered to be a relative link within the current site, but outside the wiki
+    // in other words, while:
+    //      [[interesting article]]
+    // is a wiki link (assuming the internal_link_prefix of "/wiki/"):
+    //      <a href="/wiki/interesting+article">interesting article</a>
+    // in contrast:
+    //      [[issue/400]]
+    // is interpreted as a link external to the wiki but internal to the site, and is converted into:
+    //      <a href="/issue/400">issue/400</a>
+    // this design is intended to work well with preprocessors that can scan the input for things like:
+    //      issue #400
+    // and transform them before feeding them into the wikitext parser as:
+    //      [[issue/400|issue #400]]
+    // which in turn would be transformed into:
+    //      <a href="/issue/400">issue #400</a>
+    rb_define_attr(cWikitextParser, "treat_slash_as_special", Qtrue, Qtrue);
+
+    // Wikitext::Parser::Error
+    eWikitextParserError = rb_define_class_under(mWikitext, "Error", rb_eException);
+
     // Wikitext::Parser::Token
     cWikitextParserToken = rb_define_class_under(cWikitextParser, "Token", rb_cObject);
+    rb_define_singleton_method(cWikitextParserToken, "token_types", Wikitext_parser_token_types, 0);
     rb_define_attr(cWikitextParserToken, "start", Qtrue, Qfalse);
     rb_define_attr(cWikitextParserToken, "stop", Qtrue, Qfalse);
     rb_define_attr(cWikitextParserToken, "line_start", Qtrue, Qfalse);
index ac6097e36c7ff46d84b4dba0db847de956fd39f1..64d2faf556af64d1bd05a96bed2e46faa1a17905 100644 (file)
 #include <ruby/ruby.h>
 #include <stdint.h>
 
-// error raised when scanning fails
-extern VALUE eWikitextError;
+// Wikitext
+extern VALUE mWikitext;
+
+// Wikitext::Parser
+extern VALUE cWikitextParser;
 
-typedef struct
-{
-    char        *start;
-    char        *stop;
-    size_t      line_start;
-    size_t      line_stop;
-    size_t      column_start;
-    size_t      column_stop;
-    uint32_t    code_point;
-    int         type;
-} token_t;
+// Wikitext::Parser::Error
+// error raised when scanning fails
+extern VALUE eWikitextParserError;
 
-enum token_types {
-    NO_TOKEN,
-    P,              // imaginary token (never explicitly marked up)
-    LI,             // imaginary token (never explicitly marked up)
-    PRE,
-    NO_WIKI_START,
-    NO_WIKI_END,
-    BLOCKQUOTE,
-    STRONG_EM,
-    STRONG,
-    EM,
-    TT_START,
-    TT_END,
-    TT,
-    OL,
-    UL,
-    H6_START,
-    H5_START,
-    H4_START,
-    H3_START,
-    H2_START,
-    H1_START,
-    H6_END,
-    H5_END,
-    H4_END,
-    H3_END,
-    H2_END,
-    H1_END,
-    URI,
-    LINK_START,
-    LINK_END,
-    EXT_LINK_START,
-    EXT_LINK_END,
-    SEPARATOR,
-    SPACE,
-    QUOT_ENTITY,
-    AMP_ENTITY,
-    NAMED_ENTITY,
-    HEX_ENTITY,
-    DECIMAL_ENTITY,
-    QUOT,
-    AMP,
-    LESS,
-    GREATER,
-    CRLF,
-    PRINTABLE,
-    DEFAULT,
-    END_OF_FILE
-};
+// Wikitext::Parser::Token
+extern VALUE cWikitextParserToken;
index 1d213b9c696c8a0e367b502b982fda1ac3f0a8f8..1a43a0ed9816e1cf307365a88fa182eb1afe6ca0 100644 (file)
@@ -20,6 +20,7 @@
 //----------------------------------------------------------------------//
 
 #include "wikitext_ragel.h"
+#include "wikitext.h"
 #include <stdio.h>
 
 #define EMIT(t)     do { out->type = t; out->stop = p + 1; out->column_stop += (out->stop - out->start); } while (0)
@@ -27,7 +28,7 @@
 #define REWIND()    do { p = mark; } while (0)
 
 
-#line 31 "wikitext_ragel.c"
+#line 32 "wikitext_ragel.c"
 static const char _wikitext_actions[] = {
        0, 1, 0, 1, 2, 1, 3, 1, 
        4, 1, 10, 1, 11, 1, 12, 1, 
@@ -49,11 +50,11 @@ static const short _wikitext_key_offsets[] = {
        26, 33, 36, 44, 52, 59, 67, 75, 
        83, 90, 94, 96, 98, 100, 102, 104, 
        105, 107, 108, 110, 112, 114, 116, 118, 
-       119, 121, 122, 138, 171, 172, 186, 193
-       208, 223, 238, 253, 258, 273, 274, 289
-       290, 305, 306, 321, 322, 337, 338, 353
-       354, 355, 371, 387, 402, 417, 432, 452
-       472, 488, 504, 520, 536, 537
+       119, 121, 122, 138, 171, 172, 173, 187
+       194, 209, 224, 239, 254, 259, 274, 275
+       290, 291, 306, 307, 322, 323, 338, 339
+       354, 355, 356, 372, 388, 403, 418, 433
+       453, 473, 489, 505, 521, 537, 538
 };
 
 static const char _wikitext_trans_keys[] = {
@@ -78,53 +79,53 @@ static const char _wikitext_trans_keys[] = {
        39, 42, 60, 61, 62, 70, 72, 83, 
        91, 93, 96, 102, 104, 115, 124, 126, 
        127, -62, -33, -32, -17, -16, -12, 1, 
-       31, 33, 123, 10, 33, 61, 92, 126, 
-       35, 37, 39, 59, 63, 90, 94, 95, 
-       97, 123, 35, 97, 113, 65, 90, 98, 
-       122, 33, 39, 61, 92, 126, 35, 37, 
+       31, 33, 123, 10, 32, 33, 61, 92, 
+       126, 35, 37, 39, 59, 63, 90, 94, 
+       95, 97, 123, 35, 97, 113, 65, 90, 
+       98, 122, 33, 39, 61, 92, 126, 35, 
+       37, 40, 59, 63, 90, 94, 95, 97, 
+       123, 33, 39, 61, 92, 126, 35, 37, 
        40, 59, 63, 90, 94, 95, 97, 123, 
        33, 39, 61, 92, 126, 35, 37, 40, 
        59, 63, 90, 94, 95, 97, 123, 33, 
        39, 61, 92, 126, 35, 37, 40, 59, 
-       63, 90, 94, 95, 97, 123, 33, 39
-       61, 92, 126, 35, 37, 40, 59, 63
-       90, 94, 95, 97, 123, 47, 78, 84
-       110, 116, 32, 33, 61, 92, 126, 35
-       37, 39, 59, 63, 90, 94, 95, 97
-       123, 32, 32, 33, 61, 92, 126, 35
-       37, 39, 59, 63, 90, 94, 95, 97
-       123, 32, 32, 33, 61, 92, 126, 35
-       37, 39, 59, 63, 90, 94, 95, 97
-       123, 32, 32, 33, 61, 92, 126, 35
-       37, 39, 59, 63, 90, 94, 95, 97
-       123, 32, 32, 33, 61, 92, 126, 35
-       37, 39, 59, 63, 90, 94, 95, 97
-       123, 32, 32, 33, 61, 92, 126, 35
-       37, 39, 59, 63, 90, 94, 95, 97
-       123, 32, 32, 33, 61, 84, 92, 116
-       126, 35, 37, 39, 59, 63, 90, 94
-       95, 97, 123, 33, 61, 80, 92, 112, 
+       63, 90, 94, 95, 97, 123, 47, 78
+       84, 110, 116, 32, 33, 61, 92, 126
+       35, 37, 39, 59, 63, 90, 94, 95
+       97, 123, 32, 32, 33, 61, 92, 126
+       35, 37, 39, 59, 63, 90, 94, 95
+       97, 123, 32, 32, 33, 61, 92, 126
+       35, 37, 39, 59, 63, 90, 94, 95
+       97, 123, 32, 32, 33, 61, 92, 126
+       35, 37, 39, 59, 63, 90, 94, 95
+       97, 123, 32, 32, 33, 61, 92, 126
+       35, 37, 39, 59, 63, 90, 94, 95
+       97, 123, 32, 32, 33, 61, 92, 126
+       35, 37, 39, 59, 63, 90, 94, 95
+       97, 123, 32, 32, 33, 61, 84, 92
+       116, 126, 35, 37, 39, 59, 63, 90
+       94, 95, 97, 123, 33, 61, 80, 92
+       112, 126, 35, 37, 39, 59, 63, 90
+       94, 95, 97, 123, 33, 58, 61, 92, 
        126, 35, 37, 39, 59, 63, 90, 94, 
-       95, 97, 123, 33, 58, 61, 92, 126, 
+       95, 97, 123, 33, 47, 61, 92, 126, 
        35, 37, 39, 59, 63, 90, 94, 95, 
        97, 123, 33, 47, 61, 92, 126, 35, 
        37, 39, 59, 63, 90, 94, 95, 97, 
-       123, 33, 47, 61, 92, 126, 35, 37, 
-       39, 59, 63, 90, 94, 95, 97, 123, 
-       33, 38, 41, 44, 46, 61, 63, 92, 
-       94, 95, 123, 126, 35, 57, 58, 59, 
-       64, 90, 97, 122, 33, 38, 41, 44, 
-       46, 61, 63, 92, 94, 95, 123, 126, 
-       35, 57, 58, 59, 64, 90, 97, 122, 
-       33, 41, 44, 46, 61, 63, 95, 126, 
-       35, 57, 58, 59, 64, 90, 97, 122, 
-       33, 61, 84, 92, 116, 126, 35, 37, 
-       39, 59, 63, 90, 94, 95, 97, 123, 
-       33, 61, 86, 92, 118, 126, 35, 37, 
-       39, 59, 63, 90, 94, 95, 97, 123, 
-       33, 61, 78, 92, 110, 126, 35, 37, 
-       39, 59, 63, 90, 94, 95, 97, 123, 
-       91, 93, 0
+       123, 33, 38, 41, 44, 46, 61, 63, 
+       92, 94, 95, 123, 126, 35, 57, 58, 
+       59, 64, 90, 97, 122, 33, 38, 41, 
+       44, 46, 61, 63, 92, 94, 95, 123, 
+       126, 35, 57, 58, 59, 64, 90, 97, 
+       122, 33, 41, 44, 46, 61, 63, 95, 
+       126, 35, 57, 58, 59, 64, 90, 97, 
+       122, 33, 61, 84, 92, 116, 126, 35, 
+       37, 39, 59, 63, 90, 94, 95, 97, 
+       123, 33, 61, 86, 92, 118, 126, 35, 
+       37, 39, 59, 63, 90, 94, 95, 97, 
+       123, 33, 61, 78, 92, 110, 126, 35, 
+       37, 39, 59, 63, 90, 94, 95, 97, 
+       123, 91, 93, 0
 };
 
 static const char _wikitext_single_lengths[] = {
@@ -132,11 +133,11 @@ static const char _wikitext_single_lengths[] = {
        1, 1, 2, 2, 1, 2, 2, 2, 
        1, 4, 2, 2, 2, 2, 2, 1, 
        2, 1, 2, 2, 2, 2, 2, 1, 
-       2, 1, 8, 23, 1, 4, 3, 5
-       5, 5, 5, 5, 5, 1, 5, 1
-       5, 1, 5, 1, 5, 1, 5, 1
-       1, 6, 6, 5, 5, 5, 12, 12, 
-       8, 6, 6, 6, 1, 1
+       2, 1, 8, 23, 1, 1, 4, 3
+       5, 5, 5, 5, 5, 5, 1, 5
+       1, 5, 1, 5, 1, 5, 1, 5
+       1, 1, 6, 6, 5, 5, 5, 12, 
+       12, 8, 6, 6, 6, 1, 1
 };
 
 static const char _wikitext_range_lengths[] = {
@@ -144,11 +145,11 @@ static const char _wikitext_range_lengths[] = {
        3, 1, 3, 3, 3, 3, 3, 3, 
        3, 0, 0, 0, 0, 0, 0, 0, 
        0, 0, 0, 0, 0, 0, 0, 0, 
-       0, 0, 4, 5, 0, 5, 2, 5
-       5, 5, 5, 0, 5, 0, 5, 0
-       5, 0, 5, 0, 5, 0, 5, 0
-       0, 5, 5, 5, 5, 5, 4, 4, 
-       4, 5, 5, 5, 0, 0
+       0, 0, 4, 5, 0, 0, 5, 2
+       5, 5, 5, 5, 0, 5, 0, 5
+       0, 5, 0, 5, 0, 5, 0, 5
+       0, 0, 5, 5, 5, 5, 5, 4, 
+       4, 4, 5, 5, 5, 0, 0
 };
 
 static const short _wikitext_index_offsets[] = {
@@ -156,11 +157,11 @@ static const short _wikitext_index_offsets[] = {
        22, 27, 30, 36, 42, 47, 53, 59, 
        65, 70, 75, 78, 81, 84, 87, 90, 
        92, 95, 97, 100, 103, 106, 109, 112, 
-       114, 117, 119, 132, 161, 163, 173, 179
-       190, 201, 212, 223, 229, 240, 242, 253
-       255, 266, 268, 279, 281, 292, 294, 305
-       307, 309, 321, 333, 344, 355, 366, 383
-       400, 413, 425, 437, 449, 451
+       114, 117, 119, 132, 161, 163, 165, 175
+       181, 192, 203, 214, 225, 231, 242, 244
+       255, 257, 268, 270, 281, 283, 294, 296
+       307, 309, 311, 323, 335, 346, 357, 368
+       385, 402, 415, 427, 439, 451, 453
 };
 
 static const char _wikitext_indicies[] = {
@@ -184,43 +185,43 @@ static const char _wikitext_indicies[] = {
        48, 49, 50, 51, 52, 53, 54, 55, 
        56, 57, 58, 59, 60, 55, 56, 57, 
        61, 46, 0, 2, 3, 42, 0, 46, 
-       1, 43, 63, 46, 46, 46, 46, 46, 
-       46, 46, 46, 46, 64, 66, 67, 68, 
-       12, 12, 65, 46, 70, 46, 46, 46, 
-       46, 46, 46, 46, 46, 69, 46, 72, 
-       46, 46, 46, 46, 46, 46, 46, 46, 
-       71, 46, 74, 46, 46, 46, 46, 46, 
-       46, 46, 46, 73, 46, 75, 46, 46, 
-       46, 46, 46, 46, 46, 46, 69, 77, 
-       78, 79, 78, 79, 76, 81, 46, 82, 
-       46, 46, 46, 46, 46, 46, 46, 80, 
-       81, 80, 84, 46, 85, 46, 46, 46, 
-       46, 46, 46, 46, 83, 84, 83, 87, 
-       46, 88, 46, 46, 46, 46, 46, 46, 
-       46, 86, 87, 86, 90, 46, 91, 46, 
-       46, 46, 46, 46, 46, 46, 89, 90, 
-       89, 93, 46, 94, 46, 46, 46, 46, 
-       46, 46, 46, 92, 93, 92, 96, 46, 
+       1, 43, 63, 65, 64, 46, 46, 46, 
+       46, 46, 46, 46, 46, 46, 66, 68, 
+       69, 70, 12, 12, 67, 46, 72, 46, 
+       46, 46, 46, 46, 46, 46, 46, 71, 
+       46, 74, 46, 46, 46, 46, 46, 46, 
+       46, 46, 73, 46, 76, 46, 46, 46, 
+       46, 46, 46, 46, 46, 75, 46, 77, 
        46, 46, 46, 46, 46, 46, 46, 46, 
-       95, 96, 95, 98, 97, 46, 46, 99, 
-       46, 99, 46, 46, 46, 46, 46, 46, 
-       69, 46, 46, 100, 46, 100, 46, 46, 
-       46, 46, 46, 46, 69, 46, 101, 46, 
-       46, 46, 46, 46, 46, 46, 46, 69, 
-       46, 102, 46, 46, 46, 46, 46, 46, 
-       46, 46, 69, 46, 103, 46, 46, 46, 
-       46, 46, 46, 46, 46, 69, 46, 41, 
-       46, 46, 46, 104, 46, 46, 46, 104, 
-       46, 104, 104, 46, 104, 104, 69, 105, 
-       41, 105, 105, 105, 104, 105, 46, 46, 
-       104, 46, 104, 104, 105, 104, 104, 64, 
-       40, 40, 40, 40, 41, 40, 41, 41, 
-       41, 40, 41, 41, 106, 46, 46, 55, 
-       46, 55, 46, 46, 46, 46, 46, 46, 
-       69, 46, 46, 107, 46, 107, 46, 46, 
-       46, 46, 46, 46, 69, 46, 46, 100, 
-       46, 100, 46, 46, 46, 46, 46, 46, 
-       69, 109, 108, 111, 110, 0
+       71, 79, 80, 81, 80, 81, 78, 83, 
+       46, 84, 46, 46, 46, 46, 46, 46, 
+       46, 82, 83, 82, 86, 46, 87, 46, 
+       46, 46, 46, 46, 46, 46, 85, 86, 
+       85, 89, 46, 90, 46, 46, 46, 46, 
+       46, 46, 46, 88, 89, 88, 92, 46, 
+       93, 46, 46, 46, 46, 46, 46, 46, 
+       91, 92, 91, 95, 46, 96, 46, 46, 
+       46, 46, 46, 46, 46, 94, 95, 94, 
+       98, 46, 46, 46, 46, 46, 46, 46, 
+       46, 46, 97, 98, 97, 100, 99, 46, 
+       46, 101, 46, 101, 46, 46, 46, 46, 
+       46, 46, 71, 46, 46, 102, 46, 102, 
+       46, 46, 46, 46, 46, 46, 71, 46, 
+       103, 46, 46, 46, 46, 46, 46, 46, 
+       46, 71, 46, 104, 46, 46, 46, 46, 
+       46, 46, 46, 46, 71, 46, 105, 46, 
+       46, 46, 46, 46, 46, 46, 46, 71, 
+       46, 41, 46, 46, 46, 106, 46, 46, 
+       46, 106, 46, 106, 106, 46, 106, 106, 
+       71, 107, 41, 107, 107, 107, 106, 107, 
+       46, 46, 106, 46, 106, 106, 107, 106, 
+       106, 66, 40, 40, 40, 40, 41, 40, 
+       41, 41, 41, 40, 41, 41, 108, 46, 
+       46, 55, 46, 55, 46, 46, 46, 46, 
+       46, 46, 71, 46, 46, 109, 46, 109, 
+       46, 46, 46, 46, 46, 46, 71, 46, 
+       46, 102, 46, 102, 46, 46, 46, 46, 
+       46, 46, 71, 111, 110, 113, 112, 0
 };
 
 static const char _wikitext_trans_targs_wi[] = {
@@ -229,32 +230,34 @@ static const char _wikitext_trans_targs_wi[] = {
        14, 15, 16, 35, 35, 18, 24, 19, 
        20, 21, 22, 23, 35, 25, 35, 27, 
        28, 29, 30, 31, 35, 33, 35, 35, 
-       34, 64, 3, 35, 36, 35, 37, 35, 
-       37, 38, 39, 37, 43, 44, 56, 57, 
-       65, 66, 68, 69, 35, 35, 35, 35, 
-       35, 35, 4, 10, 13, 35, 40, 35, 
-       41, 35, 42, 37, 35, 17, 26, 32, 
-       35, 45, 46, 35, 47, 48, 35, 49, 
-       50, 35, 51, 52, 35, 53, 54, 35, 
-       55, 35, 35, 58, 59, 60, 61, 62, 
-       63, 63, 35, 67, 35, 35, 35, 35
+       34, 65, 3, 35, 36, 37, 38, 35, 
+       38, 39, 40, 38, 44, 45, 57, 58, 
+       66, 67, 69, 70, 35, 35, 35, 35, 
+       35, 37, 35, 35, 4, 10, 13, 35, 
+       41, 35, 42, 35, 43, 38, 35, 17, 
+       26, 32, 35, 46, 47, 35, 48, 49, 
+       35, 50, 51, 35, 52, 53, 35, 54, 
+       55, 35, 56, 35, 35, 59, 60, 61, 
+       62, 63, 64, 64, 35, 68, 35, 35, 
+       35, 35
 };
 
 static const char _wikitext_trans_actions_wi[] = {
-       43, 0, 0, 0, 79, 0, 0, 37
-       0, 35, 0, 33, 0, 0, 0, 31
-       0, 0, 0, 29, 81, 0, 0, 0, 
+       41, 0, 0, 0, 79, 0, 0, 35
+       0, 33, 0, 31, 0, 0, 0, 29
+       0, 0, 0, 27, 81, 0, 0, 0, 
        0, 0, 0, 0, 11, 0, 17, 0, 
        0, 0, 0, 0, 9, 0, 15, 77, 
-       0, 7, 0, 41, 0, 21, 100, 39
+       0, 7, 0, 39, 0, 1, 100, 37
        91, 7, 0, 94, 7, 1, 1, 0, 
-       0, 0, 0, 0, 13, 27, 85, 73, 
-       83, 69, 0, 0, 0, 75, 0, 47, 
-       0, 45, 0, 88, 71, 0, 0, 0, 
-       61, 0, 1, 59, 0, 1, 57, 0, 
-       1, 55, 0, 1, 53, 0, 1, 51, 
-       0, 49, 19, 0, 0, 0, 0, 0, 
-       97, 100, 63, 0, 65, 23, 67, 25
+       0, 0, 0, 0, 13, 25, 85, 73, 
+       49, 0, 83, 69, 0, 0, 0, 75, 
+       0, 45, 0, 43, 0, 88, 71, 0, 
+       0, 0, 61, 0, 1, 59, 0, 1, 
+       57, 0, 1, 55, 0, 1, 53, 0, 
+       1, 51, 0, 47, 19, 0, 0, 0, 
+       0, 0, 97, 100, 63, 0, 65, 21, 
+       67, 23
 };
 
 static const char _wikitext_to_state_actions[] = {
@@ -266,7 +269,7 @@ static const char _wikitext_to_state_actions[] = {
        0, 0, 0, 0, 0, 0, 0, 0, 
        0, 0, 0, 0, 0, 0, 0, 0, 
        0, 0, 0, 0, 0, 0, 0, 0, 
-       0, 0, 0, 0, 0, 0
+       0, 0, 0, 0, 0, 0, 0
 };
 
 static const char _wikitext_from_state_actions[] = {
@@ -278,7 +281,7 @@ static const char _wikitext_from_state_actions[] = {
        0, 0, 0, 0, 0, 0, 0, 0, 
        0, 0, 0, 0, 0, 0, 0, 0, 
        0, 0, 0, 0, 0, 0, 0, 0, 
-       0, 0, 0, 0, 0, 0
+       0, 0, 0, 0, 0, 0, 0
 };
 
 static const char _wikitext_eof_trans[] = {
@@ -286,11 +289,11 @@ static const char _wikitext_eof_trans[] = {
        5, 5, 5, 5, 5, 5, 5, 5, 
        5, 21, 21, 21, 21, 21, 21, 21, 
        21, 21, 21, 21, 21, 21, 21, 21, 
-       21, 21, 40, 0, 63, 65, 66, 70
-       72, 74, 70, 77, 81, 81, 84, 84
-       87, 87, 90, 90, 93, 93, 96, 96
-       98, 70, 70, 70, 70, 70, 70, 65
-       107, 70, 70, 70, 109, 111
+       21, 21, 40, 0, 63, 65, 67, 68
+       72, 74, 76, 72, 79, 83, 83, 86
+       86, 89, 89, 92, 92, 95, 95, 98
+       98, 100, 72, 72, 72, 72, 72, 72
+       67, 109, 72, 72, 72, 111, 113
 };
 
 static const int wikitext_start = 35;
@@ -299,7 +302,7 @@ static const int wikitext_error = 0;
 
 static const int wikitext_en_main = 35;
 
-#line 390 "wikitext_ragel.rl"
+#line 394 "wikitext_ragel.rl"
 
 
 // for now we use the scanner as a tokenizer that returns one token at a time, just like the old ANTLR lexer did
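
To make that comment concrete: the intended calling convention is a pull loop in which the parser hands next_token() the current scan position and the buffer end, then advances to the stop offset recorded in the returned token. The following is a minimal sketch of such a driver, assuming only the next_token() signature and the token_t fields (type, start, stop) visible in this diff; scan_buffer() and handle_token() are hypothetical names invented for illustration and are not part of the extension.

#include <stdio.h>
#include "wikitext_ragel.h"     // declares next_token() and (via token.h) token_t

static void handle_token(token_t *token)    // hypothetical consumer of scanned tokens
{
    printf("token type %d spans %ld bytes\n", token->type,
        (long)(token->stop - token->start));
}

static void scan_buffer(char *buf, long len)
{
    char    *p    = buf;
    char    *pe   = buf + len;
    token_t current;
    token_t previous;
    token_t *last = NULL;                   // no previous token before the first call

    while (p < pe)
    {
        // next_token() raises a Ruby exception if it cannot produce a token,
        // so a normal return always leaves a usable token in `current`;
        // column bookkeeping is carried forward via the last-token pointer
        next_token(&current, last, p, pe);
        handle_token(&current);
        p        = current.stop;            // EMIT() sets stop to one past the token
        previous = current;
        last     = &previous;
    }
}

Passing the previous token back in is what lets the scanner make context-sensitive decisions, such as the column_start/BLOCKQUOTE checks in the actions below.
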
@@ -342,16 +345,16 @@ void next_token(token_t *out, token_t *last_token, char *p, char *pe)
     char    *te;        // token end (scanner)
     int     act;        // identity of last pattern matched (scanner)
     
-#line 346 "wikitext_ragel.c"
+#line 349 "wikitext_ragel.c"
        {
        cs = wikitext_start;
        ts = 0;
        te = 0;
        act = 0;
        }
-#line 432 "wikitext_ragel.rl"
+#line 436 "wikitext_ragel.rl"
     
-#line 355 "wikitext_ragel.c"
+#line 358 "wikitext_ragel.c"
        {
        int _klen;
        unsigned int _trans;
@@ -372,7 +375,7 @@ _resume:
 #line 1 "wikitext_ragel.rl"
        {ts = p;}
        break;
-#line 376 "wikitext_ragel.c"
+#line 379 "wikitext_ragel.c"
                }
        }
 
@@ -439,13 +442,13 @@ _eof_trans:
                switch ( *_acts++ )
                {
        case 0:
-#line 32 "wikitext_ragel.rl"
+#line 33 "wikitext_ragel.rl"
        {
         MARK();
     }
        break;
        case 1:
-#line 37 "wikitext_ragel.rl"
+#line 38 "wikitext_ragel.rl"
        {
         out->code_point = *p & 0x7f;
     }
@@ -455,62 +458,62 @@ _eof_trans:
        {te = p+1;}
        break;
        case 5:
-#line 81 "wikitext_ragel.rl"
+#line 82 "wikitext_ragel.rl"
        {act = 3;}
        break;
        case 6:
-#line 141 "wikitext_ragel.rl"
+#line 145 "wikitext_ragel.rl"
        {act = 11;}
        break;
        case 7:
-#line 150 "wikitext_ragel.rl"
+#line 154 "wikitext_ragel.rl"
        {act = 12;}
        break;
        case 8:
-#line 260 "wikitext_ragel.rl"
+#line 264 "wikitext_ragel.rl"
        {act = 19;}
        break;
        case 9:
-#line 361 "wikitext_ragel.rl"
+#line 365 "wikitext_ragel.rl"
        {act = 35;}
        break;
        case 10:
-#line 69 "wikitext_ragel.rl"
+#line 70 "wikitext_ragel.rl"
        {te = p+1;{
             EMIT(NO_WIKI_START);
             {p++; goto _out; }
         }}
        break;
        case 11:
-#line 75 "wikitext_ragel.rl"
+#line 76 "wikitext_ragel.rl"
        {te = p+1;{
             EMIT(NO_WIKI_END);
             {p++; goto _out; }
         }}
        break;
        case 12:
-#line 99 "wikitext_ragel.rl"
+#line 100 "wikitext_ragel.rl"
        {te = p+1;{
             EMIT(TT);
             {p++; goto _out; }
         }}
        break;
        case 13:
-#line 105 "wikitext_ragel.rl"
+#line 106 "wikitext_ragel.rl"
        {te = p+1;{
             EMIT(TT_START);
             {p++; goto _out; }
         }}
        break;
        case 14:
-#line 111 "wikitext_ragel.rl"
+#line 112 "wikitext_ragel.rl"
        {te = p+1;{
             EMIT(TT_END);
             {p++; goto _out; }
         }}
        break;
        case 15:
-#line 118 "wikitext_ragel.rl"
+#line 119 "wikitext_ragel.rl"
        {te = p+1;{
             if (out->column_start == 1 || last_token_type == BLOCKQUOTE)
                 EMIT(BLOCKQUOTE);
@@ -523,80 +526,70 @@ _eof_trans:
         }}
        break;
        case 16:
-#line 132 "wikitext_ragel.rl"
-       {te = p+1;{
-            if (out->column_start == 1)
-                EMIT(PRE);
-            else
-                EMIT(SPACE);
-            {p++; goto _out; }
-        }}
-       break;
-       case 17:
-#line 266 "wikitext_ragel.rl"
+#line 270 "wikitext_ragel.rl"
        {te = p+1;{
             EMIT(LINK_START);
             {p++; goto _out; }
         }}
        break;
-       case 18:
-#line 272 "wikitext_ragel.rl"
+       case 17:
+#line 276 "wikitext_ragel.rl"
        {te = p+1;{
             EMIT(LINK_END);
             {p++; goto _out; }
         }}
        break;
-       case 19:
-#line 278 "wikitext_ragel.rl"
+       case 18:
+#line 282 "wikitext_ragel.rl"
        {te = p+1;{
             EMIT(SEPARATOR);
             {p++; goto _out; }
         }}
        break;
-       case 20:
-#line 296 "wikitext_ragel.rl"
+       case 19:
+#line 300 "wikitext_ragel.rl"
        {te = p+1;{
             EMIT(QUOT_ENTITY);
             {p++; goto _out; }
         }}
        break;
-       case 21:
-#line 302 "wikitext_ragel.rl"
+       case 20:
+#line 306 "wikitext_ragel.rl"
        {te = p+1;{
             EMIT(AMP_ENTITY);
             {p++; goto _out; }
         }}
        break;
-       case 22:
-#line 308 "wikitext_ragel.rl"
+       case 21:
+#line 312 "wikitext_ragel.rl"
        {te = p+1;{
             EMIT(NAMED_ENTITY);
             {p++; goto _out; }
         }}
        break;
-       case 23:
-#line 314 "wikitext_ragel.rl"
+       case 22:
+#line 318 "wikitext_ragel.rl"
        {te = p+1;{
             EMIT(HEX_ENTITY);
             {p++; goto _out; }
         }}
        break;
-       case 24:
-#line 320 "wikitext_ragel.rl"
+       case 23:
+#line 324 "wikitext_ragel.rl"
        {te = p+1;{
             EMIT(DECIMAL_ENTITY);
             {p++; goto _out; }
         }}
        break;
-       case 25:
-#line 326 "wikitext_ragel.rl"
+       case 24:
+#line 330 "wikitext_ragel.rl"
        {te = p+1;{
             EMIT(QUOT);
             {p++; goto _out; }
         }}
        break;
-       case 26:
-#line 350 "wikitext_ragel.rl"
+       case 25:
+#line 354 "wikitext_ragel.rl"
        {te = p+1;{
             EMIT(CRLF);
             out->column_stop = 1;
@@ -604,30 +597,30 @@ _eof_trans:
             {p++; goto _out; }
         }}
        break;
-       case 27:
-#line 381 "wikitext_ragel.rl"
+       case 26:
+#line 385 "wikitext_ragel.rl"
        {te = p+1;{
             EMIT(DEFAULT);
             out->column_stop = out->column_start + 1;
             {p++; goto _out; }
         }}
        break;
-       case 28:
-#line 87 "wikitext_ragel.rl"
+       case 27:
+#line 88 "wikitext_ragel.rl"
        {te = p;p--;{
             EMIT(STRONG);
             {p++; goto _out; }
         }}
        break;
-       case 29:
-#line 93 "wikitext_ragel.rl"
+       case 28:
+#line 94 "wikitext_ragel.rl"
        {te = p;p--;{
             EMIT(EM);
             {p++; goto _out; }
         }}
        break;
-       case 30:
-#line 118 "wikitext_ragel.rl"
+       case 29:
+#line 119 "wikitext_ragel.rl"
        {te = p;p--;{
             if (out->column_start == 1 || last_token_type == BLOCKQUOTE)
                 EMIT(BLOCKQUOTE);
@@ -639,8 +632,21 @@ _eof_trans:
             {p++; goto _out; }
         }}
        break;
+       case 30:
+#line 133 "wikitext_ragel.rl"
+       {te = p;p--;{
+            if (out->column_start == 1)
+            {
+                REWIND();
+                EMIT(PRE);
+            }
+            else
+                EMIT(SPACE);
+            {p++; goto _out; }
+        }}
+       break;
        case 31:
-#line 159 "wikitext_ragel.rl"
+#line 163 "wikitext_ragel.rl"
        {te = p;p--;{
             if (out->column_start == 1 || last_token_type == BLOCKQUOTE)
             {
@@ -658,7 +664,7 @@ _eof_trans:
         }}
        break;
        case 32:
-#line 176 "wikitext_ragel.rl"
+#line 180 "wikitext_ragel.rl"
        {te = p;p--;{
             if (out->column_start == 1 || last_token_type == BLOCKQUOTE)
             {
@@ -675,7 +681,7 @@ _eof_trans:
         }}
        break;
        case 33:
-#line 192 "wikitext_ragel.rl"
+#line 196 "wikitext_ragel.rl"
        {te = p;p--;{
             if (out->column_start == 1 || last_token_type == BLOCKQUOTE)
             {
@@ -693,7 +699,7 @@ _eof_trans:
         }}
        break;
        case 34:
-#line 209 "wikitext_ragel.rl"
+#line 213 "wikitext_ragel.rl"
        {te = p;p--;{
             if (out->column_start == 1 || last_token_type == BLOCKQUOTE)
             {
@@ -711,7 +717,7 @@ _eof_trans:
         }}
        break;
        case 35:
-#line 226 "wikitext_ragel.rl"
+#line 230 "wikitext_ragel.rl"
        {te = p;p--;{
             if (out->column_start == 1 || last_token_type == BLOCKQUOTE)
             {
@@ -729,7 +735,7 @@ _eof_trans:
         }}
        break;
        case 36:
-#line 243 "wikitext_ragel.rl"
+#line 247 "wikitext_ragel.rl"
        {te = p;p--;{
             if (out->column_start == 1 || last_token_type == BLOCKQUOTE)
             {
@@ -747,42 +753,42 @@ _eof_trans:
         }}
        break;
        case 37:
-#line 260 "wikitext_ragel.rl"
+#line 264 "wikitext_ragel.rl"
        {te = p;p--;{
             EMIT(URI);
             {p++; goto _out; }
         }}
        break;
        case 38:
-#line 284 "wikitext_ragel.rl"
+#line 288 "wikitext_ragel.rl"
        {te = p;p--;{
             EMIT(EXT_LINK_START);
             {p++; goto _out; }
         }}
        break;
        case 39:
-#line 290 "wikitext_ragel.rl"
+#line 294 "wikitext_ragel.rl"
        {te = p;p--;{
             EMIT(EXT_LINK_END);
             {p++; goto _out; }
         }}
        break;
        case 40:
-#line 332 "wikitext_ragel.rl"
+#line 336 "wikitext_ragel.rl"
        {te = p;p--;{
             EMIT(AMP);
             {p++; goto _out; }
         }}
        break;
        case 41:
-#line 338 "wikitext_ragel.rl"
+#line 342 "wikitext_ragel.rl"
        {te = p;p--;{
             EMIT(LESS);
             {p++; goto _out; }
         }}
        break;
        case 42:
-#line 350 "wikitext_ragel.rl"
+#line 354 "wikitext_ragel.rl"
        {te = p;p--;{
             EMIT(CRLF);
             out->column_stop = 1;
@@ -791,28 +797,28 @@ _eof_trans:
         }}
        break;
        case 43:
-#line 361 "wikitext_ragel.rl"
+#line 365 "wikitext_ragel.rl"
        {te = p;p--;{
             EMIT(PRINTABLE);
             {p++; goto _out; }
         }}
        break;
        case 44:
-#line 260 "wikitext_ragel.rl"
+#line 264 "wikitext_ragel.rl"
        {{p = ((te))-1;}{
             EMIT(URI);
             {p++; goto _out; }
         }}
        break;
        case 45:
-#line 332 "wikitext_ragel.rl"
+#line 336 "wikitext_ragel.rl"
        {{p = ((te))-1;}{
             EMIT(AMP);
             {p++; goto _out; }
         }}
        break;
        case 46:
-#line 338 "wikitext_ragel.rl"
+#line 342 "wikitext_ragel.rl"
        {{p = ((te))-1;}{
             EMIT(LESS);
             {p++; goto _out; }
@@ -861,7 +867,7 @@ _eof_trans:
        }
        }
        break;
-#line 865 "wikitext_ragel.c"
+#line 871 "wikitext_ragel.c"
                }
        }
 
@@ -874,7 +880,7 @@ _again:
 #line 1 "wikitext_ragel.rl"
        {ts = 0;}
        break;
-#line 878 "wikitext_ragel.c"
+#line 884 "wikitext_ragel.c"
                }
        }
 
@@ -893,9 +899,9 @@ _again:
 
        _out: {}
        }
-#line 433 "wikitext_ragel.rl"
+#line 437 "wikitext_ragel.rl"
     if (cs == wikitext_error)
-        rb_raise(eWikitextError, "failed before finding a token");
+        rb_raise(eWikitextParserError, "failed before finding a token");
     else if (out->type == NO_TOKEN)
-        rb_raise(eWikitextError, "failed to produce a token");
+        rb_raise(eWikitextParserError, "failed to produce a token");
 }
index c30796450957734e23d0fa765bf4dd6b30ff62f9..8a6cca427478b3e8884c3709e03d0f2d75468f1e 100644 (file)
@@ -12,6 +12,6 @@
 // You should have received a copy of the GNU General Public License
 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
-#include "wikitext.h"
+#include "token.h"
 
 void next_token(token_t *out, token_t *last_token, char *p, char *pe);
index 5ab1e2787129acac1d8706fcd53047a4fffd242f..9db08b69bc6460897b81a7c66f547dc6d094855a 100644 (file)
@@ -19,6 +19,7 @@
 //----------------------------------------------------------------------//
 
 #include "wikitext_ragel.h"
+#include "wikitext.h"
 #include <stdio.h>
 
 #define EMIT(t)     do { out->type = t; out->stop = p + 1; out->column_stop += (out->stop - out->start); } while (0)
 
         # shorthand for <pre> and </pre>
         # consider adding real <pre> and </pre> HTML tags later on
-        ' '
+        ' ' @mark ' '*
         {
             if (out->column_start == 1)
+            {
+                REWIND();
                 EMIT(PRE);
+            }
             else
                 EMIT(SPACE);
             fbreak;
@@ -431,7 +435,7 @@ void next_token(token_t *out, token_t *last_token, char *p, char *pe)
     %% write init;
     %% write exec;
     if (cs == wikitext_error)
-        rb_raise(eWikitextError, "failed before finding a token");
+        rb_raise(eWikitextParserError, "failed before finding a token");
     else if (out->type == NO_TOKEN)
-        rb_raise(eWikitextError, "failed to produce a token");
+        rb_raise(eWikitextParserError, "failed to produce a token");
 }
index 17e817de892b3fa76f2588067013bbcae08f8367..f86ac0bf7fd1865227a1d5258aedc84d43312559 100755 (executable)
@@ -26,6 +26,16 @@ describe Wikitext::Parser, 'external links' do
     @parser.parse('[http://google.com/ Google]').should == expected
   end
 
+  it 'should treat runs of spaces after the link target as a single space' do
+    expected = %Q{<p><a href="http://google.com/" class="external">Google</a></p>\n}
+    @parser.parse('[http://google.com/                  Google]').should == expected
+  end
+
+  it 'should not treat runs of spaces within the link text as a single space' do
+    expected = %Q{<p><a href="http://google.com/" class="external">Google    search</a></p>\n}
+    @parser.parse('[http://google.com/ Google    search]').should == expected
+  end
+
   it 'should format a link with emphasis in the link text' do
     expected = %Q{<p><a href="http://google.com/" class="external">Google <em>rocks</em></a></p>\n}
     @parser.parse("[http://google.com/ Google ''rocks'']").should == expected
index 3ec5fe2b9b664b24f489b1f6e910c7d701fc2efb..569e39b923358e4157ec558778a16a264fd97286 100755 (executable)
@@ -76,6 +76,14 @@ describe Wikitext::Parser, 'internal links' do
       @parser.parse('[[foo|bar]]').should == %Q{<p><a href="/wiki/foo">bar</a></p>\n}
     end
 
+    it 'should treat a separator inside the link text as part of the link text' do
+      @parser.parse('[[foo|bar|baz]]').should == %Q{<p><a href="/wiki/foo">bar|baz</a></p>\n}
+    end
+
+    it 'should treat separators outside of links as normal text' do
+      @parser.parse('foo|bar').should == %Q{<p>foo|bar</p>\n}
+    end
+
     it 'should allow em markup in the custom link text' do
       expected = %Q{<p><a href="/wiki/foo">bar <em>baz</em></a></p>\n}
       @parser.parse("[[foo|bar ''baz'']]").should == expected