1 // Copyright 2008-2009 Wincent Colaiuta. All rights reserved.
3 // Redistribution and use in source and binary forms, with or without
4 // modification, are permitted provided that the following conditions are met:
6 // 1. Redistributions of source code must retain the above copyright notice,
7 // this list of conditions and the following disclaimer.
8 // 2. Redistributions in binary form must reproduce the above copyright notice,
9 // this list of conditions and the following disclaimer in the documentation
10 // and/or other materials provided with the distribution.
12 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
13 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
14 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
15 // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
16 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
17 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
18 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
19 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
20 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
21 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
22 // POSSIBILITY OF SUCH DAMAGE.
24 //----------------------------------------------------------------------//
25 // NOTE: wikitext_ragel.c is generated from wikitext_ragel.rl, so //
26 // if you make changes to the former they will be overwritten. //
27 // You should perform all your edits in wikitext_ragel.rl. //
28 //----------------------------------------------------------------------//
30 #include "wikitext_ragel.h"
// Helper macros for the scanner actions. They are not self-contained: each one
// expands to code that uses names from the enclosing scanner scope -- `out` (the
// token being filled in), `p` (current data pointer), `pe` (data end pointer),
// `ts` (token start) and `mark` (saved position for manual backtracking).

// EMIT(t): finish the current token. Stores `t` in out->type, sets out->stop to
// one past the current byte (p + 1) and advances out->column_stop by the
// distance between out->start and the new out->stop.
34 #define EMIT(t) do { out->type = t; out->stop = p + 1; out->column_stop += (out->stop - out->start); } while (0)
// MARK(): remember the current position so that REWIND() can return to it.
35 #define MARK() do { mark = p; } while (0)
// REWIND(): move the data pointer back to the position saved by MARK().
36 #define REWIND() do { p = mark; } while (0)
// AT_END(): true when the byte at `p` is the last one before `pe`.
37 #define AT_END() (p + 1 == pe)
// DISTANCE(): number of bytes from `ts` up to and including the byte at `p`.
38 #define DISTANCE() (p + 1 - ts)
// NEXT_CHAR(): the byte immediately after `p`. Performs no bounds check, so
// callers are expected to test AT_END() before using it.
39 #define NEXT_CHAR() (*(p + 1))
49 action non_printable_ascii
51 out->code_point = *p & 0x7f;
54 action two_byte_utf8_sequence
56 out->code_point = ((uint32_t)(*(p - 1)) & 0x1f) << 6 |
60 action three_byte_utf8_sequence
62 out->code_point = ((uint32_t)(*(p - 2)) & 0x0f) << 12 |
63 ((uint32_t)(*(p - 1)) & 0x3f) << 6 |
67 action four_byte_utf8_sequence
69 out->code_point = ((uint32_t)(*(p - 3)) & 0x07) << 18 |
70 ((uint32_t)(*(p - 2)) & 0x3f) << 12 |
71 ((uint32_t)(*(p - 1)) & 0x3f) << 6 |
75 # simple approximation for matching email addresses; not quite RFC 2822!
76 user = (alnum | [_\.] | '-')+ ;
78 domain = (alnum+ '.')+ tld ;
79 mail = user '@' domain ;
81 uri_chars = (alnum | [@$&'(\*\+=%_~/#] | '-')+ ;
82 special_uri_chars = ([:!\(\),;\.\?])+ ;
83 uri = ('mailto:'i mail) |
84 (('http'i [sS]? '://' | 'ftp://'i | 'svn://'i) uri_chars (special_uri_chars uri_chars)*) ;
85 path = '/' ([a-zA-Z0-9_\-.]+ '/'?)* ;
115 EMIT(BLOCKQUOTE_START);
121 EMIT(BLOCKQUOTE_END);
129 else if (DISTANCE() == 4)
134 else if (DISTANCE() == 3)
136 else if (DISTANCE() == 2)
185 # shorthand for <blockquote> and </blockquote>
188 if (out->column_start == 1 || last_token_type == BLOCKQUOTE)
198 # shorthand for <pre> and </pre>
201 if (out->column_start == 1 || last_token_type == BLOCKQUOTE)
213 if (out->column_start == 1 ||
214 last_token_type == OL ||
215 last_token_type == UL ||
216 last_token_type == BLOCKQUOTE ||
217 last_token_type == BLOCKQUOTE_START)
226 if (out->column_start == 1 ||
227 last_token_type == OL ||
228 last_token_type == UL ||
229 last_token_type == BLOCKQUOTE ||
230 last_token_type == BLOCKQUOTE_START)
239 if (out->column_start == 1 || last_token_type == BLOCKQUOTE || last_token_type == BLOCKQUOTE_START)
244 else if (DISTANCE() == 2)
246 else if (DISTANCE() == 3)
248 else if (DISTANCE() == 4)
250 else if (DISTANCE() == 5)
252 else if (DISTANCE() == 6)
254 else if (DISTANCE() > 6)
260 else if (AT_END() || NEXT_CHAR() == '\n' || NEXT_CHAR() == '\r')
265 else if (DISTANCE() == 2)
267 else if (DISTANCE() == 3)
269 else if (DISTANCE() == 4)
271 else if (DISTANCE() == 5)
273 else if (DISTANCE() == 6)
275 else if (DISTANCE() > 6)
277 p -= 6; // will scan the H6 on the next scan
283 // note that a H*_END token will never match before a BLOCKQUOTE_END
328 EMIT(EXT_LINK_START);
350 '&' alpha+ digit* ';'
356 '&#' [xX] xdigit+ ';'
364 EMIT(DECIMAL_ENTITY);
419 out->column_stop = 1;
424 # must tokenize these separately from the other PRINTABLE characters otherwise a string like:
425 # See http://example.com/.
426 # will get greedily tokenized as PRINTABLE, SPACE, PRINTABLE rather than PRINTABLE, SPACE, URI, SPECIAL_URI_CHARS
427 # this also applies to MAIL tokenization and input strings like:
428 # Email me (user@example.com) for more info.
431 EMIT(SPECIAL_URI_CHARS);
441 # all the printable ASCII characters (0x20 to 0x7e) excluding those explicitly covered elsewhere:
442 # we skip space (0x20), exclamation mark (0x21), quote (0x22), hash (0x23), ampersand (0x26), apostrophe (0x27),
443 # left parenthesis (0x28), right parenthesis (0x29), numbers (0x30..0x39), asterisk (0x2a), comma (0x2c), period (0x2e),
444 # colon (0x3a), semi-colon (0x3b), less than (0x3c), equals (0x3d), greater than (0x3e), question mark (0x3f), uppercase
445 # letters (0x41..0x5a), left bracket (0x5b), right bracket (0x5d), backtick (0x60), lowercase letters (0x61..0x7a), left
446 # curly brace (0x7b), vertical bar (0x7c) and right curly brace (0x7d).
447 (0x24..0x25 | 0x2b | 0x2d | 0x2f | 0x40 | 0x5c | 0x5e..0x5f | 0x7e)+
453 # here is where we handle the UTF-8 and everything else
455 # one_byte_sequence = byte begins with zero;
456 # two_byte_sequence = first byte begins with 110 (0xc0..0xdf), next with 10 (0x80..0xbf);
457 # three_byte_sequence = first byte begins with 1110 (0xe0..0xef), next two with 10 (0x80..0xbf);
458 # four_byte_sequence = first byte begins with 11110 (0xf0..0xf7), next three with 10 (0x80..0xbf);
460 # within the ranges specified, we also exclude these illegal sequences:
461 # 1100000x (c0 c1) overlong encoding, lead byte of 2 byte seq but code point <= 127
462 # 11110101 (f5) restricted by RFC 3629 lead byte of 4-byte sequence for codepoint above 10ffff
463 # 1111011x (f6, f7) restricted by RFC 3629 lead byte of 4-byte sequence for codepoint above 10ffff
464 (0x01..0x1f | 0x7f) @non_printable_ascii |
465 (0xc2..0xdf 0x80..0xbf) @two_byte_utf8_sequence |
466 (0xe0..0xef 0x80..0xbf 0x80..0xbf) @three_byte_utf8_sequence |
467 (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf) @four_byte_utf8_sequence
470 out->column_stop = out->column_start + 1;
479 // for now we use the scanner as a tokenizer that returns one token at a time, just like ANTLR
480 // ultimately we could look at embedding all of the transformation inside the scanner itself (combined scanner/parser)
481 // pass in the last token because that's useful for the scanner to know
482 // p data pointer (required by Ragel machine); overridden with contents of last_token if supplied
483 // pe data end pointer (required by Ragel machine)
484 void next_token(token_t *out, token_t *last_token, char *p, char *pe)
486 int last_token_type = NO_TOKEN;
489 last_token_type = last_token->type;
490 p = last_token->stop;
491 out->line_start = out->line_stop = last_token->line_stop;
492 out->column_start = out->column_stop = last_token->column_stop;
497 out->column_start = 1;
499 out->column_stop = 1;
501 out->type = NO_TOKEN;
506 // all done, have reached end of input
508 out->type = END_OF_FILE;
512 char *mark; // for manual backtracking
513 char *eof = pe; // required for backtracking (longest match determination)
514 int cs; // current state (standard Ragel)
515 char *ts; // token start (scanner)
516 char *te; // token end (scanner)
517 int act; // identity of last patterned matched (scanner)
520 if (cs == wikitext_error)
521 rb_raise(eWikitextParserError, "failed before finding a token");
522 else if (out->type == NO_TOKEN)
523 rb_raise(eWikitextParserError, "failed to produce a token");