1 // Copyright 2008-2009 Wincent Colaiuta. All rights reserved.
3 // Redistribution and use in source and binary forms, with or without
4 // modification, are permitted provided that the following conditions are met:
6 // 1. Redistributions of source code must retain the above copyright notice,
7 // this list of conditions and the following disclaimer.
8 // 2. Redistributions in binary form must reproduce the above copyright notice,
9 // this list of conditions and the following disclaimer in the documentation
10 // and/or other materials provided with the distribution.
12 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
13 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
14 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
15 // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
16 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
17 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
18 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
19 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
20 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
21 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
22 // POSSIBILITY OF SUCH DAMAGE.
24 //----------------------------------------------------------------------//
25 // NOTE: wikitext_ragel.c is generated from wikitext_ragel.rl, so //
26 // if you make changes to the former they will be overwritten. //
27 // You should perform all your edits in wikitext_ragel.rl. //
28 //----------------------------------------------------------------------//
30 #include "wikitext_ragel.h"
// Scanner helper macros. They are not self-contained: each one refers to names
// (`out`, `p`, `pe`, `ts`, `mark`) that must be in scope wherever it is used.

// EMIT(t): finish the current token. Stores t in out->type, sets out->stop to
// one past the current character (p + 1), and advances out->column_stop by the
// distance between out->start and out->stop.
// NOTE(review): out->start is not assigned in the visible code; presumably it
// is set to the token start before any EMIT -- confirm.
34 #define EMIT(t) do { out->type = t; out->stop = p + 1; out->column_stop += (out->stop - out->start); } while (0)
// MARK(): remember the current position so that REWIND() can return to it.
35 #define MARK() do { mark = p; } while (0)
// REWIND(): move the current position back to the one saved by MARK().
36 #define REWIND() do { p = mark; } while (0)
// AT_END(): true when the current character is the last one before pe.
37 #define AT_END() (p + 1 == pe)
// DISTANCE(): number of characters from ts up to and including p.
38 #define DISTANCE() (p + 1 - ts)
// NEXT_CHAR(): the character after p. Performs no bounds check of its own, so
// test AT_END() first when p may be the last character.
39 #define NEXT_CHAR() (*(p + 1))
49 action non_printable_ascii
51 out->code_point = *p & 0x7f;
54 action two_byte_utf8_sequence
56 out->code_point = ((uint32_t)(*(p - 1)) & 0x1f) << 6 |
60 action three_byte_utf8_sequence
62 out->code_point = ((uint32_t)(*(p - 2)) & 0x0f) << 12 |
63 ((uint32_t)(*(p - 1)) & 0x3f) << 6 |
67 action four_byte_utf8_sequence
69 out->code_point = ((uint32_t)(*(p - 3)) & 0x07) << 18 |
70 ((uint32_t)(*(p - 2)) & 0x3f) << 12 |
71 ((uint32_t)(*(p - 1)) & 0x3f) << 6 |
75 # simple approximation for matching email addresses; not quite RFC 2822!
76 user = (alnum | [_\.] | '-')+ ;
78 domain = (alnum+ '.')+ tld ;
79 mail = user '@' domain ;
81 uri_chars = (alnum | [@$&'(\*\+=%_~/#] | '-')+ ;
82 special_uri_chars = ([:!\(\),;\.\?])+ ;
83 uri = ('mailto:'i mail) |
84 (('http'i [sS]? '://' | 'ftp://'i | 'svn://'i) uri_chars (special_uri_chars uri_chars)*) ;
85 path = '/' ([a-zA-Z0-9_\-.]+ '/'?)* ;
107 '<pre lang="' alpha+ '">'
121 EMIT(BLOCKQUOTE_START);
127 EMIT(BLOCKQUOTE_END);
135 else if (DISTANCE() == 4)
140 else if (DISTANCE() == 3)
142 else if (DISTANCE() == 2)
191 # shorthand for <blockquote> and </blockquote>
194 if (out->column_start == 1 || last_token_type == BLOCKQUOTE)
204 # shorthand for <pre> and </pre>
207 if (out->column_start == 1 || last_token_type == BLOCKQUOTE)
219 if (out->column_start == 1 ||
220 last_token_type == OL ||
221 last_token_type == UL ||
222 last_token_type == BLOCKQUOTE ||
223 last_token_type == BLOCKQUOTE_START)
232 if (out->column_start == 1 ||
233 last_token_type == OL ||
234 last_token_type == UL ||
235 last_token_type == BLOCKQUOTE ||
236 last_token_type == BLOCKQUOTE_START)
245 if (out->column_start == 1 || last_token_type == BLOCKQUOTE || last_token_type == BLOCKQUOTE_START)
250 else if (DISTANCE() == 2)
252 else if (DISTANCE() == 3)
254 else if (DISTANCE() == 4)
256 else if (DISTANCE() == 5)
258 else if (DISTANCE() == 6)
260 else if (DISTANCE() > 6)
266 else if (AT_END() || NEXT_CHAR() == '\n' || NEXT_CHAR() == '\r')
271 else if (DISTANCE() == 2)
273 else if (DISTANCE() == 3)
275 else if (DISTANCE() == 4)
277 else if (DISTANCE() == 5)
279 else if (DISTANCE() == 6)
281 else if (DISTANCE() > 6)
283 p -= 6; // will scan the H6 on the next scan
289 // note that a H*_END token will never match before a BLOCKQUOTE_END
334 EMIT(EXT_LINK_START);
356 '&' alpha+ digit* ';'
362 '&#' [xX] xdigit+ ';'
370 EMIT(DECIMAL_ENTITY);
425 out->column_stop = 1;
430 # must tokenize these separately from the other PRINTABLE characters otherwise a string like:
431 # See http://example.com/.
432 # will get greedily tokenized as PRINTABLE, SPACE, PRINTABLE rather than PRINTABLE, SPACE, URI, SPECIAL_URI_CHARS
433 # this also applies to MAIL tokenization and input strings like:
434 # Email me (user@example.com) for more info.
437 EMIT(SPECIAL_URI_CHARS);
447 # all the printable ASCII characters (0x20 to 0x7e) excluding those explicitly covered elsewhere:
448 # we skip space (0x20), exclamation mark (0x21), quote (0x22), hash (0x23), ampersand (0x26), apostrophe (0x27),
449 # left parenthesis (0x28), right parenthesis (0x29), numbers (0x30..0x39), asterisk (0x2a), comma (0x2c), period (0x2e),
450 # colon (0x3a), semi-colon (0x3b), less than (0x3c), equals (0x3d), greater than (0x3e), question mark (0x3f), uppercase
451 # letters (0x41..0x5a), left bracket (0x5b), right bracket (0x5d), backtick (0x60), lowercase letters (0x61..0x7a), left
452 # curly brace (0x7b), vertical bar (0x7c) and right curly brace (0x7d).
453 (0x24..0x25 | 0x2b | 0x2d | 0x2f | 0x40 | 0x5c | 0x5e..0x5f | 0x7e)+
459 # here is where we handle the UTF-8 and everything else
461 # one_byte_sequence = byte begins with zero;
462 # two_byte_sequence = first byte begins with 110 (0xc0..0xdf), next with 10 (0x80..0xbf);
463 # three_byte_sequence = first byte begins with 1110 (0xe0..0xef), next two with 10 (0x80..0xbf);
464 # four_byte_sequence = first byte begins with 11110 (0xf0..0xf7), next three with 10 (0x80..0xbf);
466 # within the ranges specified, we also exclude these illegal sequences:
467 # 1100000x (c0 c1) overlong encoding, lead byte of 2 byte seq but code point <= 127
468 # 11110101 (f5) restricted by RFC 3629 lead byte of 4-byte sequence for codepoint above 10ffff
469 # 1111011x (f6, f7) restricted by RFC 3629 lead byte of 4-byte sequence for codepoint above 10ffff
470 (0x01..0x1f | 0x7f) @non_printable_ascii |
471 (0xc2..0xdf 0x80..0xbf) @two_byte_utf8_sequence |
472 (0xe0..0xef 0x80..0xbf 0x80..0xbf) @three_byte_utf8_sequence |
473 (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf) @four_byte_utf8_sequence
476 out->column_stop = out->column_start + 1;
485 // for now we use the scanner as a tokenizer that returns one token at a time, just like ANTLR
486 // ultimately we could look at embedding all of the transformation inside the scanner itself (combined scanner/parser)
487 // pass in the last token because that's useful for the scanner to know
488 // p data pointer (required by Ragel machine); overridden with contents of last_token if supplied
489 // pe data end pointer (required by Ragel machine)
490 void next_token(token_t *out, token_t *last_token, char *p, char *pe)
492 int last_token_type = NO_TOKEN;
495 last_token_type = last_token->type;
496 p = last_token->stop;
497 out->line_start = out->line_stop = last_token->line_stop;
498 out->column_start = out->column_stop = last_token->column_stop;
503 out->column_start = 1;
505 out->column_stop = 1;
507 out->type = NO_TOKEN;
512 // all done, have reached end of input
514 out->type = END_OF_FILE;
518 char *mark; // for manual backtracking
519 char *eof = pe; // required for backtracking (longest match determination)
520 int cs; // current state (standard Ragel)
521 char *ts; // token start (scanner)
522 char *te; // token end (scanner)
523 int act; // identity of last pattern matched (scanner)
526 if (cs == wikitext_error)
527 rb_raise(eWikitextParserError, "failed before finding a token");
528 else if (out->type == NO_TOKEN)
529 rb_raise(eWikitextParserError, "failed to produce a token");