1 // Copyright 2008-2009 Wincent Colaiuta
2 // This program is free software: you can redistribute it and/or modify
3 // it under the terms of the GNU General Public License as published by
4 // the Free Software Foundation, either version 3 of the License, or
5 // (at your option) any later version.
7 // This program is distributed in the hope that it will be useful,
8 // but WITHOUT ANY WARRANTY; without even the implied warranty of
9 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 // GNU General Public License for more details.
12 // You should have received a copy of the GNU General Public License
13 // along with this program. If not, see <http://www.gnu.org/licenses/>.
15 //----------------------------------------------------------------------//
16 // NOTE: wikitext_ragel.c is generated from wikitext_ragel.rl, so //
17 // if you make changes to the former they will be overwritten. //
18 // You should perform all your edits in wikitext_ragel.rl. //
19 //----------------------------------------------------------------------//
21 #include "wikitext_ragel.h"
// Scanner helper macros. They expand in place and rely on names taken from the scope in which
// they are used: "out" (the token being filled in), "p" and "pe" (current and end data
// pointers), "ts" (token start) and "mark" (manual backtracking position).

// Finish the current token: store its type, set its stop pointer to one past "p", and advance
// column_stop by the distance between the token's start and stop pointers.
25 #define EMIT(t) do { out->type = t; out->stop = p + 1; out->column_stop += (out->stop - out->start); } while (0)
// Remember the current position so that REWIND() can return to it later.
26 #define MARK() do { mark = p; } while (0)
// Move the data pointer back to the position most recently saved by MARK().
27 #define REWIND() do { p = mark; } while (0)
// True when "p" points at the last byte of input (the byte immediately before "pe").
28 #define AT_END() (p + 1 == pe)
// Number of bytes from the token start "ts" up to and including the byte at "p".
29 #define DISTANCE() (p + 1 - ts)
// The byte following "p". No bounds check is performed here; the visible call site tests
// AT_END() first.
30 #define NEXT_CHAR() (*(p + 1))
40 action non_printable_ascii
42 out->code_point = *p & 0x7f;
45 action two_byte_utf8_sequence
47 out->code_point = ((uint32_t)(*(p - 1)) & 0x1f) << 6 |
51 action three_byte_utf8_sequence
53 out->code_point = ((uint32_t)(*(p - 2)) & 0x0f) << 12 |
54 ((uint32_t)(*(p - 1)) & 0x3f) << 6 |
58 action four_byte_utf8_sequence
60 out->code_point = ((uint32_t)(*(p - 3)) & 0x07) << 18 |
61 ((uint32_t)(*(p - 2)) & 0x3f) << 12 |
62 ((uint32_t)(*(p - 1)) & 0x3f) << 6 |
66 # simple approximation for matching email addresses; not quite RFC 2822!
67 user = (alnum | [_\.] | '-')+ ;
69 domain = (alnum+ '.')+ tld ;
70 mail = user '@' domain ;
72 uri_chars = (alnum | [@$&'(\*\+=%_~/#] | '-')+ ;
73 special_uri_chars = ([:!\(\),;\.\?])+ ;
74 uri = ('mailto:'i mail) |
75 (('http'i [sS]? '://' | 'ftp://'i | 'svn://'i) uri_chars (special_uri_chars uri_chars)*) ;
76 path = '/' ([a-zA-Z0-9_\-.]+ '/'?)* ;
106 EMIT(BLOCKQUOTE_START);
112 EMIT(BLOCKQUOTE_END);
120 else if (DISTANCE() == 4)
125 else if (DISTANCE() == 3)
127 else if (DISTANCE() == 2)
176 # shorthand for <blockquote> and </blockquote>
179 if (out->column_start == 1 || last_token_type == BLOCKQUOTE)
189 # shorthand for <pre> and </pre>
192 if (out->column_start == 1 || last_token_type == BLOCKQUOTE)
204 if (out->column_start == 1 ||
205 last_token_type == OL ||
206 last_token_type == UL ||
207 last_token_type == BLOCKQUOTE ||
208 last_token_type == BLOCKQUOTE_START)
217 if (out->column_start == 1 ||
218 last_token_type == OL ||
219 last_token_type == UL ||
220 last_token_type == BLOCKQUOTE ||
221 last_token_type == BLOCKQUOTE_START)
230 if (out->column_start == 1 || last_token_type == BLOCKQUOTE || last_token_type == BLOCKQUOTE_START)
235 else if (DISTANCE() == 2)
237 else if (DISTANCE() == 3)
239 else if (DISTANCE() == 4)
241 else if (DISTANCE() == 5)
243 else if (DISTANCE() == 6)
245 else if (DISTANCE() > 6)
251 else if (AT_END() || NEXT_CHAR() == '\n' || NEXT_CHAR() == '\r')
256 else if (DISTANCE() == 2)
258 else if (DISTANCE() == 3)
260 else if (DISTANCE() == 4)
262 else if (DISTANCE() == 5)
264 else if (DISTANCE() == 6)
266 else if (DISTANCE() > 6)
268 p -= 6; // will scan the H6 on the next scan
274 // note that a H*_END token will never match before a BLOCKQUOTE_END
319 EMIT(EXT_LINK_START);
341 '&' alpha+ digit* ';'
347 '&#' [xX] xdigit+ ';'
355 EMIT(DECIMAL_ENTITY);
410 out->column_stop = 1;
415 # must tokenize these separately from the other PRINTABLE characters otherwise a string like:
416 # See http://example.com/.
417 # will get greedily tokenized as PRINTABLE, SPACE, PRINTABLE rather than PRINTABLE, SPACE, URI, SPECIAL_URI_CHARS
418 # this also applies to MAIL tokenization and input strings like:
419 # Email me (user@example.com) for more info.
422 EMIT(SPECIAL_URI_CHARS);
432 # all the printable ASCII characters (0x20 to 0x7e) excluding those explicitly covered elsewhere:
433 # we skip space (0x20), exclamation mark (0x21), quote (0x22), hash (0x23), ampersand (0x26), apostrophe (0x27),
434 # left parenthesis (0x28), right parenthesis (0x29), numbers (0x30..0x39), asterisk (0x2a), comma (0x2c), period (0x2e),
435 # colon (0x3a), semi-colon (0x3b), less than (0x3c), equals (0x3d), greater than (0x3e), question mark (0x3f), uppercase
436 # letters (0x41..0x5a), left bracket (0x5b), right bracket (0x5d), backtick (0x60), lowercase letters (0x61..0x7a), left
437 # curly brace (0x7b), vertical bar (0x7c) and right curly brace (0x7d).
438 (0x24..0x25 | 0x2b | 0x2d | 0x2f | 0x40 | 0x5c | 0x5e..0x5f | 0x7e)+
444 # here is where we handle the UTF-8 and everything else
446 # one_byte_sequence = byte begins with zero;
447 # two_byte_sequence = first byte begins with 110 (0xc0..0xdf), next with 10 (0x80..0xbf);
448 # three_byte_sequence = first byte begins with 1110 (0xe0..0xef), next two with 10 (0x80..0xbf);
449 # four_byte_sequence = first byte begins with 11110 (0xf0..0xf7), next three with 10 (0x80..0xbf);
451 # within the ranges specified, we also exclude these illegal sequences:
452 # 1100000x (c0 c1) overlong encoding, lead byte of 2 byte seq but code point <= 127
453 # 11110101 (f5) restricted by RFC 3629 lead byte of 4-byte sequence for codepoint above 10ffff
454 # 1111011x (f6, f7) restricted by RFC 3629 lead byte of 4-byte sequence for codepoint above 10ffff
455 (0x01..0x1f | 0x7f) @non_printable_ascii |
456 (0xc2..0xdf 0x80..0xbf) @two_byte_utf8_sequence |
457 (0xe0..0xef 0x80..0xbf 0x80..0xbf) @three_byte_utf8_sequence |
458 (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf) @four_byte_utf8_sequence
461 out->column_stop = out->column_start + 1;
470 // for now we use the scanner as a tokenizer that returns one token at a time, just like ANTLR
471 // ultimately we could look at embedding all of the transformation inside the scanner itself (combined scanner/parser)
472 // pass in the last token because that's useful for the scanner to know
473 // p data pointer (required by Ragel machine); overridden with contents of last_token if supplied
474 // pe data end pointer (required by Ragel machine)
// Scans one token into "out"; on failure raises eWikitextParserError through rb_raise().
// NOTE(review): the braces and conditionals that guard the statements below are not visible
// in this excerpt, so the per-statement comments describe only what each visible line does.
475 void next_token(token_t *out, token_t *last_token, char *p, char *pe)
477 int last_token_type = NO_TOKEN;
// carry over context from the previous token: its type, where it stopped, and its final
// line/column position (presumably only when last_token was supplied -- confirm)
480 last_token_type = last_token->type;
481 p = last_token->stop;
482 out->line_start = out->line_stop = last_token->line_stop;
483 out->column_start = out->column_stop = last_token->column_stop;
// otherwise the column counters start at 1
488 out->column_start = 1;
490 out->column_stop = 1;
// NO_TOKEN doubles as the "nothing emitted" marker that is tested after the scan
492 out->type = NO_TOKEN;
497 // all done, have reached end of input
499 out->type = END_OF_FILE;
503 char *mark; // for manual backtracking
504 char *eof = pe; // required for backtracking (longest match determination)
505 int cs; // current state (standard Ragel)
506 char *ts; // token start (scanner)
507 char *te; // token end (scanner)
508 int act; // identity of last pattern matched (scanner)
// two distinct failures: the machine ended in its error state, or it finished without any
// action assigning a token type
511 if (cs == wikitext_error)
512 rb_raise(eWikitextParserError, "failed before finding a token");
513 else if (out->type == NO_TOKEN)
514 rb_raise(eWikitextParserError, "failed to produce a token");