git.wincent.com - wikitext.git/commitdiff
Add "fulltext_tokenize" method to parser
author: Wincent Colaiuta <win@wincent.com>
Thu, 24 Apr 2008 00:03:39 +0000 (02:03 +0200)
committer: Wincent Colaiuta <win@wincent.com>
Thu, 24 Apr 2008 00:03:39 +0000 (02:03 +0200)
This is a thin C wrapper that makes use of the Ragel-generated scanner
to provide a Ruby-accessible array of "interesting" tokens for the
purposes of full-text search indexing, as mentioned in 51c8e75.

Signed-off-by: Wincent Colaiuta <win@wincent.com>
ext/parser.c
ext/parser.h
ext/wikitext.c

index c4dca96ae42e5a42d6829ad4f6e5b936769cd630..22bee4badeaba8eb134776e199401ffe68f35df0 100644 (file)
@@ -158,6 +158,36 @@ VALUE Wikitext_parser_benchmarking_tokenize(VALUE self, VALUE string)
     return Qnil;
 }
 
+VALUE Wikitext_parser_fulltext_tokenize(VALUE self, VALUE string)
+{
+    if (NIL_P(string))
+        return Qnil;
+    string = StringValue(string);
+    VALUE tokens = rb_ary_new();
+    char *p = RSTRING_PTR(string);
+    long len = RSTRING_LEN(string);
+    char *pe = p + len;
+    token_t token;
+    token_t *_token = &token;
+    next_token(&token, NULL, p, pe);
+    while (token.type != END_OF_FILE)
+    {
+        switch (token.type)
+        {
+            case URI:
+            case MAIL:
+            case ALNUM:
+                rb_ary_push(tokens, TOKEN_TEXT(_token));
+                break;
+            default:
+                // ignore everything else
+                break;
+        }
+        next_token(&token, &token, NULL, pe);
+    }
+    return tokens;
+}
+
 // we downcase "in place", overwriting the original contents of the buffer and returning the same string
 VALUE _Wikitext_downcase(VALUE string)
 {
index 2b9524c550038b3fcedeae1320243a34a9abba88..969465d67758b2aed7c61bed38f2f31c5488545f 100644 (file)
@@ -20,6 +20,8 @@ VALUE Wikitext_parser_tokenize(VALUE self, VALUE string);
 
 VALUE Wikitext_parser_benchmarking_tokenize(VALUE self, VALUE string);
 
+VALUE Wikitext_parser_fulltext_tokenize(VALUE self, VALUE string);
+
 VALUE Wikitext_parser_sanitize_link_target(VALUE self, VALUE string);
 
 VALUE Wikitext_parser_encode_link_target(VALUE self, VALUE in);
index 65790a0200b597a7fa6732267fdadf7ba1ed9719..9a5126707b59574ed3d78d0d88f283fb625b10a5 100644 (file)
@@ -32,6 +32,7 @@ void Init_wikitext()
     rb_define_method(cWikitextParser, "profiling_parse", Wikitext_parser_profiling_parse, 1);
     rb_define_method(cWikitextParser, "tokenize", Wikitext_parser_tokenize, 1);
     rb_define_method(cWikitextParser, "benchmarking_tokenize", Wikitext_parser_benchmarking_tokenize, 1);
+    rb_define_method(cWikitextParser, "fulltext_tokenize", Wikitext_parser_fulltext_tokenize, 1);
     rb_define_singleton_method(cWikitextParser, "sanitize_link_target", Wikitext_parser_sanitize_link_target, 1);
     rb_define_singleton_method(cWikitextParser, "encode_link_target", Wikitext_parser_encode_link_target, 1);
     rb_define_singleton_method(cWikitextParser, "encode_special_link_target", Wikitext_parser_encode_special_link_target, 1);