1 # Copyright 2007-2010 Wincent Colaiuta. All rights reserved.
2 # Redistribution and use in source and binary forms, with or without
3 # modification, are permitted provided that the following conditions are met:
5 # 1. Redistributions of source code must retain the above copyright notice,
6 # this list of conditions and the following disclaimer.
7 # 2. Redistributions in binary form must reproduce the above copyright notice,
8 # this list of conditions and the following disclaimer in the documentation
9 # and/or other materials provided with the distribution.
11 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
12 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
13 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
14 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
15 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
16 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
17 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
18 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
19 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
20 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
21 # POSSIBILITY OF SUCH DAMAGE.
23 require File.expand_path('../spec_helper', File.dirname(__FILE__))
# Specs for Walrat::RegexpParslet used on its own: construction, successful
# and failed parses, equality comparison, and the line/column information
# attached to parse results and to parse errors.
describe Walrat::RegexpParslet do
  # Inputs that the identifier regexp used for @parslet is expected to accept.
  identifiers = %w[
    an_identifier An_Identifier AN_IDENTIFIER
    an_identifier1 An_Identifier1 AN_IDENTIFIER1
    a A a9 A9
    _identifier _Identifier _IDENTIFIER _9Identifier _
  ].freeze

  # Assigned in a `before` hook (not in the describe body) so that the
  # instance variable is visible inside every example.
  before do
    @parslet = Walrat::RegexpParslet.new(/[a-zA-Z_][a-zA-Z0-9_]*/)
  end

  it 'raises an ArgumentError if initialized with nil' do
    expect do
      Walrat::RegexpParslet.new nil
    end.to raise_error(ArgumentError, /nil regexp/)
  end

  it 'parse should succeed if the input string matches' do
    identifiers.each do |identifier|
      expect { @parslet.parse(identifier) }.to_not raise_error
    end
  end

  it 'parse should succeed if the input string matches, even if it continues after the match' do
    identifiers.each do |identifier|
      expect { @parslet.parse("#{identifier}, more") }.to_not raise_error
    end
  end

  it 'parse should return a MatchDataWrapper object' do
    # the result compares equal to the matched text only; trailing input that
    # follows the match is not part of it
    @parslet.parse('an_identifier').should == 'an_identifier'
    @parslet.parse('an_identifier, more').should == 'an_identifier'
  end

  it 'parse should raise an ArgumentError if passed nil' do
    expect do
      @parslet.parse nil
    end.to raise_error(ArgumentError, /nil string/)
  end

  it 'parse should raise a ParseError if the input string does not match' do
    expect { @parslet.parse('9') }.to raise_error(Walrat::ParseError)           # a number is not a valid identifier
    expect { @parslet.parse('9fff') }.to raise_error(Walrat::ParseError)        # identifiers must not start with numbers
    expect { @parslet.parse(' identifier') }.to raise_error(Walrat::ParseError) # note the leading whitespace
    expect { @parslet.parse('') }.to raise_error(Walrat::ParseError)            # empty strings can't match
  end

  it 'should be able to compare parslets for equality' do
    /foo/.to_parseable.should eql(/foo/.to_parseable)     # equal
    /foo/.to_parseable.should_not eql(/bar/.to_parseable) # different
    /foo/.to_parseable.should_not eql(/Foo/.to_parseable) # differing only in case
    /foo/.to_parseable.should_not eql('foo')              # totally different classes
  end

  it 'should accurately pack line and column ends into whatever gets returned from "parse"' do
    # multiline mode so that "." also consumes the end-of-line characters
    parslet = /.+/m.to_parseable

    # each row is: input, expected line_end, expected column_end
    [
      ['hello',          0, 5], # single word
      ["hello\n",        1, 0], # single word with newline at end (UNIX style)
      ["hello\r",        1, 0], # single word with newline at end (Classic Mac style)
      ["hello\r\n",      1, 0], # single word with newline at end (Windows style)
      ["hello\nworld",   1, 5], # two lines (UNIX style)
      ["hello\rworld",   1, 5], # two lines (Classic Mac style)
      ["hello\r\nworld", 1, 5]  # two lines (Windows style)
    ].each do |input, line_end, column_end|
      result = parslet.parse(input)
      result.line_end.should == line_end
      result.column_end.should == column_end
    end
  end

  # in the case of RegexpParslets, the "last successfully scanned position" is
  # always line 0, column 0, however far into the input the mismatch occurs
  # (that is what the expectations below pin down)
  it 'line and column end should reflect last successfully scanned position prior to failure' do
    parslet = /hello\r\nworld/.to_parseable

    [
      'foobar',         # fail right at start
      'hfoobar',        # fail after 1 character
      "hello\r\nfoobar" # fail after end-of-line
    ].each do |input|
      # reset for every input so that an error captured for a previous input
      # cannot satisfy the expectations if this parse unexpectedly succeeds
      exception = nil
      begin
        parslet.parse(input)
      rescue Walrat::ParseError => e
        exception = e
      end
      exception.should_not be_nil
      exception.line_end.should == 0
      exception.column_end.should == 0
    end
  end
end
# Specs for sequencing two regexp parslets with the `&` operator: a successful
# parse yields an array holding each parslet's match, in order.
describe 'chaining two regexp parslets together' do
  it 'parslets should work in specified order' do
    parslet = Walrat::RegexpParslet.new(/foo.\d/) &
              Walrat::RegexpParslet.new(/bar.\d/)
    parslet.parse('foo_1bar_2').should == ['foo_1', 'bar_2']
  end

  # Parsing Expression Grammars match greedily
  it 'parslets should match greedily' do
    # the first parslet should gobble up the entire string, preventing the
    # second parslet from succeeding
    parslet = Walrat::RegexpParslet.new(/foo.+\d/) &
              Walrat::RegexpParslet.new(/bar.+\d/)
    expect { parslet.parse('foo_1bar_2') }.to raise_error(Walrat::ParseError)
  end
end
# Specs for offering two regexp parslets as alternatives with the `|`
# operator: the result is whatever the first matching alternative returns.
describe 'alternating two regexp parslets' do
  it 'either parslet should apply to generate a match' do
    parslet = Walrat::RegexpParslet.new(/\d+/) |
              Walrat::RegexpParslet.new(/[A-Z]+/)
    parslet.parse('ABC').should == 'ABC'
    parslet.parse('123').should == '123'
  end

  it 'should fail if no parslet generates a match' do
    parslet = Walrat::RegexpParslet.new(/\d+/) |
              Walrat::RegexpParslet.new(/[A-Z]+/)
    expect { parslet.parse('abc') }.to raise_error(Walrat::ParseError)
  end

  it 'parslets should be tried in left-to-right order' do
    # in this case the first parslet should win even though the second one is
    # also a valid match; the capture groups reveal which regexp matched
    parslet = Walrat::RegexpParslet.new(/(.)(..)/) |
              Walrat::RegexpParslet.new(/(..)(.)/)
    match_data = parslet.parse('abc').match_data
    match_data[1].should == 'a'
    match_data[2].should == 'bc'

    # here we swap the order; again the first parslet should win
    parslet = Walrat::RegexpParslet.new(/(..)(.)/) |
              Walrat::RegexpParslet.new(/(.)(..)/)
    match_data = parslet.parse('abc').match_data
    match_data[1].should == 'ab'
    match_data[2].should == 'c'
  end
end
# Specs for sequencing three regexp parslets with `&`: the result is a single
# flat array with one entry per parslet.
describe 'chaining three regexp parslets' do
  it 'parslets should work in specified order' do
    parslet = Walrat::RegexpParslet.new(/foo.\d/) &
              Walrat::RegexpParslet.new(/bar.\d/) &
              Walrat::RegexpParslet.new(/.../)
    parslet.parse('foo_1bar_2ABC').should == ['foo_1', 'bar_2', 'ABC']
  end
end
# Specs for offering three regexp parslets as alternatives with `|`.
describe 'alternating three regexp parslets' do
  it 'any parslet should apply to generate a match' do
    parslet = Walrat::RegexpParslet.new(/\d+/) |
              Walrat::RegexpParslet.new(/[A-Z]+/) |
              Walrat::RegexpParslet.new(/[a-z]+/)
    parslet.parse('ABC').should == 'ABC'
    parslet.parse('123').should == '123'
    parslet.parse('abc').should == 'abc'
  end

  it 'should fail if no parslet generates a match' do
    parslet = Walrat::RegexpParslet.new(/\d+/) |
              Walrat::RegexpParslet.new(/[A-Z]+/) |
              Walrat::RegexpParslet.new(/[a-z]+/)
    expect { parslet.parse(':::') }.to raise_error(Walrat::ParseError)
  end

  it 'parslets should be tried in left-to-right order' do
    # in this case the first parslet should win even though the others also
    # produce valid matches; the capture groups reveal which regexp matched
    parslet = Walrat::RegexpParslet.new(/(.)(..)/) |
              Walrat::RegexpParslet.new(/(..)(.)/) |
              Walrat::RegexpParslet.new(/(...)/)
    match_data = parslet.parse('abc').match_data
    match_data[1].should == 'a'
    match_data[2].should == 'bc'

    # here we swap the order; again the first parslet should win
    parslet = Walrat::RegexpParslet.new(/(..)(.)/) |
              Walrat::RegexpParslet.new(/(.)(..)/) |
              Walrat::RegexpParslet.new(/(...)/)
    match_data = parslet.parse('abc').match_data
    match_data[1].should == 'ab'
    match_data[2].should == 'c'

    # similar test but this time the first parslet can't win (doesn't match)
    parslet = Walrat::RegexpParslet.new(/foo/) |
              Walrat::RegexpParslet.new(/(...)/) |
              Walrat::RegexpParslet.new(/(.)(..)/)
    match_data = parslet.parse('abc').match_data
    match_data[1].should == 'abc'
  end
end
# Specs for mixing `&` (sequence) and `|` (choice) in one expression. The
# grouping follows Ruby's own operator precedence, where `&` binds more
# tightly than `|`, unless parentheses say otherwise.
describe 'combining chaining and alternation' do
  it 'chaining should have higher precedence than alternation' do
    # equivalent to /foo/ | ( /bar/ & /abc/ )
    parslet = Walrat::RegexpParslet.new(/foo/) |
              Walrat::RegexpParslet.new(/bar/) &
              Walrat::RegexpParslet.new(/abc/)
    parslet.parse('foo').should == 'foo'                                   # succeed on first choice
    parslet.parse('barabc').should == ['bar', 'abc']                       # succeed on alternate path
    expect { parslet.parse('bar...') }.to raise_error(Walrat::ParseError)  # fail half-way down alternate path
    expect { parslet.parse('lemon') }.to raise_error(Walrat::ParseError)   # fail immediately

    # swap the order, now equivalent to: ( /bar/ & /abc/ ) | /foo/
    parslet = Walrat::RegexpParslet.new(/bar/) &
              Walrat::RegexpParslet.new(/abc/) |
              Walrat::RegexpParslet.new(/foo/)
    parslet.parse('barabc').should == ['bar', 'abc']                       # succeed on first choice
    parslet.parse('foo').should == 'foo'                                   # succeed on alternate path
    expect { parslet.parse('bar...') }.to raise_error(Walrat::ParseError)  # fail half-way down first path
    expect { parslet.parse('lemon') }.to raise_error(Walrat::ParseError)   # fail immediately
  end

  it 'should be able to override precedence using parentheses' do
    # take first example above and make it ( /foo/ | /bar/ ) & /abc/
    parslet = (Walrat::RegexpParslet.new(/foo/) |
               Walrat::RegexpParslet.new(/bar/)) &
              Walrat::RegexpParslet.new(/abc/)
    parslet.parse('fooabc').should == ['foo', 'abc']                       # first choice
    parslet.parse('barabc').should == ['bar', 'abc']                       # second choice
    expect { parslet.parse('foo...') }.to raise_error(Walrat::ParseError)  # fail in second half
    expect { parslet.parse('bar...') }.to raise_error(Walrat::ParseError)  # another way of failing in second half
    expect { parslet.parse('foo') }.to raise_error(Walrat::ParseError)     # another way of failing in second half
    expect { parslet.parse('bar') }.to raise_error(Walrat::ParseError)     # another way of failing in second half
    expect { parslet.parse('lemon') }.to raise_error(Walrat::ParseError)   # fail immediately
    expect { parslet.parse('abcfoo') }.to raise_error(Walrat::ParseError)  # order matters

    # take second example above and make it /bar/ & ( /abc/ | /foo/ )
    parslet = Walrat::RegexpParslet.new(/bar/) &
              (Walrat::RegexpParslet.new(/abc/) | Walrat::RegexpParslet.new(/foo/))
    parslet.parse('barabc').should == ['bar', 'abc']                       # succeed on first choice
    parslet.parse('barfoo').should == ['bar', 'foo']                       # second choice
    expect { parslet.parse('bar...') }.to raise_error(Walrat::ParseError)  # fail in second part
    expect { parslet.parse('bar') }.to raise_error(Walrat::ParseError)     # another way to fail in second part
    expect { parslet.parse('lemon') }.to raise_error(Walrat::ParseError)   # fail immediately
    expect { parslet.parse('abcbar') }.to raise_error(Walrat::ParseError)  # order matters
  end

  it 'should be able to include long runs of sequences' do
    # A & B & C & D | E
    parslet = Walrat::RegexpParslet.new(/a/) &
              Walrat::RegexpParslet.new(/b/) &
              Walrat::RegexpParslet.new(/c/) &
              Walrat::RegexpParslet.new(/d/) |
              Walrat::RegexpParslet.new(/e/)
    parslet.parse('abcd').should == ['a', 'b', 'c', 'd']
    parslet.parse('e').should == 'e'
    expect { parslet.parse('f') }.to raise_error(Walrat::ParseError)
  end

  it 'should be able to include long runs of options' do
    # A | B | C | D & E
    parslet = Walrat::RegexpParslet.new(/a/) |
              Walrat::RegexpParslet.new(/b/) |
              Walrat::RegexpParslet.new(/c/) |
              Walrat::RegexpParslet.new(/d/) &
              Walrat::RegexpParslet.new(/e/)
    parslet.parse('a').should == 'a'
    parslet.parse('b').should == 'b'
    parslet.parse('c').should == 'c'
    parslet.parse('de').should == ['d', 'e']
    expect { parslet.parse('f') }.to raise_error(Walrat::ParseError)
  end

  it 'should be able to alternate repeatedly between sequences and choices' do
    # A & B | C & D | E
    parslet = Walrat::RegexpParslet.new(/a/) &
              Walrat::RegexpParslet.new(/b/) |
              Walrat::RegexpParslet.new(/c/) &
              Walrat::RegexpParslet.new(/d/) |
              Walrat::RegexpParslet.new(/e/)
    parslet.parse('ab').should == ['a', 'b']
    parslet.parse('cd').should == ['c', 'd']
    parslet.parse('e').should == 'e'
    expect { parslet.parse('f') }.to raise_error(Walrat::ParseError)
  end

  it 'should be able to combine long runs with alternation' do
    # A & B & C | D | E | F & G & H
    parslet = Walrat::RegexpParslet.new(/a/) &
              Walrat::RegexpParslet.new(/b/) &
              Walrat::RegexpParslet.new(/c/) |
              Walrat::RegexpParslet.new(/d/) |
              Walrat::RegexpParslet.new(/e/) |
              Walrat::RegexpParslet.new(/f/) &
              Walrat::RegexpParslet.new(/g/) &
              Walrat::RegexpParslet.new(/h/)
    parslet.parse('abc').should == ['a', 'b', 'c']
    parslet.parse('d').should == 'd'
    parslet.parse('e').should == 'e'
    parslet.parse('fgh').should == ['f', 'g', 'h']
    expect { parslet.parse('i') }.to raise_error(Walrat::ParseError)
  end
end