wibble 1.1
parse.h
Go to the documentation of this file.
1// -*- C++ -*- (c) 2011 Petr Rockai
2// (c) 2013 Jan Kriho
3// Support code for writing backtracking recursive-descent parsers
4#include <string>
5#include <cassert>
6#include <deque>
7#include <vector>
8#include <map>
9#include <queue>
10
11#include <wibble/regexp.h>
12
13#ifndef WIBBLE_PARSE_H
14#define WIBBLE_PARSE_H
15
16namespace wibble {
17
18struct Position {
19 std::string source;
20 int line;
21 int column;
22 Position() : source("-"), line(1), column(1) {}
23 bool operator==( const Position &o ) const {
24 return o.source == source && o.line == line && o.column == column;
25 }
26};
27
28template< typename _Id >
29struct Token {
30 typedef _Id Id;
32 std::string data;
34 bool _valid;
35
36 bool valid() { return _valid; }
37
38 Token( Id _id, char c ) : id( _id ), _valid( true ) {
39 data.push_back( c );
40 }
41
42 Token( Id _id, std::string d ) : id( _id ), data( d ), _valid( true ) {}
43 Token() : id( Id( 0 ) ), _valid( false ) {}
44
45 bool operator==( const Token &o ) const {
46 return o._valid == _valid && o.id == id && o.data == data && o.position == position;
47 }
48
49};
50
51template< typename X, typename Y >
52inline std::ostream &operator<<( std::ostream &o, const std::pair< X, Y > &x ) {
53 return o << "(" << x.first << ", " << x.second << ")";
54}
55
56/*
57 * This is a SLOW lexer (at least compared to systems like lex/flex, which
58 * build a finite-state machine to make decisions per input character. We could
59 * do that in theory, but it would still be slow, since we would be in effect
60 * interpreting the FSM anyway, while (f)lex uses an optimising compiler like
61 * gcc. So while this is friendly and flexible, it won't give you a fast
62 * scanner.
63 */
64template< typename Token, typename Stream >
65struct Lexer {
67 typedef std::deque< char > Window;
70
72
73 void shift() {
74 assert( !stream.eof() );
75 std::string r = stream.remove();
76 std::copy( r.begin(), r.end(), std::back_inserter( _window ) );
77 }
78
79 bool eof() {
80 return _window.empty() && stream.eof();
81 }
82
83 std::string window( unsigned n ) {
84 bool valid = ensure_window( n );
85 assert( valid );
86 static_cast< void >( valid );
87 std::deque< char >::iterator b = _window.begin(), e = b;
88 e += n;
89 return std::string( b, e );
90 }
91
92 bool ensure_window( unsigned n ) {
93 while ( _window.size() < n && !stream.eof() )
94 shift();
95 return _window.size() >= n;
96 }
97
98 void consume( int n ) {
99 for( int i = 0; i < n; ++i ) {
100 if ( _window[i] == '\n' ) {
101 current.line ++;
102 current.column = 1;
103 } else
104 current.column ++;
105 }
106 std::deque< char >::iterator b = _window.begin(), e = b;
107 e += n;
108 _window.erase( b, e );
109 }
110
111 void consume( const std::string &s ) {
112 consume( s.length() );
113 }
114
115 void consume( const Token &t ) {
116 // std::cerr << "consuming " << t << std::endl;
117 consume( t.data );
118 }
119
120 void keep( typename Token::Id id, const std::string &data ) {
121 Token t( id, data );
122 t.position = current;
123 if ( t.data.length() > _match.data.length() )
124 _match = t;
125 }
126
127 template< typename I >
128 bool match( I begin, I end ) {
129 if ( !ensure_window( end - begin ) )
130 return false;
131 return std::equal( begin, end, _window.begin() );
132 }
133
134 void match( const std::string &data, typename Token::Id id ) {
135 if ( match( data.begin(), data.end() ) )
136 return keep( id, data );
137 }
138
139 void match( Regexp &r, typename Token::Id id ) {
140 unsigned n = 1, max = 0;
141 while ( r.match( window( n ) ) ) {
142 if ( max && max == r.matchLength( 0 ) )
143 return keep( id, window( max ) );
144 max = r.matchLength( 0 );
145 ++ n;
146 }
147 }
148
149 void match( int (*first)(int), int (*rest)(int), typename Token::Id id )
150 {
151 unsigned n = 1;
152
153 if ( !ensure_window( 1 ) )
154 return;
155
156 if ( !first( _window[0] ) )
157 return;
158
159 while ( true ) {
160 ++ n;
161 if ( ensure_window( n ) && rest( _window[ n - 1 ] ) )
162 continue;
163 return keep( id, window( n - 1 ) );
164 }
165 }
166
167 void match( const std::string &from, const std::string &to, typename Token::Id id ) {
168 if ( !match( from.begin(), from.end() ) )
169 return;
170
171 Window::iterator where = _window.begin();
172 int n = from.length();
173 where += n;
174 while ( true ) {
175 if ( !ensure_window( n + to.length() ) )
176 return;
177
178 if ( std::equal( to.begin(), to.end(), where ) )
179 return keep( id, window( n + to.length() ) );
180 ++ where;
181 ++ n;
182 }
183 }
184
186 while ( !eof() && isspace( window( 1 )[ 0 ] ) )
187 consume( 1 );
188 }
189
191 Token t;
192 std::swap( t, _match );
193 consume( t );
194 return t;
195 }
196
197 Lexer( Stream &s ) : stream( s ) {}
198};
199
200template< typename Token, typename Stream >
202{
204 Stream &stream() { assert( _stream ); return *_stream; }
205
206 std::deque< Token > window;
209 std::string name;
211 std::vector< This > children;
212
213 struct Fail {
215
217 const char *expected;
219
220 bool operator<( const Fail &other ) const {
221 return position > other.position;
222 }
223
224 Fail( const char *err, int pos, Type t = Syntax) {
225 expected = err;
226 position = pos;
227 type = t;
228 }
229 ~Fail() throw () {}
230 };
231
232 std::priority_queue< Fail > failures;
233
234 void clearErrors() {
235 failures = std::priority_queue< Fail >();
236 }
237
238 void error( std::ostream &o, std::string prefix, const Fail &fail ) {
239 Token t = window[ fail.position ];
240 switch ( fail.type ) {
241 case Fail::Syntax:
242 o << prefix
243 << "expected " << fail.expected
244 << " at line " << t.position.line
245 << ", column " << t.position.column
246 << ", but seen " << Token::tokenName[ t.id ] << " '" << t.data << "'"
247 << std::endl;
248 return;
249 case Fail::Semantic:
250 o << prefix
251 << fail.expected
252 << " at line " << t.position.line
253 << ", column " << t.position.column
254 << std::endl;
255 return;
256 }
257 }
258
259 void errors( std::ostream &o ) {
260 for ( typename std::vector< This >::iterator pc = children.begin(); pc != children.end(); ++pc )
261 pc->errors( o );
262
263 if ( failures.empty() )
264 return;
265
266 std::string prefix;
267 switch ( failures.top().type ) {
268 case Fail::Syntax:
269 o << "parse";
270 break;
271 case Fail::Semantic:
272 o << "semantic";
273 break;
274 }
275 o << " error in context " << name << ": ";
276 if ( failures.size() > 1 ) {
277 o << failures.size() << " rightmost alternatives:" << std::endl;
278 prefix = " ";
279 }
280 while ( !failures.empty() ) {
281 error( o, prefix, failures.top() );
282 failures.pop();
283 }
284 }
285
287 if ( int( window.size() ) <= window_pos ) {
288 Token t;
289 do {
290 t = stream().remove();
291 } while ( t.id == Token::Comment ); // XXX
292 window.push_back( t );
293 }
294
295 ++ window_pos;
296 ++ position;
297 return window[ window_pos - 1 ];
298 }
299
300 void rewind( int n ) {
301 assert( n >= 0 );
302 assert( n <= window_pos );
303 window_pos -= n;
304 position -= n;
305 }
306
307 This & createChild( Stream &s, std::string name ) {
308 children.push_back( This( s, name ) );
309 return children.back();
310 }
311
312 ParseContext( Stream &s, std::string name )
313 : _stream( &s ), window_pos( 0 ), position( 0 ), name( name )
314 {}
315};
316
317template< typename Token, typename Stream >
318struct Parser {
319 typedef typename Token::Id TokenId;
322 typedef typename Context::Fail Fail;
323 typedef typename Fail::Type FailType;
325
326 bool valid() const {
327 return ctx;
328 }
329
331 assert( ctx );
332 return *ctx;
333 }
334
335 int position() {
336 return context().position;
337 }
338
339 void rewind( int i ) {
340 context().rewind( i );
341 _position = context().position;
342 }
343
344 void fail( const char *what, FailType type = FailType::Syntax ) __attribute__((noreturn))
345 {
346 Fail f( what, _position, type );
347 context().failures.push( f );
348 while ( context().failures.top().position < _position )
349 context().failures.pop();
350 throw f;
351 }
352
353 void semicolon() {
354 Token t = eat();
355 if ( t.id == Token::Punctuation && t.data == ";" )
356 return;
357
358 rewind( 1 );
359 fail( "semicolon" );
360 }
361
362 void colon() {
363 Token t = eat();
364 if ( t.id == Token::Punctuation && t.data == ":" )
365 return;
366
367 rewind( 1 );
368 fail( "colon" );
369 }
370
372 Token t = eat( false );
373 if ( t.id == id )
374 return t;
375 rewind( 1 );
376 fail( Token::tokenName[id].c_str() );
377 }
378
379
380#if __cplusplus >= 201103L
381 template< typename F >
382 void either( void (F::*f)() ) {
383 (static_cast< F* >( this )->*f)();
384 }
385
386 template< typename F, typename... Args >
387 void either( F f, Args... args ) {
388 if ( maybe( f ) )
389 return;
390 either( args... );
391 }
392#else
393 template< typename F, typename G >
394 void either( F f, void (G::*g)() ) {
395 if ( maybe( f ) )
396 return;
397 (static_cast< G* >( this )->*g)();
398 }
399#endif
400
401
402#if __cplusplus >= 201103L
403 template< typename F, typename... Args >
404 bool maybe( F f, Args... args ) {
405 if ( maybe( f ) )
406 return true;
407 return maybe( args... );
408 }
409
410#else
411 template< typename F, typename G >
412 bool maybe( F f, G g ) {
413 if ( maybe( f ) )
414 return true;
415 return maybe( g );
416 }
417
418
419 template< typename F, typename G, typename H >
420 bool maybe( F f, G g, H h ) {
421 if ( maybe( f ) )
422 return true;
423 if ( maybe( g ) )
424 return true;
425 return maybe( h );
426 }
427
428#endif
429
430 template< typename F >
431 bool maybe( void (F::*f)() ) {
432 int fallback = position();
433 try {
434 (static_cast< F* >( this )->*f)();
435 return true;
436 } catch ( Fail fail ) {
437 rewind( position() - fallback );
438 return false;
439 }
440 }
441
442 bool maybe( TokenId id ) {
443 int fallback = position();
444 try {
445 eat( id );
446 return true;
447 } catch (Fail) {
448 rewind( position() - fallback );
449 return false;
450 }
451 }
452
453 template< typename T, typename I >
454 void many( I i ) {
455 int fallback = 0;
456 try {
457 while ( true ) {
458 fallback = position();
459 *i++ = T( context() );
460 }
461 } catch (Fail) {
462 rewind( position() - fallback );
463 }
464 }
465#if __cplusplus >= 201103L
466 template< typename F >
467 bool arbitrary( F f ) {
468 return maybe( f );
469 }
470
471 template< typename F, typename... Args >
472 bool arbitrary( F f, Args... args ) {
473 bool retval = arbitrary( args... );
474 retval |= maybe( f );
475 retval |= arbitrary( args... );
476 return retval;
477 }
478#endif
479
480 template< typename T, typename I >
481 void list( I i, TokenId sep ) {
482 do {
483 *i++ = T( context() );
484 } while ( next( sep ) );
485 }
486
487 template< typename T, typename I, typename F >
488 void list( I i, void (F::*sep)() ) {
489 int fallback = position();
490 try {
491 while ( true ) {
492 *i++ = T( context() );
493 fallback = position();
494 (static_cast< F* >( this )->*sep)();
495 }
496 } catch(Fail) {
497 rewind( position() - fallback );
498 }
499 }
500
501 template< typename T, typename I >
502 void list( I i, TokenId first, TokenId sep, TokenId last ) {
503 eat( first );
504 list< T >( i, sep );
505 eat( last );
506 }
507
508 Token eat( bool _fail = true ) {
509 Token t = context().remove();
510 _position = context().position;
511 if ( _fail && !t.valid() ) {
512 rewind( 1 );
513 fail( "valid token" );
514 }
515 return t;
516 }
517
518 bool next( TokenId id ) {
519 Token t = eat( false );
520 if ( t.id == id )
521 return true;
522 rewind( 1 );
523 return false;
524 }
525
526 Parser( Context &c ) : ctx( &c ) {}
527 Parser() : ctx( 0 ) {}
528
529};
530
531}
532
533#endif
Definition regexp.h:54
Definition amorph.h:17
std::ostream & operator<<(std::ostream &o, const std::pair< X, Y > &x)
Definition parse.h:52
Definition parse.h:65
void skipWhitespace()
Definition parse.h:185
Position current
Definition parse.h:69
void shift()
Definition parse.h:73
void match(Regexp &r, typename Token::Id id)
Definition parse.h:139
void match(const std::string &from, const std::string &to, typename Token::Id id)
Definition parse.h:167
Token _match
Definition parse.h:71
std::deque< char > Window
Definition parse.h:67
Stream & stream
Definition parse.h:66
bool ensure_window(unsigned n)
Definition parse.h:92
std::string window(unsigned n)
Definition parse.h:83
Window _window
Definition parse.h:68
bool eof()
Definition parse.h:79
Lexer(Stream &s)
Definition parse.h:197
bool match(I begin, I end)
Definition parse.h:128
void match(int(*first)(int), int(*rest)(int), typename Token::Id id)
Definition parse.h:149
void consume(const Token &t)
Definition parse.h:115
Token decide()
Definition parse.h:190
void consume(const std::string &s)
Definition parse.h:111
void match(const std::string &data, typename Token::Id id)
Definition parse.h:134
void consume(int n)
Definition parse.h:98
void keep(typename Token::Id id, const std::string &data)
Definition parse.h:120
Definition parse.h:213
Type type
Definition parse.h:218
Type
Definition parse.h:214
@ Syntax
Definition parse.h:214
@ Semantic
Definition parse.h:214
bool operator<(const Fail &other) const
Definition parse.h:220
const char * expected
Definition parse.h:217
~Fail()
Definition parse.h:229
int position
Definition parse.h:216
Fail(const char *err, int pos, Type t=Syntax)
Definition parse.h:224
Definition parse.h:202
std::vector< This > children
Definition parse.h:211
std::priority_queue< Fail > failures
Definition parse.h:232
void rewind(int n)
Definition parse.h:300
void errors(std::ostream &o)
Definition parse.h:259
This & createChild(Stream &s, std::string name)
Definition parse.h:307
Stream & stream()
Definition parse.h:204
ParseContext< Token, Stream > This
Definition parse.h:210
int position
Definition parse.h:208
void clearErrors()
Definition parse.h:234
ParseContext(Stream &s, std::string name)
Definition parse.h:312
std::deque< Token > window
Definition parse.h:206
void error(std::ostream &o, std::string prefix, const Fail &fail)
Definition parse.h:238
std::string name
Definition parse.h:209
Stream * _stream
Definition parse.h:203
Token remove()
Definition parse.h:286
int window_pos
Definition parse.h:207
Definition parse.h:318
void semicolon()
Definition parse.h:353
Context * ctx
Definition parse.h:321
void fail(const char *what, FailType type=FailType::Syntax) __attribute__((noreturn))
Definition parse.h:344
void many(I i)
Definition parse.h:454
void list(I i, TokenId sep)
Definition parse.h:481
bool maybe(void(F::*f)())
Definition parse.h:431
Parser(Context &c)
Definition parse.h:526
int _position
Definition parse.h:324
Parser()
Definition parse.h:527
Token::Id TokenId
Definition parse.h:319
bool maybe(F f, G g, H h)
Definition parse.h:420
void list(I i, void(F::*sep)())
Definition parse.h:488
int position()
Definition parse.h:335
void rewind(int i)
Definition parse.h:339
bool valid() const
Definition parse.h:326
Context & context()
Definition parse.h:330
void colon()
Definition parse.h:362
Token eat(bool _fail=true)
Definition parse.h:508
void either(F f, void(G::*g)())
Definition parse.h:394
bool next(TokenId id)
Definition parse.h:518
Fail::Type FailType
Definition parse.h:323
bool maybe(TokenId id)
Definition parse.h:442
ParseContext< Token, Stream > Context
Definition parse.h:320
Context::Fail Fail
Definition parse.h:322
bool maybe(F f, G g)
Definition parse.h:412
Token eat(TokenId id)
Definition parse.h:371
void list(I i, TokenId first, TokenId sep, TokenId last)
Definition parse.h:502
Definition parse.h:18
int column
Definition parse.h:21
bool operator==(const Position &o) const
Definition parse.h:23
Position()
Definition parse.h:22
int line
Definition parse.h:20
std::string source
Definition parse.h:19
Definition amorph.h:30
Definition parse.h:29
bool _valid
Definition parse.h:34
Position position
Definition parse.h:33
std::string data
Definition parse.h:32
Token(Id _id, std::string d)
Definition parse.h:42
_Id Id
Definition parse.h:30
bool valid()
Definition parse.h:36
Token()
Definition parse.h:43
bool operator==(const Token &o) const
Definition parse.h:45
Token(Id _id, char c)
Definition parse.h:38
Id id
Definition parse.h:31
#define assert(x)
Definition test.h:30