libstdc++
|
00001 // class template regex -*- C++ -*- 00002 00003 // Copyright (C) 2013-2014 Free Software Foundation, Inc. 00004 // 00005 // This file is part of the GNU ISO C++ Library. This library is free 00006 // software; you can redistribute it and/or modify it under the 00007 // terms of the GNU General Public License as published by the 00008 // Free Software Foundation; either version 3, or (at your option) 00009 // any later version. 00010 00011 // This library is distributed in the hope that it will be useful, 00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 // GNU General Public License for more details. 00015 00016 // Under Section 7 of GPL version 3, you are granted additional 00017 // permissions described in the GCC Runtime Library Exception, version 00018 // 3.1, as published by the Free Software Foundation. 00019 00020 // You should have received a copy of the GNU General Public License and 00021 // a copy of the GCC Runtime Library Exception along with this program; 00022 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 00023 // <http://www.gnu.org/licenses/>. 00024 00025 /** 00026 * @file bits/regex_compiler.tcc 00027 * This is an internal header file, included by other library headers. 00028 * Do not attempt to use it directly. @headername{regex} 00029 */ 00030 00031 // FIXME make comments doxygen format. 00032 00033 // This compiler refers to "Regular Expression Matching Can Be Simple And Fast" 00034 // (http://swtch.com/~rsc/regexp/regexp1.html"), 00035 // but doesn't strictly follow it. 00036 // 00037 // When compiling, states are *chained* instead of tree- or graph-constructed. 00038 // It's more like structured programs: there's if statement and loop statement. 00039 // 00040 // For alternative structure(say "a|b"), aka "if statement", two branchs should 00041 // be constructed. However, these two shall merge to an "end_tag" at the end of 00042 // this operator: 00043 // 00044 // branch1 00045 // / \ 00046 // => begin_tag end_tag => 00047 // \ / 00048 // branch2 00049 // 00050 // This is the difference between this implementation and that in Russ's 00051 // article. 00052 // 00053 // That's why we introduced dummy node here ------ "end_tag" is a dummy node. 00054 // All dummy node will be eliminated at the end of compiling process. 00055 00056 namespace std _GLIBCXX_VISIBILITY(default) 00057 { 00058 namespace __detail 00059 { 00060 _GLIBCXX_BEGIN_NAMESPACE_VERSION 00061 00062 template<typename _TraitsT> 00063 _Compiler<_TraitsT>:: 00064 _Compiler(_IterT __b, _IterT __e, 00065 const _TraitsT& __traits, _FlagT __flags) 00066 : _M_flags((__flags 00067 & (regex_constants::ECMAScript 00068 | regex_constants::basic 00069 | regex_constants::extended 00070 | regex_constants::grep 00071 | regex_constants::egrep 00072 | regex_constants::awk)) 00073 ? __flags 00074 : __flags | regex_constants::ECMAScript), 00075 _M_traits(__traits), 00076 _M_ctype(std::use_facet<_CtypeT>(_M_traits.getloc())), 00077 _M_scanner(__b, __e, _M_flags, _M_traits.getloc()), 00078 _M_nfa(_M_flags) 00079 { 00080 _StateSeqT __r(_M_nfa, _M_nfa._M_start()); 00081 __r._M_append(_M_nfa._M_insert_subexpr_begin()); 00082 this->_M_disjunction(); 00083 if (!_M_match_token(_ScannerT::_S_token_eof)) 00084 __throw_regex_error(regex_constants::error_paren); 00085 __r._M_append(_M_pop()); 00086 _GLIBCXX_DEBUG_ASSERT(_M_stack.empty()); 00087 __r._M_append(_M_nfa._M_insert_subexpr_end()); 00088 __r._M_append(_M_nfa._M_insert_accept()); 00089 _M_nfa._M_eliminate_dummy(); 00090 } 00091 00092 template<typename _TraitsT> 00093 void 00094 _Compiler<_TraitsT>:: 00095 _M_disjunction() 00096 { 00097 this->_M_alternative(); 00098 while (_M_match_token(_ScannerT::_S_token_or)) 00099 { 00100 _StateSeqT __alt1 = _M_pop(); 00101 this->_M_alternative(); 00102 _StateSeqT __alt2 = _M_pop(); 00103 auto __end = _M_nfa._M_insert_dummy(); 00104 __alt1._M_append(__end); 00105 __alt2._M_append(__end); 00106 _M_stack.push(_StateSeqT(_M_nfa, 00107 _M_nfa._M_insert_alt(__alt1._M_start, 00108 __alt2._M_start, false), 00109 __end)); 00110 } 00111 } 00112 00113 template<typename _TraitsT> 00114 void 00115 _Compiler<_TraitsT>:: 00116 _M_alternative() 00117 { 00118 if (this->_M_term()) 00119 { 00120 _StateSeqT __re = _M_pop(); 00121 this->_M_alternative(); 00122 __re._M_append(_M_pop()); 00123 _M_stack.push(__re); 00124 } 00125 else 00126 _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy())); 00127 } 00128 00129 template<typename _TraitsT> 00130 bool 00131 _Compiler<_TraitsT>:: 00132 _M_term() 00133 { 00134 if (this->_M_assertion()) 00135 return true; 00136 if (this->_M_atom()) 00137 { 00138 while (this->_M_quantifier()); 00139 return true; 00140 } 00141 return false; 00142 } 00143 00144 template<typename _TraitsT> 00145 bool 00146 _Compiler<_TraitsT>:: 00147 _M_assertion() 00148 { 00149 if (_M_match_token(_ScannerT::_S_token_line_begin)) 00150 _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_line_begin())); 00151 else if (_M_match_token(_ScannerT::_S_token_line_end)) 00152 _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_line_end())); 00153 else if (_M_match_token(_ScannerT::_S_token_word_bound)) 00154 // _M_value[0] == 'n' means it's negtive, say "not word boundary". 00155 _M_stack.push(_StateSeqT(_M_nfa, _M_nfa. 00156 _M_insert_word_bound(_M_value[0] == 'n'))); 00157 else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin)) 00158 { 00159 auto __neg = _M_value[0] == 'n'; 00160 this->_M_disjunction(); 00161 if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 00162 __throw_regex_error(regex_constants::error_paren); 00163 auto __tmp = _M_pop(); 00164 __tmp._M_append(_M_nfa._M_insert_accept()); 00165 _M_stack.push( 00166 _StateSeqT( 00167 _M_nfa, 00168 _M_nfa._M_insert_lookahead(__tmp._M_start, __neg))); 00169 } 00170 else 00171 return false; 00172 return true; 00173 } 00174 00175 template<typename _TraitsT> 00176 bool 00177 _Compiler<_TraitsT>:: 00178 _M_quantifier() 00179 { 00180 bool __neg = (_M_flags & regex_constants::ECMAScript); 00181 auto __init = [this, &__neg]() 00182 { 00183 if (_M_stack.empty()) 00184 __throw_regex_error(regex_constants::error_badrepeat); 00185 __neg = __neg && _M_match_token(_ScannerT::_S_token_opt); 00186 }; 00187 if (_M_match_token(_ScannerT::_S_token_closure0)) 00188 { 00189 __init(); 00190 auto __e = _M_pop(); 00191 _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id, 00192 __e._M_start, __neg)); 00193 __e._M_append(__r); 00194 _M_stack.push(__r); 00195 } 00196 else if (_M_match_token(_ScannerT::_S_token_closure1)) 00197 { 00198 __init(); 00199 auto __e = _M_pop(); 00200 __e._M_append(_M_nfa._M_insert_alt(_S_invalid_state_id, __e._M_start, 00201 __neg)); 00202 _M_stack.push(__e); 00203 } 00204 else if (_M_match_token(_ScannerT::_S_token_opt)) 00205 { 00206 __init(); 00207 auto __e = _M_pop(); 00208 auto __end = _M_nfa._M_insert_dummy(); 00209 _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id, 00210 __e._M_start, __neg)); 00211 __e._M_append(__end); 00212 __r._M_append(__end); 00213 _M_stack.push(__r); 00214 } 00215 else if (_M_match_token(_ScannerT::_S_token_interval_begin)) 00216 { 00217 if (_M_stack.empty()) 00218 __throw_regex_error(regex_constants::error_badrepeat); 00219 if (!_M_match_token(_ScannerT::_S_token_dup_count)) 00220 __throw_regex_error(regex_constants::error_badbrace); 00221 _StateSeqT __r(_M_pop()); 00222 _StateSeqT __e(_M_nfa, _M_nfa._M_insert_dummy()); 00223 long __min_rep = _M_cur_int_value(10); 00224 bool __infi = false; 00225 long __n; 00226 00227 // {3 00228 if (_M_match_token(_ScannerT::_S_token_comma)) 00229 if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7} 00230 __n = _M_cur_int_value(10) - __min_rep; 00231 else 00232 __infi = true; 00233 else 00234 __n = 0; 00235 if (!_M_match_token(_ScannerT::_S_token_interval_end)) 00236 __throw_regex_error(regex_constants::error_brace); 00237 00238 __neg = __neg && _M_match_token(_ScannerT::_S_token_opt); 00239 00240 for (long __i = 0; __i < __min_rep; ++__i) 00241 __e._M_append(__r._M_clone()); 00242 00243 if (__infi) 00244 { 00245 auto __tmp = __r._M_clone(); 00246 _StateSeqT __s(_M_nfa, 00247 _M_nfa._M_insert_alt(_S_invalid_state_id, 00248 __tmp._M_start, __neg)); 00249 __tmp._M_append(__s); 00250 __e._M_append(__s); 00251 } 00252 else 00253 { 00254 if (__n < 0) 00255 __throw_regex_error(regex_constants::error_badbrace); 00256 auto __end = _M_nfa._M_insert_dummy(); 00257 // _M_alt is the "match more" branch, and _M_next is the 00258 // "match less" one. Switch _M_alt and _M_next of all created 00259 // nodes. This is a hacking but IMO works well. 00260 std::stack<_StateIdT> __stack; 00261 for (long __i = 0; __i < __n; ++__i) 00262 { 00263 auto __tmp = __r._M_clone(); 00264 auto __alt = _M_nfa._M_insert_alt(__tmp._M_start, 00265 __end, __neg); 00266 __stack.push(__alt); 00267 __e._M_append(_StateSeqT(_M_nfa, __alt, __tmp._M_end)); 00268 } 00269 __e._M_append(__end); 00270 while (!__stack.empty()) 00271 { 00272 auto& __tmp = _M_nfa[__stack.top()]; 00273 __stack.pop(); 00274 std::swap(__tmp._M_next, __tmp._M_alt); 00275 } 00276 } 00277 _M_stack.push(__e); 00278 } 00279 else 00280 return false; 00281 return true; 00282 } 00283 00284 #define __INSERT_REGEX_MATCHER(__func, args...)\ 00285 do\ 00286 if (!(_M_flags & regex_constants::icase))\ 00287 if (!(_M_flags & regex_constants::collate))\ 00288 __func<false, false>(args);\ 00289 else\ 00290 __func<false, true>(args);\ 00291 else\ 00292 if (!(_M_flags & regex_constants::collate))\ 00293 __func<true, false>(args);\ 00294 else\ 00295 __func<true, true>(args);\ 00296 while (false) 00297 00298 template<typename _TraitsT> 00299 bool 00300 _Compiler<_TraitsT>:: 00301 _M_atom() 00302 { 00303 if (_M_match_token(_ScannerT::_S_token_anychar)) 00304 { 00305 if (!(_M_flags & regex_constants::ECMAScript)) 00306 __INSERT_REGEX_MATCHER(_M_insert_any_matcher_posix); 00307 else 00308 __INSERT_REGEX_MATCHER(_M_insert_any_matcher_ecma); 00309 } 00310 else if (_M_try_char()) 00311 __INSERT_REGEX_MATCHER(_M_insert_char_matcher); 00312 else if (_M_match_token(_ScannerT::_S_token_backref)) 00313 _M_stack.push(_StateSeqT(_M_nfa, _M_nfa. 00314 _M_insert_backref(_M_cur_int_value(10)))); 00315 else if (_M_match_token(_ScannerT::_S_token_quoted_class)) 00316 __INSERT_REGEX_MATCHER(_M_insert_character_class_matcher); 00317 else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin)) 00318 { 00319 _StateSeqT __r(_M_nfa, _M_nfa._M_insert_dummy()); 00320 this->_M_disjunction(); 00321 if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 00322 __throw_regex_error(regex_constants::error_paren); 00323 __r._M_append(_M_pop()); 00324 _M_stack.push(__r); 00325 } 00326 else if (_M_match_token(_ScannerT::_S_token_subexpr_begin)) 00327 { 00328 _StateSeqT __r(_M_nfa, _M_nfa._M_insert_subexpr_begin()); 00329 this->_M_disjunction(); 00330 if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 00331 __throw_regex_error(regex_constants::error_paren); 00332 __r._M_append(_M_pop()); 00333 __r._M_append(_M_nfa._M_insert_subexpr_end()); 00334 _M_stack.push(__r); 00335 } 00336 else if (!_M_bracket_expression()) 00337 return false; 00338 return true; 00339 } 00340 00341 template<typename _TraitsT> 00342 bool 00343 _Compiler<_TraitsT>:: 00344 _M_bracket_expression() 00345 { 00346 bool __neg = 00347 _M_match_token(_ScannerT::_S_token_bracket_neg_begin); 00348 if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin))) 00349 return false; 00350 __INSERT_REGEX_MATCHER(_M_insert_bracket_matcher, __neg); 00351 return true; 00352 } 00353 #undef __INSERT_REGEX_MATCHER 00354 00355 template<typename _TraitsT> 00356 template<bool __icase, bool __collate> 00357 void 00358 _Compiler<_TraitsT>:: 00359 _M_insert_any_matcher_ecma() 00360 { 00361 _M_stack.push(_StateSeqT(_M_nfa, 00362 _M_nfa._M_insert_matcher 00363 (_AnyMatcher<_TraitsT, true, __icase, __collate> 00364 (_M_traits)))); 00365 } 00366 00367 template<typename _TraitsT> 00368 template<bool __icase, bool __collate> 00369 void 00370 _Compiler<_TraitsT>:: 00371 _M_insert_any_matcher_posix() 00372 { 00373 _M_stack.push(_StateSeqT(_M_nfa, 00374 _M_nfa._M_insert_matcher 00375 (_AnyMatcher<_TraitsT, false, __icase, __collate> 00376 (_M_traits)))); 00377 } 00378 00379 template<typename _TraitsT> 00380 template<bool __icase, bool __collate> 00381 void 00382 _Compiler<_TraitsT>:: 00383 _M_insert_char_matcher() 00384 { 00385 _M_stack.push(_StateSeqT(_M_nfa, 00386 _M_nfa._M_insert_matcher 00387 (_CharMatcher<_TraitsT, __icase, __collate> 00388 (_M_value[0], _M_traits)))); 00389 } 00390 00391 template<typename _TraitsT> 00392 template<bool __icase, bool __collate> 00393 void 00394 _Compiler<_TraitsT>:: 00395 _M_insert_character_class_matcher() 00396 { 00397 _GLIBCXX_DEBUG_ASSERT(_M_value.size() == 1); 00398 _BracketMatcher<_TraitsT, __icase, __collate> __matcher 00399 (_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits); 00400 __matcher._M_add_character_class(_M_value, false); 00401 __matcher._M_ready(); 00402 _M_stack.push(_StateSeqT(_M_nfa, 00403 _M_nfa._M_insert_matcher(std::move(__matcher)))); 00404 } 00405 00406 template<typename _TraitsT> 00407 template<bool __icase, bool __collate> 00408 void 00409 _Compiler<_TraitsT>:: 00410 _M_insert_bracket_matcher(bool __neg) 00411 { 00412 _BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits); 00413 pair<bool, _CharT> __last_char; // Optional<_CharT> 00414 __last_char.first = false; 00415 if (!(_M_flags & regex_constants::ECMAScript)) 00416 if (_M_try_char()) 00417 { 00418 __matcher._M_add_char(_M_value[0]); 00419 __last_char.first = true; 00420 __last_char.second = _M_value[0]; 00421 } 00422 while (_M_expression_term(__last_char, __matcher)); 00423 __matcher._M_ready(); 00424 _M_stack.push(_StateSeqT( 00425 _M_nfa, 00426 _M_nfa._M_insert_matcher(std::move(__matcher)))); 00427 } 00428 00429 template<typename _TraitsT> 00430 template<bool __icase, bool __collate> 00431 bool 00432 _Compiler<_TraitsT>:: 00433 _M_expression_term(pair<bool, _CharT>& __last_char, 00434 _BracketMatcher<_TraitsT, __icase, __collate>& __matcher) 00435 00436 { 00437 if (_M_match_token(_ScannerT::_S_token_bracket_end)) 00438 return false; 00439 00440 if (_M_match_token(_ScannerT::_S_token_collsymbol)) 00441 { 00442 auto __symbol = __matcher._M_add_collate_element(_M_value); 00443 if (__symbol.size() == 1) 00444 { 00445 __last_char.first = true; 00446 __last_char.second = __symbol[0]; 00447 } 00448 } 00449 else if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) 00450 __matcher._M_add_equivalence_class(_M_value); 00451 else if (_M_match_token(_ScannerT::_S_token_char_class_name)) 00452 __matcher._M_add_character_class(_M_value, false); 00453 // POSIX doesn't allow '-' as a start-range char (say [a-z--0]), 00454 // except when the '-' is the first or last character in the bracket 00455 // expression ([--0]). ECMAScript treats all '-' after a range as a 00456 // normal character. Also see above, where _M_expression_term gets called. 00457 // 00458 // As a result, POSIX rejects [-----], but ECMAScript doesn't. 00459 // Boost (1.57.0) always uses POSIX style even in its ECMAScript syntax. 00460 // Clang (3.5) always uses ECMAScript style even in its POSIX syntax. 00461 // 00462 // It turns out that no one reads BNFs ;) 00463 else if (_M_try_char()) 00464 { 00465 if (!__last_char.first) 00466 { 00467 __matcher._M_add_char(_M_value[0]); 00468 if (_M_value[0] == '-' 00469 && !(_M_flags & regex_constants::ECMAScript)) 00470 { 00471 if (_M_match_token(_ScannerT::_S_token_bracket_end)) 00472 return false; 00473 __throw_regex_error(regex_constants::error_range); 00474 } 00475 __last_char.first = true; 00476 __last_char.second = _M_value[0]; 00477 } 00478 else 00479 { 00480 if (_M_value[0] == '-') 00481 { 00482 if (_M_try_char()) 00483 { 00484 __matcher._M_make_range(__last_char.second , _M_value[0]); 00485 __last_char.first = false; 00486 } 00487 else 00488 { 00489 if (_M_scanner._M_get_token() 00490 != _ScannerT::_S_token_bracket_end) 00491 __throw_regex_error(regex_constants::error_range); 00492 __matcher._M_add_char(_M_value[0]); 00493 } 00494 } 00495 else 00496 { 00497 __matcher._M_add_char(_M_value[0]); 00498 __last_char.second = _M_value[0]; 00499 } 00500 } 00501 } 00502 else if (_M_match_token(_ScannerT::_S_token_quoted_class)) 00503 __matcher._M_add_character_class(_M_value, 00504 _M_ctype.is(_CtypeT::upper, 00505 _M_value[0])); 00506 else 00507 __throw_regex_error(regex_constants::error_brack); 00508 00509 return true; 00510 } 00511 00512 template<typename _TraitsT> 00513 bool 00514 _Compiler<_TraitsT>:: 00515 _M_try_char() 00516 { 00517 bool __is_char = false; 00518 if (_M_match_token(_ScannerT::_S_token_oct_num)) 00519 { 00520 __is_char = true; 00521 _M_value.assign(1, _M_cur_int_value(8)); 00522 } 00523 else if (_M_match_token(_ScannerT::_S_token_hex_num)) 00524 { 00525 __is_char = true; 00526 _M_value.assign(1, _M_cur_int_value(16)); 00527 } 00528 else if (_M_match_token(_ScannerT::_S_token_ord_char)) 00529 __is_char = true; 00530 return __is_char; 00531 } 00532 00533 template<typename _TraitsT> 00534 bool 00535 _Compiler<_TraitsT>:: 00536 _M_match_token(_TokenT token) 00537 { 00538 if (token == _M_scanner._M_get_token()) 00539 { 00540 _M_value = _M_scanner._M_get_value(); 00541 _M_scanner._M_advance(); 00542 return true; 00543 } 00544 return false; 00545 } 00546 00547 template<typename _TraitsT> 00548 int 00549 _Compiler<_TraitsT>:: 00550 _M_cur_int_value(int __radix) 00551 { 00552 long __v = 0; 00553 for (typename _StringT::size_type __i = 0; 00554 __i < _M_value.length(); ++__i) 00555 __v =__v * __radix + _M_traits.value(_M_value[__i], __radix); 00556 return __v; 00557 } 00558 00559 template<typename _TraitsT, bool __icase, bool __collate> 00560 bool 00561 _BracketMatcher<_TraitsT, __icase, __collate>:: 00562 _M_apply(_CharT __ch, false_type) const 00563 { 00564 bool __ret = false; 00565 if (std::find(_M_char_set.begin(), _M_char_set.end(), 00566 _M_translator._M_translate(__ch)) 00567 != _M_char_set.end()) 00568 __ret = true; 00569 else 00570 { 00571 auto __s = _M_translator._M_transform(__ch); 00572 for (auto& __it : _M_range_set) 00573 if (__it.first <= __s && __s <= __it.second) 00574 { 00575 __ret = true; 00576 break; 00577 } 00578 if (_M_traits.isctype(__ch, _M_class_set)) 00579 __ret = true; 00580 else if (std::find(_M_equiv_set.begin(), _M_equiv_set.end(), 00581 _M_traits.transform_primary(&__ch, &__ch+1)) 00582 != _M_equiv_set.end()) 00583 __ret = true; 00584 else 00585 { 00586 for (auto& __it : _M_neg_class_set) 00587 if (!_M_traits.isctype(__ch, __it)) 00588 { 00589 __ret = true; 00590 break; 00591 } 00592 } 00593 } 00594 if (_M_is_non_matching) 00595 return !__ret; 00596 else 00597 return __ret; 00598 } 00599 00600 _GLIBCXX_END_NAMESPACE_VERSION 00601 } // namespace __detail 00602 } // namespace