libstdc++
|
00001 // class template regex -*- C++ -*- 00002 00003 // Copyright (C) 2013-2015 Free Software Foundation, Inc. 00004 // 00005 // This file is part of the GNU ISO C++ Library. This library is free 00006 // software; you can redistribute it and/or modify it under the 00007 // terms of the GNU General Public License as published by the 00008 // Free Software Foundation; either version 3, or (at your option) 00009 // any later version. 00010 00011 // This library is distributed in the hope that it will be useful, 00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 // GNU General Public License for more details. 00015 00016 // Under Section 7 of GPL version 3, you are granted additional 00017 // permissions described in the GCC Runtime Library Exception, version 00018 // 3.1, as published by the Free Software Foundation. 00019 00020 // You should have received a copy of the GNU General Public License and 00021 // a copy of the GCC Runtime Library Exception along with this program; 00022 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 00023 // <http://www.gnu.org/licenses/>. 00024 00025 /** 00026 * @file bits/regex_compiler.tcc 00027 * This is an internal header file, included by other library headers. 00028 * Do not attempt to use it directly. @headername{regex} 00029 */ 00030 00031 // FIXME make comments doxygen format. 00032 00033 // This compiler refers to "Regular Expression Matching Can Be Simple And Fast" 00034 // (http://swtch.com/~rsc/regexp/regexp1.html"), 00035 // but doesn't strictly follow it. 00036 // 00037 // When compiling, states are *chained* instead of tree- or graph-constructed. 00038 // It's more like structured programs: there's if statement and loop statement. 00039 // 00040 // For alternative structure (say "a|b"), aka "if statement", two branches 00041 // should be constructed. However, these two shall merge to an "end_tag" at 00042 // the end of this operator: 00043 // 00044 // branch1 00045 // / \ 00046 // => begin_tag end_tag => 00047 // \ / 00048 // branch2 00049 // 00050 // This is the difference between this implementation and that in Russ's 00051 // article. 00052 // 00053 // That's why we introduced dummy node here ------ "end_tag" is a dummy node. 00054 // All dummy node will be eliminated at the end of compiling process. 00055 00056 namespace std _GLIBCXX_VISIBILITY(default) 00057 { 00058 namespace __detail 00059 { 00060 _GLIBCXX_BEGIN_NAMESPACE_VERSION 00061 00062 template<typename _TraitsT> 00063 _Compiler<_TraitsT>:: 00064 _Compiler(_IterT __b, _IterT __e, 00065 const typename _TraitsT::locale_type& __loc, _FlagT __flags) 00066 : _M_flags((__flags 00067 & (regex_constants::ECMAScript 00068 | regex_constants::basic 00069 | regex_constants::extended 00070 | regex_constants::grep 00071 | regex_constants::egrep 00072 | regex_constants::awk)) 00073 ? __flags 00074 : __flags | regex_constants::ECMAScript), 00075 _M_scanner(__b, __e, _M_flags, __loc), 00076 _M_nfa(make_shared<_RegexT>(__loc, _M_flags)), 00077 _M_traits(_M_nfa->_M_traits), 00078 _M_ctype(std::use_facet<_CtypeT>(__loc)) 00079 { 00080 _StateSeqT __r(*_M_nfa, _M_nfa->_M_start()); 00081 __r._M_append(_M_nfa->_M_insert_subexpr_begin()); 00082 this->_M_disjunction(); 00083 if (!_M_match_token(_ScannerT::_S_token_eof)) 00084 __throw_regex_error(regex_constants::error_paren); 00085 __r._M_append(_M_pop()); 00086 _GLIBCXX_DEBUG_ASSERT(_M_stack.empty()); 00087 __r._M_append(_M_nfa->_M_insert_subexpr_end()); 00088 __r._M_append(_M_nfa->_M_insert_accept()); 00089 _M_nfa->_M_eliminate_dummy(); 00090 } 00091 00092 template<typename _TraitsT> 00093 void 00094 _Compiler<_TraitsT>:: 00095 _M_disjunction() 00096 { 00097 this->_M_alternative(); 00098 while (_M_match_token(_ScannerT::_S_token_or)) 00099 { 00100 _StateSeqT __alt1 = _M_pop(); 00101 this->_M_alternative(); 00102 _StateSeqT __alt2 = _M_pop(); 00103 auto __end = _M_nfa->_M_insert_dummy(); 00104 __alt1._M_append(__end); 00105 __alt2._M_append(__end); 00106 // __alt2 is state._M_next, __alt1 is state._M_alt. The executor 00107 // executes _M_alt before _M_next, as well as executing left 00108 // alternative before right one. 00109 _M_stack.push(_StateSeqT(*_M_nfa, 00110 _M_nfa->_M_insert_alt( 00111 __alt2._M_start, __alt1._M_start, false), 00112 __end)); 00113 } 00114 } 00115 00116 template<typename _TraitsT> 00117 void 00118 _Compiler<_TraitsT>:: 00119 _M_alternative() 00120 { 00121 if (this->_M_term()) 00122 { 00123 _StateSeqT __re = _M_pop(); 00124 this->_M_alternative(); 00125 __re._M_append(_M_pop()); 00126 _M_stack.push(__re); 00127 } 00128 else 00129 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_dummy())); 00130 } 00131 00132 template<typename _TraitsT> 00133 bool 00134 _Compiler<_TraitsT>:: 00135 _M_term() 00136 { 00137 if (this->_M_assertion()) 00138 return true; 00139 if (this->_M_atom()) 00140 { 00141 while (this->_M_quantifier()); 00142 return true; 00143 } 00144 return false; 00145 } 00146 00147 template<typename _TraitsT> 00148 bool 00149 _Compiler<_TraitsT>:: 00150 _M_assertion() 00151 { 00152 if (_M_match_token(_ScannerT::_S_token_line_begin)) 00153 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_begin())); 00154 else if (_M_match_token(_ScannerT::_S_token_line_end)) 00155 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_end())); 00156 else if (_M_match_token(_ScannerT::_S_token_word_bound)) 00157 // _M_value[0] == 'n' means it's negative, say "not word boundary". 00158 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa-> 00159 _M_insert_word_bound(_M_value[0] == 'n'))); 00160 else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin)) 00161 { 00162 auto __neg = _M_value[0] == 'n'; 00163 this->_M_disjunction(); 00164 if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 00165 __throw_regex_error(regex_constants::error_paren); 00166 auto __tmp = _M_pop(); 00167 __tmp._M_append(_M_nfa->_M_insert_accept()); 00168 _M_stack.push( 00169 _StateSeqT( 00170 *_M_nfa, 00171 _M_nfa->_M_insert_lookahead(__tmp._M_start, __neg))); 00172 } 00173 else 00174 return false; 00175 return true; 00176 } 00177 00178 template<typename _TraitsT> 00179 bool 00180 _Compiler<_TraitsT>:: 00181 _M_quantifier() 00182 { 00183 bool __neg = (_M_flags & regex_constants::ECMAScript); 00184 auto __init = [this, &__neg]() 00185 { 00186 if (_M_stack.empty()) 00187 __throw_regex_error(regex_constants::error_badrepeat); 00188 __neg = __neg && _M_match_token(_ScannerT::_S_token_opt); 00189 }; 00190 if (_M_match_token(_ScannerT::_S_token_closure0)) 00191 { 00192 __init(); 00193 auto __e = _M_pop(); 00194 _StateSeqT __r(*_M_nfa, 00195 _M_nfa->_M_insert_repeat(_S_invalid_state_id, 00196 __e._M_start, __neg)); 00197 __e._M_append(__r); 00198 _M_stack.push(__r); 00199 } 00200 else if (_M_match_token(_ScannerT::_S_token_closure1)) 00201 { 00202 __init(); 00203 auto __e = _M_pop(); 00204 __e._M_append(_M_nfa->_M_insert_repeat(_S_invalid_state_id, 00205 __e._M_start, __neg)); 00206 _M_stack.push(__e); 00207 } 00208 else if (_M_match_token(_ScannerT::_S_token_opt)) 00209 { 00210 __init(); 00211 auto __e = _M_pop(); 00212 auto __end = _M_nfa->_M_insert_dummy(); 00213 _StateSeqT __r(*_M_nfa, 00214 _M_nfa->_M_insert_repeat(_S_invalid_state_id, 00215 __e._M_start, __neg)); 00216 __e._M_append(__end); 00217 __r._M_append(__end); 00218 _M_stack.push(__r); 00219 } 00220 else if (_M_match_token(_ScannerT::_S_token_interval_begin)) 00221 { 00222 if (_M_stack.empty()) 00223 __throw_regex_error(regex_constants::error_badrepeat); 00224 if (!_M_match_token(_ScannerT::_S_token_dup_count)) 00225 __throw_regex_error(regex_constants::error_badbrace); 00226 _StateSeqT __r(_M_pop()); 00227 _StateSeqT __e(*_M_nfa, _M_nfa->_M_insert_dummy()); 00228 long __min_rep = _M_cur_int_value(10); 00229 bool __infi = false; 00230 long __n; 00231 00232 // {3 00233 if (_M_match_token(_ScannerT::_S_token_comma)) 00234 if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7} 00235 __n = _M_cur_int_value(10) - __min_rep; 00236 else 00237 __infi = true; 00238 else 00239 __n = 0; 00240 if (!_M_match_token(_ScannerT::_S_token_interval_end)) 00241 __throw_regex_error(regex_constants::error_brace); 00242 00243 __neg = __neg && _M_match_token(_ScannerT::_S_token_opt); 00244 00245 for (long __i = 0; __i < __min_rep; ++__i) 00246 __e._M_append(__r._M_clone()); 00247 00248 if (__infi) 00249 { 00250 auto __tmp = __r._M_clone(); 00251 _StateSeqT __s(*_M_nfa, 00252 _M_nfa->_M_insert_repeat(_S_invalid_state_id, 00253 __tmp._M_start, __neg)); 00254 __tmp._M_append(__s); 00255 __e._M_append(__s); 00256 } 00257 else 00258 { 00259 if (__n < 0) 00260 __throw_regex_error(regex_constants::error_badbrace); 00261 auto __end = _M_nfa->_M_insert_dummy(); 00262 // _M_alt is the "match more" branch, and _M_next is the 00263 // "match less" one. Switch _M_alt and _M_next of all created 00264 // nodes. This is a hack but IMO works well. 00265 std::stack<_StateIdT> __stack; 00266 for (long __i = 0; __i < __n; ++__i) 00267 { 00268 auto __tmp = __r._M_clone(); 00269 auto __alt = _M_nfa->_M_insert_repeat(__tmp._M_start, 00270 __end, __neg); 00271 __stack.push(__alt); 00272 __e._M_append(_StateSeqT(*_M_nfa, __alt, __tmp._M_end)); 00273 } 00274 __e._M_append(__end); 00275 while (!__stack.empty()) 00276 { 00277 auto& __tmp = (*_M_nfa)[__stack.top()]; 00278 __stack.pop(); 00279 std::swap(__tmp._M_next, __tmp._M_alt); 00280 } 00281 } 00282 _M_stack.push(__e); 00283 } 00284 else 00285 return false; 00286 return true; 00287 } 00288 00289 #define __INSERT_REGEX_MATCHER(__func, args...)\ 00290 do\ 00291 if (!(_M_flags & regex_constants::icase))\ 00292 if (!(_M_flags & regex_constants::collate))\ 00293 __func<false, false>(args);\ 00294 else\ 00295 __func<false, true>(args);\ 00296 else\ 00297 if (!(_M_flags & regex_constants::collate))\ 00298 __func<true, false>(args);\ 00299 else\ 00300 __func<true, true>(args);\ 00301 while (false) 00302 00303 template<typename _TraitsT> 00304 bool 00305 _Compiler<_TraitsT>:: 00306 _M_atom() 00307 { 00308 if (_M_match_token(_ScannerT::_S_token_anychar)) 00309 { 00310 if (!(_M_flags & regex_constants::ECMAScript)) 00311 __INSERT_REGEX_MATCHER(_M_insert_any_matcher_posix); 00312 else 00313 __INSERT_REGEX_MATCHER(_M_insert_any_matcher_ecma); 00314 } 00315 else if (_M_try_char()) 00316 __INSERT_REGEX_MATCHER(_M_insert_char_matcher); 00317 else if (_M_match_token(_ScannerT::_S_token_backref)) 00318 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa-> 00319 _M_insert_backref(_M_cur_int_value(10)))); 00320 else if (_M_match_token(_ScannerT::_S_token_quoted_class)) 00321 __INSERT_REGEX_MATCHER(_M_insert_character_class_matcher); 00322 else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin)) 00323 { 00324 _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_dummy()); 00325 this->_M_disjunction(); 00326 if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 00327 __throw_regex_error(regex_constants::error_paren); 00328 __r._M_append(_M_pop()); 00329 _M_stack.push(__r); 00330 } 00331 else if (_M_match_token(_ScannerT::_S_token_subexpr_begin)) 00332 { 00333 _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_subexpr_begin()); 00334 this->_M_disjunction(); 00335 if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 00336 __throw_regex_error(regex_constants::error_paren); 00337 __r._M_append(_M_pop()); 00338 __r._M_append(_M_nfa->_M_insert_subexpr_end()); 00339 _M_stack.push(__r); 00340 } 00341 else if (!_M_bracket_expression()) 00342 return false; 00343 return true; 00344 } 00345 00346 template<typename _TraitsT> 00347 bool 00348 _Compiler<_TraitsT>:: 00349 _M_bracket_expression() 00350 { 00351 bool __neg = 00352 _M_match_token(_ScannerT::_S_token_bracket_neg_begin); 00353 if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin))) 00354 return false; 00355 __INSERT_REGEX_MATCHER(_M_insert_bracket_matcher, __neg); 00356 return true; 00357 } 00358 #undef __INSERT_REGEX_MATCHER 00359 00360 template<typename _TraitsT> 00361 template<bool __icase, bool __collate> 00362 void 00363 _Compiler<_TraitsT>:: 00364 _M_insert_any_matcher_ecma() 00365 { 00366 _M_stack.push(_StateSeqT(*_M_nfa, 00367 _M_nfa->_M_insert_matcher 00368 (_AnyMatcher<_TraitsT, true, __icase, __collate> 00369 (_M_traits)))); 00370 } 00371 00372 template<typename _TraitsT> 00373 template<bool __icase, bool __collate> 00374 void 00375 _Compiler<_TraitsT>:: 00376 _M_insert_any_matcher_posix() 00377 { 00378 _M_stack.push(_StateSeqT(*_M_nfa, 00379 _M_nfa->_M_insert_matcher 00380 (_AnyMatcher<_TraitsT, false, __icase, __collate> 00381 (_M_traits)))); 00382 } 00383 00384 template<typename _TraitsT> 00385 template<bool __icase, bool __collate> 00386 void 00387 _Compiler<_TraitsT>:: 00388 _M_insert_char_matcher() 00389 { 00390 _M_stack.push(_StateSeqT(*_M_nfa, 00391 _M_nfa->_M_insert_matcher 00392 (_CharMatcher<_TraitsT, __icase, __collate> 00393 (_M_value[0], _M_traits)))); 00394 } 00395 00396 template<typename _TraitsT> 00397 template<bool __icase, bool __collate> 00398 void 00399 _Compiler<_TraitsT>:: 00400 _M_insert_character_class_matcher() 00401 { 00402 _GLIBCXX_DEBUG_ASSERT(_M_value.size() == 1); 00403 _BracketMatcher<_TraitsT, __icase, __collate> __matcher 00404 (_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits); 00405 __matcher._M_add_character_class(_M_value, false); 00406 __matcher._M_ready(); 00407 _M_stack.push(_StateSeqT(*_M_nfa, 00408 _M_nfa->_M_insert_matcher(std::move(__matcher)))); 00409 } 00410 00411 template<typename _TraitsT> 00412 template<bool __icase, bool __collate> 00413 void 00414 _Compiler<_TraitsT>:: 00415 _M_insert_bracket_matcher(bool __neg) 00416 { 00417 _BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits); 00418 pair<bool, _CharT> __last_char; // Optional<_CharT> 00419 __last_char.first = false; 00420 if (!(_M_flags & regex_constants::ECMAScript)) 00421 if (_M_try_char()) 00422 { 00423 __matcher._M_add_char(_M_value[0]); 00424 __last_char.first = true; 00425 __last_char.second = _M_value[0]; 00426 } 00427 while (_M_expression_term(__last_char, __matcher)); 00428 __matcher._M_ready(); 00429 _M_stack.push(_StateSeqT( 00430 *_M_nfa, 00431 _M_nfa->_M_insert_matcher(std::move(__matcher)))); 00432 } 00433 00434 template<typename _TraitsT> 00435 template<bool __icase, bool __collate> 00436 bool 00437 _Compiler<_TraitsT>:: 00438 _M_expression_term(pair<bool, _CharT>& __last_char, 00439 _BracketMatcher<_TraitsT, __icase, __collate>& __matcher) 00440 { 00441 if (_M_match_token(_ScannerT::_S_token_bracket_end)) 00442 return false; 00443 00444 if (_M_match_token(_ScannerT::_S_token_collsymbol)) 00445 { 00446 auto __symbol = __matcher._M_add_collate_element(_M_value); 00447 if (__symbol.size() == 1) 00448 { 00449 __last_char.first = true; 00450 __last_char.second = __symbol[0]; 00451 } 00452 } 00453 else if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) 00454 __matcher._M_add_equivalence_class(_M_value); 00455 else if (_M_match_token(_ScannerT::_S_token_char_class_name)) 00456 __matcher._M_add_character_class(_M_value, false); 00457 // POSIX doesn't allow '-' as a start-range char (say [a-z--0]), 00458 // except when the '-' is the first or last character in the bracket 00459 // expression ([--0]). ECMAScript treats all '-' after a range as a 00460 // normal character. Also see above, where _M_expression_term gets called. 00461 // 00462 // As a result, POSIX rejects [-----], but ECMAScript doesn't. 00463 // Boost (1.57.0) always uses POSIX style even in its ECMAScript syntax. 00464 // Clang (3.5) always uses ECMAScript style even in its POSIX syntax. 00465 // 00466 // It turns out that no one reads BNFs ;) 00467 else if (_M_try_char()) 00468 { 00469 if (!__last_char.first) 00470 { 00471 __matcher._M_add_char(_M_value[0]); 00472 if (_M_value[0] == '-' 00473 && !(_M_flags & regex_constants::ECMAScript)) 00474 { 00475 if (_M_match_token(_ScannerT::_S_token_bracket_end)) 00476 return false; 00477 __throw_regex_error(regex_constants::error_range); 00478 } 00479 __last_char.first = true; 00480 __last_char.second = _M_value[0]; 00481 } 00482 else 00483 { 00484 if (_M_value[0] == '-') 00485 { 00486 if (_M_try_char()) 00487 { 00488 __matcher._M_make_range(__last_char.second , _M_value[0]); 00489 __last_char.first = false; 00490 } 00491 else 00492 { 00493 if (_M_scanner._M_get_token() 00494 != _ScannerT::_S_token_bracket_end) 00495 __throw_regex_error(regex_constants::error_range); 00496 __matcher._M_add_char(_M_value[0]); 00497 } 00498 } 00499 else 00500 { 00501 __matcher._M_add_char(_M_value[0]); 00502 __last_char.second = _M_value[0]; 00503 } 00504 } 00505 } 00506 else if (_M_match_token(_ScannerT::_S_token_quoted_class)) 00507 __matcher._M_add_character_class(_M_value, 00508 _M_ctype.is(_CtypeT::upper, 00509 _M_value[0])); 00510 else 00511 __throw_regex_error(regex_constants::error_brack); 00512 00513 return true; 00514 } 00515 00516 template<typename _TraitsT> 00517 bool 00518 _Compiler<_TraitsT>:: 00519 _M_try_char() 00520 { 00521 bool __is_char = false; 00522 if (_M_match_token(_ScannerT::_S_token_oct_num)) 00523 { 00524 __is_char = true; 00525 _M_value.assign(1, _M_cur_int_value(8)); 00526 } 00527 else if (_M_match_token(_ScannerT::_S_token_hex_num)) 00528 { 00529 __is_char = true; 00530 _M_value.assign(1, _M_cur_int_value(16)); 00531 } 00532 else if (_M_match_token(_ScannerT::_S_token_ord_char)) 00533 __is_char = true; 00534 return __is_char; 00535 } 00536 00537 template<typename _TraitsT> 00538 bool 00539 _Compiler<_TraitsT>:: 00540 _M_match_token(_TokenT token) 00541 { 00542 if (token == _M_scanner._M_get_token()) 00543 { 00544 _M_value = _M_scanner._M_get_value(); 00545 _M_scanner._M_advance(); 00546 return true; 00547 } 00548 return false; 00549 } 00550 00551 template<typename _TraitsT> 00552 int 00553 _Compiler<_TraitsT>:: 00554 _M_cur_int_value(int __radix) 00555 { 00556 long __v = 0; 00557 for (typename _StringT::size_type __i = 0; 00558 __i < _M_value.length(); ++__i) 00559 __v =__v * __radix + _M_traits.value(_M_value[__i], __radix); 00560 return __v; 00561 } 00562 00563 template<typename _TraitsT, bool __icase, bool __collate> 00564 bool 00565 _BracketMatcher<_TraitsT, __icase, __collate>:: 00566 _M_apply(_CharT __ch, false_type) const 00567 { 00568 bool __ret = std::binary_search(_M_char_set.begin(), _M_char_set.end(), 00569 _M_translator._M_translate(__ch)); 00570 if (!__ret) 00571 { 00572 auto __s = _M_translator._M_transform(__ch); 00573 for (auto& __it : _M_range_set) 00574 if (__it.first <= __s && __s <= __it.second) 00575 { 00576 __ret = true; 00577 break; 00578 } 00579 if (_M_traits.isctype(__ch, _M_class_set)) 00580 __ret = true; 00581 else if (std::find(_M_equiv_set.begin(), _M_equiv_set.end(), 00582 _M_traits.transform_primary(&__ch, &__ch+1)) 00583 != _M_equiv_set.end()) 00584 __ret = true; 00585 else 00586 { 00587 for (auto& __it : _M_neg_class_set) 00588 if (!_M_traits.isctype(__ch, __it)) 00589 { 00590 __ret = true; 00591 break; 00592 } 00593 } 00594 } 00595 if (_M_is_non_matching) 00596 return !__ret; 00597 else 00598 return __ret; 00599 } 00600 00601 _GLIBCXX_END_NAMESPACE_VERSION 00602 } // namespace __detail 00603 } // namespace