libstdc++
|
00001 // class template regex -*- C++ -*- 00002 00003 // Copyright (C) 2013-2015 Free Software Foundation, Inc. 00004 // 00005 // This file is part of the GNU ISO C++ Library. This library is free 00006 // software; you can redistribute it and/or modify it under the 00007 // terms of the GNU General Public License as published by the 00008 // Free Software Foundation; either version 3, or (at your option) 00009 // any later version. 00010 00011 // This library is distributed in the hope that it will be useful, 00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 // GNU General Public License for more details. 00015 00016 // Under Section 7 of GPL version 3, you are granted additional 00017 // permissions described in the GCC Runtime Library Exception, version 00018 // 3.1, as published by the Free Software Foundation. 00019 00020 // You should have received a copy of the GNU General Public License and 00021 // a copy of the GCC Runtime Library Exception along with this program; 00022 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 00023 // <http://www.gnu.org/licenses/>. 00024 00025 /** 00026 * @file bits/regex_executor.tcc 00027 * This is an internal header file, included by other library headers. 00028 * Do not attempt to use it directly. @headername{regex} 00029 */ 00030 00031 namespace std _GLIBCXX_VISIBILITY(default) 00032 { 00033 namespace __detail 00034 { 00035 _GLIBCXX_BEGIN_NAMESPACE_VERSION 00036 00037 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00038 bool __dfs_mode> 00039 bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00040 _M_search() 00041 { 00042 if (_M_search_from_first()) 00043 return true; 00044 if (_M_flags & regex_constants::match_continuous) 00045 return false; 00046 _M_flags |= regex_constants::match_prev_avail; 00047 while (_M_begin != _M_end) 00048 { 00049 ++_M_begin; 00050 if (_M_search_from_first()) 00051 return true; 00052 } 00053 return false; 00054 } 00055 00056 // The _M_main function operates in different modes, DFS mode or BFS mode, 00057 // indicated by template parameter __dfs_mode, and dispatches to one of the 00058 // _M_main_dispatch overloads. 00059 // 00060 // ------------------------------------------------------------ 00061 // 00062 // DFS mode: 00063 // 00064 // It applies a Depth-First-Search (aka backtracking) on given NFA and input 00065 // string. 00066 // At the very beginning the executor stands in the start state, then it 00067 // tries every possible state transition in current state recursively. Some 00068 // state transitions consume input string, say, a single-char-matcher or a 00069 // back-reference matcher; some don't, like assertion or other anchor nodes. 00070 // When the input is exhausted and/or the current state is an accepting 00071 // state, the whole executor returns true. 00072 // 00073 // TODO: This approach is exponentially slow for certain input. 00074 // Try to compile the NFA to a DFA. 00075 // 00076 // Time complexity: \Omega(match_length), O(2^(_M_nfa.size())) 00077 // Space complexity: \theta(match_results.size() + match_length) 00078 // 00079 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00080 bool __dfs_mode> 00081 bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00082 _M_main_dispatch(_Match_mode __match_mode, __dfs) 00083 { 00084 _M_has_sol = false; 00085 *_M_states._M_get_sol_pos() = _BiIter(); 00086 _M_cur_results = _M_results; 00087 _M_dfs(__match_mode, _M_states._M_start); 00088 return _M_has_sol; 00089 } 00090 00091 // ------------------------------------------------------------ 00092 // 00093 // BFS mode: 00094 // 00095 // Russ Cox's article (http://swtch.com/~rsc/regexp/regexp1.html) 00096 // explained this algorithm clearly. 00097 // 00098 // It first computes epsilon closure (states that can be achieved without 00099 // consuming characters) for every state that's still matching, 00100 // using the same DFS algorithm, but doesn't re-enter states (using 00101 // _M_states._M_visited to check), nor follow _S_opcode_match. 00102 // 00103 // Then apply DFS using every _S_opcode_match (in _M_states._M_match_queue) 00104 // as the start state. 00105 // 00106 // It significantly reduces potential duplicate states, so has a better 00107 // upper bound; but it requires more overhead. 00108 // 00109 // Time complexity: \Omega(match_length * match_results.size()) 00110 // O(match_length * _M_nfa.size() * match_results.size()) 00111 // Space complexity: \Omega(_M_nfa.size() + match_results.size()) 00112 // O(_M_nfa.size() * match_results.size()) 00113 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00114 bool __dfs_mode> 00115 bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00116 _M_main_dispatch(_Match_mode __match_mode, __bfs) 00117 { 00118 _M_states._M_queue(_M_states._M_start, _M_results); 00119 bool __ret = false; 00120 while (1) 00121 { 00122 _M_has_sol = false; 00123 if (_M_states._M_match_queue.empty()) 00124 break; 00125 std::fill_n(_M_states._M_visited_states.get(), _M_nfa.size(), false); 00126 auto __old_queue = std::move(_M_states._M_match_queue); 00127 for (auto& __task : __old_queue) 00128 { 00129 _M_cur_results = std::move(__task.second); 00130 _M_dfs(__match_mode, __task.first); 00131 } 00132 if (__match_mode == _Match_mode::_Prefix) 00133 __ret |= _M_has_sol; 00134 if (_M_current == _M_end) 00135 break; 00136 ++_M_current; 00137 } 00138 if (__match_mode == _Match_mode::_Exact) 00139 __ret = _M_has_sol; 00140 _M_states._M_match_queue.clear(); 00141 return __ret; 00142 } 00143 00144 // Return whether now match the given sub-NFA. 00145 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00146 bool __dfs_mode> 00147 bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00148 _M_lookahead(_State<_TraitsT> __state) 00149 { 00150 // Backreferences may refer to captured content. 00151 // We may want to make this faster by not copying, 00152 // but let's not be clever prematurely. 00153 _ResultsVec __what(_M_cur_results); 00154 _Executor __sub(_M_current, _M_end, __what, _M_re, _M_flags); 00155 __sub._M_states._M_start = __state._M_alt; 00156 if (__sub._M_search_from_first()) 00157 { 00158 for (size_t __i = 0; __i < __what.size(); __i++) 00159 if (__what[__i].matched) 00160 _M_cur_results[__i] = __what[__i]; 00161 return true; 00162 } 00163 return false; 00164 } 00165 00166 // __rep_count records how many times (__rep_count.second) 00167 // this node is visited under certain input iterator 00168 // (__rep_count.first). This prevent the executor from entering 00169 // infinite loop by refusing to continue when it's already been 00170 // visited more than twice. It's `twice` instead of `once` because 00171 // we need to spare one more time for potential group capture. 00172 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00173 bool __dfs_mode> 00174 void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00175 _M_rep_once_more(_Match_mode __match_mode, _StateIdT __i) 00176 { 00177 const auto& __state = _M_nfa[__i]; 00178 auto& __rep_count = _M_rep_count[__i]; 00179 if (__rep_count.second == 0 || __rep_count.first != _M_current) 00180 { 00181 auto __back = __rep_count; 00182 __rep_count.first = _M_current; 00183 __rep_count.second = 1; 00184 _M_dfs(__match_mode, __state._M_alt); 00185 __rep_count = __back; 00186 } 00187 else 00188 { 00189 if (__rep_count.second < 2) 00190 { 00191 __rep_count.second++; 00192 _M_dfs(__match_mode, __state._M_alt); 00193 __rep_count.second--; 00194 } 00195 } 00196 }; 00197 00198 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00199 bool __dfs_mode> 00200 void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00201 _M_dfs(_Match_mode __match_mode, _StateIdT __i) 00202 { 00203 if (_M_states._M_visited(__i)) 00204 return; 00205 00206 const auto& __state = _M_nfa[__i]; 00207 // Every change on _M_cur_results and _M_current will be rolled back after 00208 // finishing the recursion step. 00209 switch (__state._M_opcode) 00210 { 00211 // _M_alt branch is "match once more", while _M_next is "get me out 00212 // of this quantifier". Executing _M_next first or _M_alt first don't 00213 // mean the same thing, and we need to choose the correct order under 00214 // given greedy mode. 00215 case _S_opcode_repeat: 00216 { 00217 // Greedy. 00218 if (!__state._M_neg) 00219 { 00220 _M_rep_once_more(__match_mode, __i); 00221 // If it's DFS executor and already accepted, we're done. 00222 if (!__dfs_mode || !_M_has_sol) 00223 _M_dfs(__match_mode, __state._M_next); 00224 } 00225 else // Non-greedy mode 00226 { 00227 if (__dfs_mode) 00228 { 00229 // vice-versa. 00230 _M_dfs(__match_mode, __state._M_next); 00231 if (!_M_has_sol) 00232 _M_rep_once_more(__match_mode, __i); 00233 } 00234 else 00235 { 00236 // DON'T attempt anything, because there's already another 00237 // state with higher priority accepted. This state cannot 00238 // be better by attempting its next node. 00239 if (!_M_has_sol) 00240 { 00241 _M_dfs(__match_mode, __state._M_next); 00242 // DON'T attempt anything if it's already accepted. An 00243 // accepted state *must* be better than a solution that 00244 // matches a non-greedy quantifier one more time. 00245 if (!_M_has_sol) 00246 _M_rep_once_more(__match_mode, __i); 00247 } 00248 } 00249 } 00250 } 00251 break; 00252 case _S_opcode_subexpr_begin: 00253 { 00254 auto& __res = _M_cur_results[__state._M_subexpr]; 00255 auto __back = __res.first; 00256 __res.first = _M_current; 00257 _M_dfs(__match_mode, __state._M_next); 00258 __res.first = __back; 00259 } 00260 break; 00261 case _S_opcode_subexpr_end: 00262 { 00263 auto& __res = _M_cur_results[__state._M_subexpr]; 00264 auto __back = __res; 00265 __res.second = _M_current; 00266 __res.matched = true; 00267 _M_dfs(__match_mode, __state._M_next); 00268 __res = __back; 00269 } 00270 break; 00271 case _S_opcode_line_begin_assertion: 00272 if (_M_at_begin()) 00273 _M_dfs(__match_mode, __state._M_next); 00274 break; 00275 case _S_opcode_line_end_assertion: 00276 if (_M_at_end()) 00277 _M_dfs(__match_mode, __state._M_next); 00278 break; 00279 case _S_opcode_word_boundary: 00280 if (_M_word_boundary() == !__state._M_neg) 00281 _M_dfs(__match_mode, __state._M_next); 00282 break; 00283 // Here __state._M_alt offers a single start node for a sub-NFA. 00284 // We recursively invoke our algorithm to match the sub-NFA. 00285 case _S_opcode_subexpr_lookahead: 00286 if (_M_lookahead(__state) == !__state._M_neg) 00287 _M_dfs(__match_mode, __state._M_next); 00288 break; 00289 case _S_opcode_match: 00290 if (_M_current == _M_end) 00291 break; 00292 if (__dfs_mode) 00293 { 00294 if (__state._M_matches(*_M_current)) 00295 { 00296 ++_M_current; 00297 _M_dfs(__match_mode, __state._M_next); 00298 --_M_current; 00299 } 00300 } 00301 else 00302 if (__state._M_matches(*_M_current)) 00303 _M_states._M_queue(__state._M_next, _M_cur_results); 00304 break; 00305 // First fetch the matched result from _M_cur_results as __submatch; 00306 // then compare it with 00307 // (_M_current, _M_current + (__submatch.second - __submatch.first)). 00308 // If matched, keep going; else just return and try another state. 00309 case _S_opcode_backref: 00310 { 00311 _GLIBCXX_DEBUG_ASSERT(__dfs_mode); 00312 auto& __submatch = _M_cur_results[__state._M_backref_index]; 00313 if (!__submatch.matched) 00314 break; 00315 auto __last = _M_current; 00316 for (auto __tmp = __submatch.first; 00317 __last != _M_end && __tmp != __submatch.second; 00318 ++__tmp) 00319 ++__last; 00320 if (_M_re._M_automaton->_M_traits.transform(__submatch.first, 00321 __submatch.second) 00322 == _M_re._M_automaton->_M_traits.transform(_M_current, __last)) 00323 { 00324 if (__last != _M_current) 00325 { 00326 auto __backup = _M_current; 00327 _M_current = __last; 00328 _M_dfs(__match_mode, __state._M_next); 00329 _M_current = __backup; 00330 } 00331 else 00332 _M_dfs(__match_mode, __state._M_next); 00333 } 00334 } 00335 break; 00336 case _S_opcode_accept: 00337 if (__dfs_mode) 00338 { 00339 _GLIBCXX_DEBUG_ASSERT(!_M_has_sol); 00340 if (__match_mode == _Match_mode::_Exact) 00341 _M_has_sol = _M_current == _M_end; 00342 else 00343 _M_has_sol = true; 00344 if (_M_current == _M_begin 00345 && (_M_flags & regex_constants::match_not_null)) 00346 _M_has_sol = false; 00347 if (_M_has_sol) 00348 { 00349 if (_M_nfa._M_flags & regex_constants::ECMAScript) 00350 _M_results = _M_cur_results; 00351 else // POSIX 00352 { 00353 _GLIBCXX_DEBUG_ASSERT(_M_states._M_get_sol_pos()); 00354 // Here's POSIX's logic: match the longest one. However 00355 // we never know which one (lhs or rhs of "|") is longer 00356 // unless we try both of them and compare the results. 00357 // The member variable _M_sol_pos records the end 00358 // position of the last successful match. It's better 00359 // to be larger, because POSIX regex is always greedy. 00360 // TODO: This could be slow. 00361 if (*_M_states._M_get_sol_pos() == _BiIter() 00362 || std::distance(_M_begin, 00363 *_M_states._M_get_sol_pos()) 00364 < std::distance(_M_begin, _M_current)) 00365 { 00366 *_M_states._M_get_sol_pos() = _M_current; 00367 _M_results = _M_cur_results; 00368 } 00369 } 00370 } 00371 } 00372 else 00373 { 00374 if (_M_current == _M_begin 00375 && (_M_flags & regex_constants::match_not_null)) 00376 break; 00377 if (__match_mode == _Match_mode::_Prefix || _M_current == _M_end) 00378 if (!_M_has_sol) 00379 { 00380 _M_has_sol = true; 00381 _M_results = _M_cur_results; 00382 } 00383 } 00384 break; 00385 case _S_opcode_alternative: 00386 if (_M_nfa._M_flags & regex_constants::ECMAScript) 00387 { 00388 // TODO: Let BFS support ECMAScript's alternative operation. 00389 _GLIBCXX_DEBUG_ASSERT(__dfs_mode); 00390 _M_dfs(__match_mode, __state._M_alt); 00391 // Pick lhs if it matches. Only try rhs if it doesn't. 00392 if (!_M_has_sol) 00393 _M_dfs(__match_mode, __state._M_next); 00394 } 00395 else 00396 { 00397 // Try both and compare the result. 00398 // See "case _S_opcode_accept:" handling above. 00399 _M_dfs(__match_mode, __state._M_alt); 00400 auto __has_sol = _M_has_sol; 00401 _M_has_sol = false; 00402 _M_dfs(__match_mode, __state._M_next); 00403 _M_has_sol |= __has_sol; 00404 } 00405 break; 00406 default: 00407 _GLIBCXX_DEBUG_ASSERT(false); 00408 } 00409 } 00410 00411 // Return whether now is at some word boundary. 00412 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00413 bool __dfs_mode> 00414 bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00415 _M_word_boundary() const 00416 { 00417 bool __left_is_word = false; 00418 if (_M_current != _M_begin 00419 || (_M_flags & regex_constants::match_prev_avail)) 00420 { 00421 auto __prev = _M_current; 00422 if (_M_is_word(*std::prev(__prev))) 00423 __left_is_word = true; 00424 } 00425 bool __right_is_word = 00426 _M_current != _M_end && _M_is_word(*_M_current); 00427 00428 if (__left_is_word == __right_is_word) 00429 return false; 00430 if (__left_is_word && !(_M_flags & regex_constants::match_not_eow)) 00431 return true; 00432 if (__right_is_word && !(_M_flags & regex_constants::match_not_bow)) 00433 return true; 00434 return false; 00435 } 00436 00437 _GLIBCXX_END_NAMESPACE_VERSION 00438 } // namespace __detail 00439 } // namespace