libstdc++
|
00001 // class template regex -*- C++ -*- 00002 00003 // Copyright (C) 2013-2014 Free Software Foundation, Inc. 00004 // 00005 // This file is part of the GNU ISO C++ Library. This library is free 00006 // software; you can redistribute it and/or modify it under the 00007 // terms of the GNU General Public License as published by the 00008 // Free Software Foundation; either version 3, or (at your option) 00009 // any later version. 00010 00011 // This library is distributed in the hope that it will be useful, 00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 // GNU General Public License for more details. 00015 00016 // Under Section 7 of GPL version 3, you are granted additional 00017 // permissions described in the GCC Runtime Library Exception, version 00018 // 3.1, as published by the Free Software Foundation. 00019 00020 // You should have received a copy of the GNU General Public License and 00021 // a copy of the GCC Runtime Library Exception along with this program; 00022 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 00023 // <http://www.gnu.org/licenses/>. 00024 00025 /** 00026 * @file bits/regex_scanner.tcc 00027 * This is an internal header file, included by other library headers. 00028 * Do not attempt to use it directly. @headername{regex} 00029 */ 00030 00031 // FIXME make comments doxygen format. 00032 00033 // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep 00034 // and awk 00035 // 1) grep is basic except '\n' is treated as '|' 00036 // 2) egrep is extended except '\n' is treated as '|' 00037 // 3) awk is extended except special escaping rules, and there's no 00038 // back-reference. 00039 // 00040 // References: 00041 // 00042 // ECMAScript: ECMA-262 15.10 00043 // 00044 // basic, extended: 00045 // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html 00046 // 00047 // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html 00048 00049 namespace std _GLIBCXX_VISIBILITY(default) 00050 { 00051 namespace __detail 00052 { 00053 _GLIBCXX_BEGIN_NAMESPACE_VERSION 00054 00055 template<typename _CharT> 00056 _Scanner<_CharT>:: 00057 _Scanner(typename _Scanner::_IterT __begin, 00058 typename _Scanner::_IterT __end, 00059 _FlagT __flags, std::locale __loc) 00060 : _ScannerBase(__flags), 00061 _M_current(__begin), _M_end(__end), 00062 _M_ctype(std::use_facet<_CtypeT>(__loc)), 00063 _M_eat_escape(_M_is_ecma() 00064 ? &_Scanner::_M_eat_escape_ecma 00065 : &_Scanner::_M_eat_escape_posix) 00066 { _M_advance(); } 00067 00068 template<typename _CharT> 00069 void 00070 _Scanner<_CharT>:: 00071 _M_advance() 00072 { 00073 if (_M_current == _M_end) 00074 { 00075 _M_token = _S_token_eof; 00076 return; 00077 } 00078 00079 if (_M_state == _S_state_normal) 00080 _M_scan_normal(); 00081 else if (_M_state == _S_state_in_bracket) 00082 _M_scan_in_bracket(); 00083 else if (_M_state == _S_state_in_brace) 00084 _M_scan_in_brace(); 00085 else 00086 _GLIBCXX_DEBUG_ASSERT(false); 00087 } 00088 00089 // Differences between styles: 00090 // 1) "\(", "\)", "\{" in basic. It's not escaping. 00091 // 2) "(?:", "(?=", "(?!" in ECMAScript. 00092 template<typename _CharT> 00093 void 00094 _Scanner<_CharT>:: 00095 _M_scan_normal() 00096 { 00097 auto __c = *_M_current++; 00098 const char* __pos; 00099 00100 if (std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')) == nullptr) 00101 { 00102 _M_token = _S_token_ord_char; 00103 _M_value.assign(1, __c); 00104 return; 00105 } 00106 if (__c == '\\') 00107 { 00108 if (_M_current == _M_end) 00109 __throw_regex_error(regex_constants::error_escape); 00110 00111 if (!_M_is_basic() 00112 || (*_M_current != '(' 00113 && *_M_current != ')' 00114 && *_M_current != '{')) 00115 { 00116 (this->*_M_eat_escape)(); 00117 return; 00118 } 00119 __c = *_M_current++; 00120 } 00121 if (__c == '(') 00122 { 00123 if (_M_is_ecma() && *_M_current == '?') 00124 { 00125 if (++_M_current == _M_end) 00126 __throw_regex_error(regex_constants::error_paren); 00127 00128 if (*_M_current == ':') 00129 { 00130 ++_M_current; 00131 _M_token = _S_token_subexpr_no_group_begin; 00132 } 00133 else if (*_M_current == '=') 00134 { 00135 ++_M_current; 00136 _M_token = _S_token_subexpr_lookahead_begin; 00137 _M_value.assign(1, 'p'); 00138 } 00139 else if (*_M_current == '!') 00140 { 00141 ++_M_current; 00142 _M_token = _S_token_subexpr_lookahead_begin; 00143 _M_value.assign(1, 'n'); 00144 } 00145 else 00146 __throw_regex_error(regex_constants::error_paren); 00147 } 00148 else if (_M_flags & regex_constants::nosubs) 00149 _M_token = _S_token_subexpr_no_group_begin; 00150 else 00151 _M_token = _S_token_subexpr_begin; 00152 } 00153 else if (__c == ')') 00154 _M_token = _S_token_subexpr_end; 00155 else if (__c == '[') 00156 { 00157 _M_state = _S_state_in_bracket; 00158 _M_at_bracket_start = true; 00159 if (_M_current != _M_end && *_M_current == '^') 00160 { 00161 _M_token = _S_token_bracket_neg_begin; 00162 ++_M_current; 00163 } 00164 else 00165 _M_token = _S_token_bracket_begin; 00166 } 00167 else if (__c == '{') 00168 { 00169 _M_state = _S_state_in_brace; 00170 _M_token = _S_token_interval_begin; 00171 } 00172 else if (((__pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'))) 00173 != nullptr 00174 && *__pos != '\0' 00175 && __c != ']' 00176 && __c != '}') 00177 || (_M_is_grep() && __c == '\n')) 00178 { 00179 auto __it = _M_token_tbl; 00180 auto __narrowc = _M_ctype.narrow(__c, '\0'); 00181 for (; __it->first != '\0'; ++__it) 00182 if (__it->first == __narrowc) 00183 { 00184 _M_token = __it->second; 00185 return; 00186 } 00187 _GLIBCXX_DEBUG_ASSERT(false); 00188 } 00189 else 00190 { 00191 _M_token = _S_token_ord_char; 00192 _M_value.assign(1, __c); 00193 } 00194 } 00195 00196 // Differences between styles: 00197 // 1) different semantics of "[]" and "[^]". 00198 // 2) Escaping in bracket expr. 00199 template<typename _CharT> 00200 void 00201 _Scanner<_CharT>:: 00202 _M_scan_in_bracket() 00203 { 00204 if (_M_current == _M_end) 00205 __throw_regex_error(regex_constants::error_brack); 00206 00207 auto __c = *_M_current++; 00208 00209 if (__c == '[') 00210 { 00211 if (_M_current == _M_end) 00212 __throw_regex_error(regex_constants::error_brack); 00213 00214 if (*_M_current == '.') 00215 { 00216 _M_token = _S_token_collsymbol; 00217 _M_eat_class(*_M_current++); 00218 } 00219 else if (*_M_current == ':') 00220 { 00221 _M_token = _S_token_char_class_name; 00222 _M_eat_class(*_M_current++); 00223 } 00224 else if (*_M_current == '=') 00225 { 00226 _M_token = _S_token_equiv_class_name; 00227 _M_eat_class(*_M_current++); 00228 } 00229 else 00230 { 00231 _M_token = _S_token_ord_char; 00232 _M_value.assign(1, __c); 00233 } 00234 } 00235 // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted 00236 // literally. So "[]]" or "[^]]" is valid regex. See the testcases 00237 // `*/empty_range.cc`. 00238 else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start)) 00239 { 00240 _M_token = _S_token_bracket_end; 00241 _M_state = _S_state_normal; 00242 } 00243 // ECMAScirpt and awk permmits escaping in bracket. 00244 else if (__c == '\\' && (_M_is_ecma() || _M_is_awk())) 00245 (this->*_M_eat_escape)(); 00246 else 00247 { 00248 _M_token = _S_token_ord_char; 00249 _M_value.assign(1, __c); 00250 } 00251 _M_at_bracket_start = false; 00252 } 00253 00254 // Differences between styles: 00255 // 1) "\}" in basic style. 00256 template<typename _CharT> 00257 void 00258 _Scanner<_CharT>:: 00259 _M_scan_in_brace() 00260 { 00261 if (_M_current == _M_end) 00262 __throw_regex_error(regex_constants::error_brace); 00263 00264 auto __c = *_M_current++; 00265 00266 if (_M_ctype.is(_CtypeT::digit, __c)) 00267 { 00268 _M_token = _S_token_dup_count; 00269 _M_value.assign(1, __c); 00270 while (_M_current != _M_end 00271 && _M_ctype.is(_CtypeT::digit, *_M_current)) 00272 _M_value += *_M_current++; 00273 } 00274 else if (__c == ',') 00275 _M_token = _S_token_comma; 00276 // basic use \}. 00277 else if (_M_is_basic()) 00278 { 00279 if (__c == '\\' && _M_current != _M_end && *_M_current == '}') 00280 { 00281 _M_state = _S_state_normal; 00282 _M_token = _S_token_interval_end; 00283 ++_M_current; 00284 } 00285 else 00286 __throw_regex_error(regex_constants::error_badbrace); 00287 } 00288 else if (__c == '}') 00289 { 00290 _M_state = _S_state_normal; 00291 _M_token = _S_token_interval_end; 00292 } 00293 else 00294 __throw_regex_error(regex_constants::error_badbrace); 00295 } 00296 00297 template<typename _CharT> 00298 void 00299 _Scanner<_CharT>:: 00300 _M_eat_escape_ecma() 00301 { 00302 if (_M_current == _M_end) 00303 __throw_regex_error(regex_constants::error_escape); 00304 00305 auto __c = *_M_current++; 00306 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0')); 00307 00308 if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket)) 00309 { 00310 _M_token = _S_token_ord_char; 00311 _M_value.assign(1, *__pos); 00312 } 00313 else if (__c == 'b') 00314 { 00315 _M_token = _S_token_word_bound; 00316 _M_value.assign(1, 'p'); 00317 } 00318 else if (__c == 'B') 00319 { 00320 _M_token = _S_token_word_bound; 00321 _M_value.assign(1, 'n'); 00322 } 00323 // N3376 28.13 00324 else if (__c == 'd' 00325 || __c == 'D' 00326 || __c == 's' 00327 || __c == 'S' 00328 || __c == 'w' 00329 || __c == 'W') 00330 { 00331 _M_token = _S_token_quoted_class; 00332 _M_value.assign(1, __c); 00333 } 00334 else if (__c == 'c') 00335 { 00336 if (_M_current == _M_end) 00337 __throw_regex_error(regex_constants::error_escape); 00338 _M_token = _S_token_ord_char; 00339 _M_value.assign(1, *_M_current++); 00340 } 00341 else if (__c == 'x' || __c == 'u') 00342 { 00343 _M_value.erase(); 00344 for (int i = 0; i < (__c == 'x' ? 2 : 4); i++) 00345 { 00346 if (_M_current == _M_end 00347 || !_M_ctype.is(_CtypeT::xdigit, *_M_current)) 00348 __throw_regex_error(regex_constants::error_escape); 00349 _M_value += *_M_current++; 00350 } 00351 _M_token = _S_token_hex_num; 00352 } 00353 // ECMAScript recongnizes multi-digit back-references. 00354 else if (_M_ctype.is(_CtypeT::digit, __c)) 00355 { 00356 _M_value.assign(1, __c); 00357 while (_M_current != _M_end 00358 && _M_ctype.is(_CtypeT::digit, *_M_current)) 00359 _M_value += *_M_current++; 00360 _M_token = _S_token_backref; 00361 } 00362 else 00363 { 00364 _M_token = _S_token_ord_char; 00365 _M_value.assign(1, __c); 00366 } 00367 } 00368 00369 // Differences between styles: 00370 // 1) Extended doesn't support backref, but basic does. 00371 template<typename _CharT> 00372 void 00373 _Scanner<_CharT>:: 00374 _M_eat_escape_posix() 00375 { 00376 if (_M_current == _M_end) 00377 __throw_regex_error(regex_constants::error_escape); 00378 00379 auto __c = *_M_current; 00380 auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')); 00381 00382 if (__pos != nullptr && *__pos != '\0') 00383 { 00384 _M_token = _S_token_ord_char; 00385 _M_value.assign(1, __c); 00386 } 00387 // We MUST judge awk before handling backrefs. There's no backref in awk. 00388 else if (_M_is_awk()) 00389 { 00390 _M_eat_escape_awk(); 00391 return; 00392 } 00393 else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0') 00394 { 00395 _M_token = _S_token_backref; 00396 _M_value.assign(1, __c); 00397 } 00398 else 00399 { 00400 #ifdef __STRICT_ANSI__ 00401 __throw_regex_error(regex_constants::error_escape); 00402 #else 00403 _M_token = _S_token_ord_char; 00404 _M_value.assign(1, __c); 00405 #endif 00406 } 00407 ++_M_current; 00408 } 00409 00410 template<typename _CharT> 00411 void 00412 _Scanner<_CharT>:: 00413 _M_eat_escape_awk() 00414 { 00415 auto __c = *_M_current++; 00416 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0')); 00417 00418 if (__pos != nullptr) 00419 { 00420 _M_token = _S_token_ord_char; 00421 _M_value.assign(1, *__pos); 00422 } 00423 // \ddd for oct representation 00424 else if (_M_ctype.is(_CtypeT::digit, __c) 00425 && __c != '8' 00426 && __c != '9') 00427 { 00428 _M_value.assign(1, __c); 00429 for (int __i = 0; 00430 __i < 2 00431 && _M_current != _M_end 00432 && _M_ctype.is(_CtypeT::digit, *_M_current) 00433 && *_M_current != '8' 00434 && *_M_current != '9'; 00435 __i++) 00436 _M_value += *_M_current++; 00437 _M_token = _S_token_oct_num; 00438 return; 00439 } 00440 else 00441 __throw_regex_error(regex_constants::error_escape); 00442 } 00443 00444 // Eats a character class or throwns an exception. 00445 // __ch cound be ':', '.' or '=', _M_current is the char after ']' when 00446 // returning. 00447 template<typename _CharT> 00448 void 00449 _Scanner<_CharT>:: 00450 _M_eat_class(char __ch) 00451 { 00452 for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;) 00453 _M_value += *_M_current++; 00454 if (_M_current == _M_end 00455 || *_M_current++ != __ch 00456 || _M_current == _M_end // skip __ch 00457 || *_M_current++ != ']') // skip ']' 00458 { 00459 if (__ch == ':') 00460 __throw_regex_error(regex_constants::error_ctype); 00461 else 00462 __throw_regex_error(regex_constants::error_collate); 00463 } 00464 } 00465 00466 #ifdef _GLIBCXX_DEBUG 00467 template<typename _CharT> 00468 std::ostream& 00469 _Scanner<_CharT>:: 00470 _M_print(std::ostream& ostr) 00471 { 00472 switch (_M_token) 00473 { 00474 case _S_token_anychar: 00475 ostr << "any-character\n"; 00476 break; 00477 case _S_token_backref: 00478 ostr << "backref\n"; 00479 break; 00480 case _S_token_bracket_begin: 00481 ostr << "bracket-begin\n"; 00482 break; 00483 case _S_token_bracket_neg_begin: 00484 ostr << "bracket-neg-begin\n"; 00485 break; 00486 case _S_token_bracket_end: 00487 ostr << "bracket-end\n"; 00488 break; 00489 case _S_token_char_class_name: 00490 ostr << "char-class-name \"" << _M_value << "\"\n"; 00491 break; 00492 case _S_token_closure0: 00493 ostr << "closure0\n"; 00494 break; 00495 case _S_token_closure1: 00496 ostr << "closure1\n"; 00497 break; 00498 case _S_token_collsymbol: 00499 ostr << "collsymbol \"" << _M_value << "\"\n"; 00500 break; 00501 case _S_token_comma: 00502 ostr << "comma\n"; 00503 break; 00504 case _S_token_dup_count: 00505 ostr << "dup count: " << _M_value << "\n"; 00506 break; 00507 case _S_token_eof: 00508 ostr << "EOF\n"; 00509 break; 00510 case _S_token_equiv_class_name: 00511 ostr << "equiv-class-name \"" << _M_value << "\"\n"; 00512 break; 00513 case _S_token_interval_begin: 00514 ostr << "interval begin\n"; 00515 break; 00516 case _S_token_interval_end: 00517 ostr << "interval end\n"; 00518 break; 00519 case _S_token_line_begin: 00520 ostr << "line begin\n"; 00521 break; 00522 case _S_token_line_end: 00523 ostr << "line end\n"; 00524 break; 00525 case _S_token_opt: 00526 ostr << "opt\n"; 00527 break; 00528 case _S_token_or: 00529 ostr << "or\n"; 00530 break; 00531 case _S_token_ord_char: 00532 ostr << "ordinary character: \"" << _M_value << "\"\n"; 00533 break; 00534 case _S_token_subexpr_begin: 00535 ostr << "subexpr begin\n"; 00536 break; 00537 case _S_token_subexpr_no_group_begin: 00538 ostr << "no grouping subexpr begin\n"; 00539 break; 00540 case _S_token_subexpr_lookahead_begin: 00541 ostr << "lookahead subexpr begin\n"; 00542 break; 00543 case _S_token_subexpr_end: 00544 ostr << "subexpr end\n"; 00545 break; 00546 case _S_token_unknown: 00547 ostr << "-- unknown token --\n"; 00548 break; 00549 case _S_token_oct_num: 00550 ostr << "oct number " << _M_value << "\n"; 00551 break; 00552 case _S_token_hex_num: 00553 ostr << "hex number " << _M_value << "\n"; 00554 break; 00555 case _S_token_quoted_class: 00556 ostr << "quoted class " << "\\" << _M_value << "\n"; 00557 break; 00558 default: 00559 _GLIBCXX_DEBUG_ASSERT(false); 00560 } 00561 return ostr; 00562 } 00563 #endif 00564 00565 _GLIBCXX_END_NAMESPACE_VERSION 00566 } // namespace __detail 00567 } // namespace