libstdc++
|
// class template regex -*- C++ -*-

// Copyright (C) 2013-2015 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.

/**
 * @file bits/regex_scanner.tcc
 * This is an internal header file, included by other library headers.
 * Do not attempt to use it directly. @headername{regex}
 */

// FIXME make comments doxygen format.

// N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep
// and awk
// 1) grep is basic except '\n' is treated as '|'
// 2) egrep is extended except '\n' is treated as '|'
// 3) awk is extended except special escaping rules, and there's no
// back-reference.
00039 // 00040 // References: 00041 // 00042 // ECMAScript: ECMA-262 15.10 00043 // 00044 // basic, extended: 00045 // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html 00046 // 00047 // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html 00048 00049 namespace std _GLIBCXX_VISIBILITY(default) 00050 { 00051 namespace __detail 00052 { 00053 _GLIBCXX_BEGIN_NAMESPACE_VERSION 00054 00055 template<typename _CharT> 00056 _Scanner<_CharT>:: 00057 _Scanner(typename _Scanner::_IterT __begin, 00058 typename _Scanner::_IterT __end, 00059 _FlagT __flags, std::locale __loc) 00060 : _ScannerBase(__flags), 00061 _M_current(__begin), _M_end(__end), 00062 _M_ctype(std::use_facet<_CtypeT>(__loc)), 00063 _M_eat_escape(_M_is_ecma() 00064 ? &_Scanner::_M_eat_escape_ecma 00065 : &_Scanner::_M_eat_escape_posix) 00066 { _M_advance(); } 00067 00068 template<typename _CharT> 00069 void 00070 _Scanner<_CharT>:: 00071 _M_advance() 00072 { 00073 if (_M_current == _M_end) 00074 { 00075 _M_token = _S_token_eof; 00076 return; 00077 } 00078 00079 if (_M_state == _S_state_normal) 00080 _M_scan_normal(); 00081 else if (_M_state == _S_state_in_bracket) 00082 _M_scan_in_bracket(); 00083 else if (_M_state == _S_state_in_brace) 00084 _M_scan_in_brace(); 00085 else 00086 { 00087 _GLIBCXX_DEBUG_ASSERT(false); 00088 } 00089 } 00090 00091 // Differences between styles: 00092 // 1) "\(", "\)", "\{" in basic. It's not escaping. 00093 // 2) "(?:", "(?=", "(?!" in ECMAScript. 
00094 template<typename _CharT> 00095 void 00096 _Scanner<_CharT>:: 00097 _M_scan_normal() 00098 { 00099 auto __c = *_M_current++; 00100 00101 if (std::strchr(_M_spec_char, _M_ctype.narrow(__c, ' ')) == nullptr) 00102 { 00103 _M_token = _S_token_ord_char; 00104 _M_value.assign(1, __c); 00105 return; 00106 } 00107 if (__c == '\\') 00108 { 00109 if (_M_current == _M_end) 00110 __throw_regex_error(regex_constants::error_escape); 00111 00112 if (!_M_is_basic() 00113 || (*_M_current != '(' 00114 && *_M_current != ')' 00115 && *_M_current != '{')) 00116 { 00117 (this->*_M_eat_escape)(); 00118 return; 00119 } 00120 __c = *_M_current++; 00121 } 00122 if (__c == '(') 00123 { 00124 if (_M_is_ecma() && *_M_current == '?') 00125 { 00126 if (++_M_current == _M_end) 00127 __throw_regex_error(regex_constants::error_paren); 00128 00129 if (*_M_current == ':') 00130 { 00131 ++_M_current; 00132 _M_token = _S_token_subexpr_no_group_begin; 00133 } 00134 else if (*_M_current == '=') 00135 { 00136 ++_M_current; 00137 _M_token = _S_token_subexpr_lookahead_begin; 00138 _M_value.assign(1, 'p'); 00139 } 00140 else if (*_M_current == '!') 00141 { 00142 ++_M_current; 00143 _M_token = _S_token_subexpr_lookahead_begin; 00144 _M_value.assign(1, 'n'); 00145 } 00146 else 00147 __throw_regex_error(regex_constants::error_paren); 00148 } 00149 else if (_M_flags & regex_constants::nosubs) 00150 _M_token = _S_token_subexpr_no_group_begin; 00151 else 00152 _M_token = _S_token_subexpr_begin; 00153 } 00154 else if (__c == ')') 00155 _M_token = _S_token_subexpr_end; 00156 else if (__c == '[') 00157 { 00158 _M_state = _S_state_in_bracket; 00159 _M_at_bracket_start = true; 00160 if (_M_current != _M_end && *_M_current == '^') 00161 { 00162 _M_token = _S_token_bracket_neg_begin; 00163 ++_M_current; 00164 } 00165 else 00166 _M_token = _S_token_bracket_begin; 00167 } 00168 else if (__c == '{') 00169 { 00170 _M_state = _S_state_in_brace; 00171 _M_token = _S_token_interval_begin; 00172 } 00173 else if (__c != ']' 
&& __c != '}') 00174 { 00175 auto __it = _M_token_tbl; 00176 auto __narrowc = _M_ctype.narrow(__c, '\0'); 00177 for (; __it->first != '\0'; ++__it) 00178 if (__it->first == __narrowc) 00179 { 00180 _M_token = __it->second; 00181 return; 00182 } 00183 _GLIBCXX_DEBUG_ASSERT(false); 00184 } 00185 else 00186 { 00187 _M_token = _S_token_ord_char; 00188 _M_value.assign(1, __c); 00189 } 00190 } 00191 00192 // Differences between styles: 00193 // 1) different semantics of "[]" and "[^]". 00194 // 2) Escaping in bracket expr. 00195 template<typename _CharT> 00196 void 00197 _Scanner<_CharT>:: 00198 _M_scan_in_bracket() 00199 { 00200 if (_M_current == _M_end) 00201 __throw_regex_error(regex_constants::error_brack); 00202 00203 auto __c = *_M_current++; 00204 00205 if (__c == '[') 00206 { 00207 if (_M_current == _M_end) 00208 __throw_regex_error(regex_constants::error_brack); 00209 00210 if (*_M_current == '.') 00211 { 00212 _M_token = _S_token_collsymbol; 00213 _M_eat_class(*_M_current++); 00214 } 00215 else if (*_M_current == ':') 00216 { 00217 _M_token = _S_token_char_class_name; 00218 _M_eat_class(*_M_current++); 00219 } 00220 else if (*_M_current == '=') 00221 { 00222 _M_token = _S_token_equiv_class_name; 00223 _M_eat_class(*_M_current++); 00224 } 00225 else 00226 { 00227 _M_token = _S_token_ord_char; 00228 _M_value.assign(1, __c); 00229 } 00230 } 00231 // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted 00232 // literally. So "[]]" and "[^]]" are valid regexes. See the testcases 00233 // `*/empty_range.cc`. 00234 else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start)) 00235 { 00236 _M_token = _S_token_bracket_end; 00237 _M_state = _S_state_normal; 00238 } 00239 // ECMAScript and awk permits escaping in bracket. 
00240 else if (__c == '\\' && (_M_is_ecma() || _M_is_awk())) 00241 (this->*_M_eat_escape)(); 00242 else 00243 { 00244 _M_token = _S_token_ord_char; 00245 _M_value.assign(1, __c); 00246 } 00247 _M_at_bracket_start = false; 00248 } 00249 00250 // Differences between styles: 00251 // 1) "\}" in basic style. 00252 template<typename _CharT> 00253 void 00254 _Scanner<_CharT>:: 00255 _M_scan_in_brace() 00256 { 00257 if (_M_current == _M_end) 00258 __throw_regex_error(regex_constants::error_brace); 00259 00260 auto __c = *_M_current++; 00261 00262 if (_M_ctype.is(_CtypeT::digit, __c)) 00263 { 00264 _M_token = _S_token_dup_count; 00265 _M_value.assign(1, __c); 00266 while (_M_current != _M_end 00267 && _M_ctype.is(_CtypeT::digit, *_M_current)) 00268 _M_value += *_M_current++; 00269 } 00270 else if (__c == ',') 00271 _M_token = _S_token_comma; 00272 // basic use \}. 00273 else if (_M_is_basic()) 00274 { 00275 if (__c == '\\' && _M_current != _M_end && *_M_current == '}') 00276 { 00277 _M_state = _S_state_normal; 00278 _M_token = _S_token_interval_end; 00279 ++_M_current; 00280 } 00281 else 00282 __throw_regex_error(regex_constants::error_badbrace); 00283 } 00284 else if (__c == '}') 00285 { 00286 _M_state = _S_state_normal; 00287 _M_token = _S_token_interval_end; 00288 } 00289 else 00290 __throw_regex_error(regex_constants::error_badbrace); 00291 } 00292 00293 template<typename _CharT> 00294 void 00295 _Scanner<_CharT>:: 00296 _M_eat_escape_ecma() 00297 { 00298 if (_M_current == _M_end) 00299 __throw_regex_error(regex_constants::error_escape); 00300 00301 auto __c = *_M_current++; 00302 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0')); 00303 00304 if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket)) 00305 { 00306 _M_token = _S_token_ord_char; 00307 _M_value.assign(1, *__pos); 00308 } 00309 else if (__c == 'b') 00310 { 00311 _M_token = _S_token_word_bound; 00312 _M_value.assign(1, 'p'); 00313 } 00314 else if (__c == 'B') 00315 { 00316 _M_token = 
_S_token_word_bound; 00317 _M_value.assign(1, 'n'); 00318 } 00319 // N3376 28.13 00320 else if (__c == 'd' 00321 || __c == 'D' 00322 || __c == 's' 00323 || __c == 'S' 00324 || __c == 'w' 00325 || __c == 'W') 00326 { 00327 _M_token = _S_token_quoted_class; 00328 _M_value.assign(1, __c); 00329 } 00330 else if (__c == 'c') 00331 { 00332 if (_M_current == _M_end) 00333 __throw_regex_error(regex_constants::error_escape); 00334 _M_token = _S_token_ord_char; 00335 _M_value.assign(1, *_M_current++); 00336 } 00337 else if (__c == 'x' || __c == 'u') 00338 { 00339 _M_value.erase(); 00340 for (int __i = 0; __i < (__c == 'x' ? 2 : 4); __i++) 00341 { 00342 if (_M_current == _M_end 00343 || !_M_ctype.is(_CtypeT::xdigit, *_M_current)) 00344 __throw_regex_error(regex_constants::error_escape); 00345 _M_value += *_M_current++; 00346 } 00347 _M_token = _S_token_hex_num; 00348 } 00349 // ECMAScript recognizes multi-digit back-references. 00350 else if (_M_ctype.is(_CtypeT::digit, __c)) 00351 { 00352 _M_value.assign(1, __c); 00353 while (_M_current != _M_end 00354 && _M_ctype.is(_CtypeT::digit, *_M_current)) 00355 _M_value += *_M_current++; 00356 _M_token = _S_token_backref; 00357 } 00358 else 00359 { 00360 _M_token = _S_token_ord_char; 00361 _M_value.assign(1, __c); 00362 } 00363 } 00364 00365 // Differences between styles: 00366 // 1) Extended doesn't support backref, but basic does. 00367 template<typename _CharT> 00368 void 00369 _Scanner<_CharT>:: 00370 _M_eat_escape_posix() 00371 { 00372 if (_M_current == _M_end) 00373 __throw_regex_error(regex_constants::error_escape); 00374 00375 auto __c = *_M_current; 00376 auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')); 00377 00378 if (__pos != nullptr && *__pos != '\0') 00379 { 00380 _M_token = _S_token_ord_char; 00381 _M_value.assign(1, __c); 00382 } 00383 // We MUST judge awk before handling backrefs. There's no backref in awk. 
00384 else if (_M_is_awk()) 00385 { 00386 _M_eat_escape_awk(); 00387 return; 00388 } 00389 else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0') 00390 { 00391 _M_token = _S_token_backref; 00392 _M_value.assign(1, __c); 00393 } 00394 else 00395 { 00396 #ifdef __STRICT_ANSI__ 00397 // POSIX says it is undefined to escape ordinary characters 00398 __throw_regex_error(regex_constants::error_escape); 00399 #else 00400 _M_token = _S_token_ord_char; 00401 _M_value.assign(1, __c); 00402 #endif 00403 } 00404 ++_M_current; 00405 } 00406 00407 template<typename _CharT> 00408 void 00409 _Scanner<_CharT>:: 00410 _M_eat_escape_awk() 00411 { 00412 auto __c = *_M_current++; 00413 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0')); 00414 00415 if (__pos != nullptr) 00416 { 00417 _M_token = _S_token_ord_char; 00418 _M_value.assign(1, *__pos); 00419 } 00420 // \ddd for oct representation 00421 else if (_M_ctype.is(_CtypeT::digit, __c) 00422 && __c != '8' 00423 && __c != '9') 00424 { 00425 _M_value.assign(1, __c); 00426 for (int __i = 0; 00427 __i < 2 00428 && _M_current != _M_end 00429 && _M_ctype.is(_CtypeT::digit, *_M_current) 00430 && *_M_current != '8' 00431 && *_M_current != '9'; 00432 __i++) 00433 _M_value += *_M_current++; 00434 _M_token = _S_token_oct_num; 00435 return; 00436 } 00437 else 00438 __throw_regex_error(regex_constants::error_escape); 00439 } 00440 00441 // Eats a character class or throws an exception. 00442 // __ch could be ':', '.' or '=', _M_current is the char after ']' when 00443 // returning. 
00444 template<typename _CharT> 00445 void 00446 _Scanner<_CharT>:: 00447 _M_eat_class(char __ch) 00448 { 00449 for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;) 00450 _M_value += *_M_current++; 00451 if (_M_current == _M_end 00452 || *_M_current++ != __ch 00453 || _M_current == _M_end // skip __ch 00454 || *_M_current++ != ']') // skip ']' 00455 { 00456 if (__ch == ':') 00457 __throw_regex_error(regex_constants::error_ctype); 00458 else 00459 __throw_regex_error(regex_constants::error_collate); 00460 } 00461 } 00462 00463 #ifdef _GLIBCXX_DEBUG 00464 template<typename _CharT> 00465 std::ostream& 00466 _Scanner<_CharT>:: 00467 _M_print(std::ostream& ostr) 00468 { 00469 switch (_M_token) 00470 { 00471 case _S_token_anychar: 00472 ostr << "any-character\n"; 00473 break; 00474 case _S_token_backref: 00475 ostr << "backref\n"; 00476 break; 00477 case _S_token_bracket_begin: 00478 ostr << "bracket-begin\n"; 00479 break; 00480 case _S_token_bracket_neg_begin: 00481 ostr << "bracket-neg-begin\n"; 00482 break; 00483 case _S_token_bracket_end: 00484 ostr << "bracket-end\n"; 00485 break; 00486 case _S_token_char_class_name: 00487 ostr << "char-class-name \"" << _M_value << "\"\n"; 00488 break; 00489 case _S_token_closure0: 00490 ostr << "closure0\n"; 00491 break; 00492 case _S_token_closure1: 00493 ostr << "closure1\n"; 00494 break; 00495 case _S_token_collsymbol: 00496 ostr << "collsymbol \"" << _M_value << "\"\n"; 00497 break; 00498 case _S_token_comma: 00499 ostr << "comma\n"; 00500 break; 00501 case _S_token_dup_count: 00502 ostr << "dup count: " << _M_value << "\n"; 00503 break; 00504 case _S_token_eof: 00505 ostr << "EOF\n"; 00506 break; 00507 case _S_token_equiv_class_name: 00508 ostr << "equiv-class-name \"" << _M_value << "\"\n"; 00509 break; 00510 case _S_token_interval_begin: 00511 ostr << "interval begin\n"; 00512 break; 00513 case _S_token_interval_end: 00514 ostr << "interval end\n"; 00515 break; 00516 case _S_token_line_begin: 00517 ostr << 
"line begin\n"; 00518 break; 00519 case _S_token_line_end: 00520 ostr << "line end\n"; 00521 break; 00522 case _S_token_opt: 00523 ostr << "opt\n"; 00524 break; 00525 case _S_token_or: 00526 ostr << "or\n"; 00527 break; 00528 case _S_token_ord_char: 00529 ostr << "ordinary character: \"" << _M_value << "\"\n"; 00530 break; 00531 case _S_token_subexpr_begin: 00532 ostr << "subexpr begin\n"; 00533 break; 00534 case _S_token_subexpr_no_group_begin: 00535 ostr << "no grouping subexpr begin\n"; 00536 break; 00537 case _S_token_subexpr_lookahead_begin: 00538 ostr << "lookahead subexpr begin\n"; 00539 break; 00540 case _S_token_subexpr_end: 00541 ostr << "subexpr end\n"; 00542 break; 00543 case _S_token_unknown: 00544 ostr << "-- unknown token --\n"; 00545 break; 00546 case _S_token_oct_num: 00547 ostr << "oct number " << _M_value << "\n"; 00548 break; 00549 case _S_token_hex_num: 00550 ostr << "hex number " << _M_value << "\n"; 00551 break; 00552 case _S_token_quoted_class: 00553 ostr << "quoted class " << "\\" << _M_value << "\n"; 00554 break; 00555 default: 00556 _GLIBCXX_DEBUG_ASSERT(false); 00557 } 00558 return ostr; 00559 } 00560 #endif 00561 00562 _GLIBCXX_END_NAMESPACE_VERSION 00563 } // namespace __detail 00564 } // namespace