libstdc++
regex_scanner.tcc
Go to the documentation of this file.
1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2013-2014 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  * @file bits/regex_scanner.tcc
27  * This is an internal header file, included by other library headers.
28  * Do not attempt to use it directly. @headername{regex}
29  */
30 
31 // FIXME make comments doxygen format.
32 
33 // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep
34 // and awk
35 // 1) grep is basic except '\n' is treated as '|'
36 // 2) egrep is extended except '\n' is treated as '|'
37 // 3) awk is extended except special escaping rules, and there's no
38 // back-reference.
39 //
40 // References:
41 //
42 // ECMAScript: ECMA-262 15.10
43 //
44 // basic, extended:
45 // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
46 //
47 // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html
48 
49 namespace std _GLIBCXX_VISIBILITY(default)
50 {
51 namespace __detail
52 {
53 _GLIBCXX_BEGIN_NAMESPACE_VERSION
54 
55  template<typename _CharT>
56  _Scanner<_CharT>::
57  _Scanner(typename _Scanner::_IterT __begin,
58  typename _Scanner::_IterT __end,
59  _FlagT __flags, std::locale __loc)
60  : _ScannerBase(__flags),
61  _M_current(__begin), _M_end(__end),
62  _M_ctype(std::use_facet<_CtypeT>(__loc)),
63  _M_eat_escape(_M_is_ecma()
64  ? &_Scanner::_M_eat_escape_ecma
65  : &_Scanner::_M_eat_escape_posix)
66  { _M_advance(); }
67 
68  template<typename _CharT>
69  void
70  _Scanner<_CharT>::
71  _M_advance()
72  {
73  if (_M_current == _M_end)
74  {
75  _M_token = _S_token_eof;
76  return;
77  }
78 
79  if (_M_state == _S_state_normal)
80  _M_scan_normal();
81  else if (_M_state == _S_state_in_bracket)
82  _M_scan_in_bracket();
83  else if (_M_state == _S_state_in_brace)
84  _M_scan_in_brace();
85  else
86  _GLIBCXX_DEBUG_ASSERT(false);
87  }
88 
89  // Differences between styles:
90  // 1) "\(", "\)", "\{" in basic. It's not escaping.
91  // 2) "(?:", "(?=", "(?!" in ECMAScript.
92  template<typename _CharT>
93  void
94  _Scanner<_CharT>::
95  _M_scan_normal()
96  {
97  auto __c = *_M_current++;
98  const char* __pos;
99 
100  if (std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')) == nullptr)
101  {
102  _M_token = _S_token_ord_char;
103  _M_value.assign(1, __c);
104  return;
105  }
106  if (__c == '\\')
107  {
108  if (_M_current == _M_end)
109  __throw_regex_error(regex_constants::error_escape);
110 
111  if (!_M_is_basic()
112  || (*_M_current != '('
113  && *_M_current != ')'
114  && *_M_current != '{'))
115  {
116  (this->*_M_eat_escape)();
117  return;
118  }
119  __c = *_M_current++;
120  }
121  if (__c == '(')
122  {
123  if (_M_is_ecma() && *_M_current == '?')
124  {
125  if (++_M_current == _M_end)
126  __throw_regex_error(regex_constants::error_paren);
127 
128  if (*_M_current == ':')
129  {
130  ++_M_current;
131  _M_token = _S_token_subexpr_no_group_begin;
132  }
133  else if (*_M_current == '=')
134  {
135  ++_M_current;
136  _M_token = _S_token_subexpr_lookahead_begin;
137  _M_value.assign(1, 'p');
138  }
139  else if (*_M_current == '!')
140  {
141  ++_M_current;
142  _M_token = _S_token_subexpr_lookahead_begin;
143  _M_value.assign(1, 'n');
144  }
145  else
146  __throw_regex_error(regex_constants::error_paren);
147  }
148  else if (_M_flags & regex_constants::nosubs)
149  _M_token = _S_token_subexpr_no_group_begin;
150  else
151  _M_token = _S_token_subexpr_begin;
152  }
153  else if (__c == ')')
154  _M_token = _S_token_subexpr_end;
155  else if (__c == '[')
156  {
157  _M_state = _S_state_in_bracket;
158  _M_at_bracket_start = true;
159  if (_M_current != _M_end && *_M_current == '^')
160  {
161  _M_token = _S_token_bracket_neg_begin;
162  ++_M_current;
163  }
164  else
165  _M_token = _S_token_bracket_begin;
166  }
167  else if (__c == '{')
168  {
169  _M_state = _S_state_in_brace;
170  _M_token = _S_token_interval_begin;
171  }
172  else if (((__pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')))
173  != nullptr
174  && *__pos != '\0'
175  && __c != ']'
176  && __c != '}')
177  || (_M_is_grep() && __c == '\n'))
178  {
179  auto __it = _M_token_tbl;
180  auto __narrowc = _M_ctype.narrow(__c, '\0');
181  for (; __it->first != '\0'; ++__it)
182  if (__it->first == __narrowc)
183  {
184  _M_token = __it->second;
185  return;
186  }
187  _GLIBCXX_DEBUG_ASSERT(false);
188  }
189  else
190  {
191  _M_token = _S_token_ord_char;
192  _M_value.assign(1, __c);
193  }
194  }
195 
196  // Differences between styles:
197  // 1) different semantics of "[]" and "[^]".
198  // 2) Escaping in bracket expr.
199  template<typename _CharT>
200  void
201  _Scanner<_CharT>::
202  _M_scan_in_bracket()
203  {
204  if (_M_current == _M_end)
205  __throw_regex_error(regex_constants::error_brack);
206 
207  auto __c = *_M_current++;
208 
209  if (__c == '[')
210  {
211  if (_M_current == _M_end)
212  __throw_regex_error(regex_constants::error_brack);
213 
214  if (*_M_current == '.')
215  {
216  _M_token = _S_token_collsymbol;
217  _M_eat_class(*_M_current++);
218  }
219  else if (*_M_current == ':')
220  {
221  _M_token = _S_token_char_class_name;
222  _M_eat_class(*_M_current++);
223  }
224  else if (*_M_current == '=')
225  {
226  _M_token = _S_token_equiv_class_name;
227  _M_eat_class(*_M_current++);
228  }
229  else
230  {
231  _M_token = _S_token_ord_char;
232  _M_value.assign(1, __c);
233  }
234  }
235  // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted
236  // literally. So "[]]" or "[^]]" is valid regex. See the testcases
237  // `*/empty_range.cc`.
238  else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start))
239  {
240  _M_token = _S_token_bracket_end;
241  _M_state = _S_state_normal;
242  }
243  // ECMAScirpt and awk permmits escaping in bracket.
244  else if (__c == '\\' && (_M_is_ecma() || _M_is_awk()))
245  (this->*_M_eat_escape)();
246  else
247  {
248  _M_token = _S_token_ord_char;
249  _M_value.assign(1, __c);
250  }
251  _M_at_bracket_start = false;
252  }
253 
254  // Differences between styles:
255  // 1) "\}" in basic style.
256  template<typename _CharT>
257  void
258  _Scanner<_CharT>::
259  _M_scan_in_brace()
260  {
261  if (_M_current == _M_end)
262  __throw_regex_error(regex_constants::error_brace);
263 
264  auto __c = *_M_current++;
265 
266  if (_M_ctype.is(_CtypeT::digit, __c))
267  {
268  _M_token = _S_token_dup_count;
269  _M_value.assign(1, __c);
270  while (_M_current != _M_end
271  && _M_ctype.is(_CtypeT::digit, *_M_current))
272  _M_value += *_M_current++;
273  }
274  else if (__c == ',')
275  _M_token = _S_token_comma;
276  // basic use \}.
277  else if (_M_is_basic())
278  {
279  if (__c == '\\' && _M_current != _M_end && *_M_current == '}')
280  {
281  _M_state = _S_state_normal;
282  _M_token = _S_token_interval_end;
283  ++_M_current;
284  }
285  else
286  __throw_regex_error(regex_constants::error_badbrace);
287  }
288  else if (__c == '}')
289  {
290  _M_state = _S_state_normal;
291  _M_token = _S_token_interval_end;
292  }
293  else
294  __throw_regex_error(regex_constants::error_badbrace);
295  }
296 
297  template<typename _CharT>
298  void
299  _Scanner<_CharT>::
300  _M_eat_escape_ecma()
301  {
302  if (_M_current == _M_end)
303  __throw_regex_error(regex_constants::error_escape);
304 
305  auto __c = *_M_current++;
306  auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
307 
308  if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket))
309  {
310  _M_token = _S_token_ord_char;
311  _M_value.assign(1, *__pos);
312  }
313  else if (__c == 'b')
314  {
315  _M_token = _S_token_word_bound;
316  _M_value.assign(1, 'p');
317  }
318  else if (__c == 'B')
319  {
320  _M_token = _S_token_word_bound;
321  _M_value.assign(1, 'n');
322  }
323  // N3376 28.13
324  else if (__c == 'd'
325  || __c == 'D'
326  || __c == 's'
327  || __c == 'S'
328  || __c == 'w'
329  || __c == 'W')
330  {
331  _M_token = _S_token_quoted_class;
332  _M_value.assign(1, __c);
333  }
334  else if (__c == 'c')
335  {
336  if (_M_current == _M_end)
337  __throw_regex_error(regex_constants::error_escape);
338  _M_token = _S_token_ord_char;
339  _M_value.assign(1, *_M_current++);
340  }
341  else if (__c == 'x' || __c == 'u')
342  {
343  _M_value.erase();
344  for (int i = 0; i < (__c == 'x' ? 2 : 4); i++)
345  {
346  if (_M_current == _M_end
347  || !_M_ctype.is(_CtypeT::xdigit, *_M_current))
348  __throw_regex_error(regex_constants::error_escape);
349  _M_value += *_M_current++;
350  }
351  _M_token = _S_token_hex_num;
352  }
353  // ECMAScript recongnizes multi-digit back-references.
354  else if (_M_ctype.is(_CtypeT::digit, __c))
355  {
356  _M_value.assign(1, __c);
357  while (_M_current != _M_end
358  && _M_ctype.is(_CtypeT::digit, *_M_current))
359  _M_value += *_M_current++;
360  _M_token = _S_token_backref;
361  }
362  else
363  {
364  _M_token = _S_token_ord_char;
365  _M_value.assign(1, __c);
366  }
367  }
368 
369  // Differences between styles:
370  // 1) Extended doesn't support backref, but basic does.
371  template<typename _CharT>
372  void
373  _Scanner<_CharT>::
374  _M_eat_escape_posix()
375  {
376  if (_M_current == _M_end)
377  __throw_regex_error(regex_constants::error_escape);
378 
379  auto __c = *_M_current;
380  auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'));
381 
382  if (__pos != nullptr && *__pos != '\0')
383  {
384  _M_token = _S_token_ord_char;
385  _M_value.assign(1, __c);
386  }
387  // We MUST judge awk before handling backrefs. There's no backref in awk.
388  else if (_M_is_awk())
389  {
390  _M_eat_escape_awk();
391  return;
392  }
393  else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0')
394  {
395  _M_token = _S_token_backref;
396  _M_value.assign(1, __c);
397  }
398  else
399  {
400 #ifdef __STRICT_ANSI__
401  __throw_regex_error(regex_constants::error_escape);
402 #else
403  _M_token = _S_token_ord_char;
404  _M_value.assign(1, __c);
405 #endif
406  }
407  ++_M_current;
408  }
409 
410  template<typename _CharT>
411  void
412  _Scanner<_CharT>::
413  _M_eat_escape_awk()
414  {
415  auto __c = *_M_current++;
416  auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
417 
418  if (__pos != nullptr)
419  {
420  _M_token = _S_token_ord_char;
421  _M_value.assign(1, *__pos);
422  }
423  // \ddd for oct representation
424  else if (_M_ctype.is(_CtypeT::digit, __c)
425  && __c != '8'
426  && __c != '9')
427  {
428  _M_value.assign(1, __c);
429  for (int __i = 0;
430  __i < 2
431  && _M_current != _M_end
432  && _M_ctype.is(_CtypeT::digit, *_M_current)
433  && *_M_current != '8'
434  && *_M_current != '9';
435  __i++)
436  _M_value += *_M_current++;
437  _M_token = _S_token_oct_num;
438  return;
439  }
440  else
441  __throw_regex_error(regex_constants::error_escape);
442  }
443 
444  // Eats a character class or throwns an exception.
445  // __ch cound be ':', '.' or '=', _M_current is the char after ']' when
446  // returning.
447  template<typename _CharT>
448  void
449  _Scanner<_CharT>::
450  _M_eat_class(char __ch)
451  {
452  for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;)
453  _M_value += *_M_current++;
454  if (_M_current == _M_end
455  || *_M_current++ != __ch
456  || _M_current == _M_end // skip __ch
457  || *_M_current++ != ']') // skip ']'
458  {
459  if (__ch == ':')
460  __throw_regex_error(regex_constants::error_ctype);
461  else
462  __throw_regex_error(regex_constants::error_collate);
463  }
464  }
465 
466 #ifdef _GLIBCXX_DEBUG
467  template<typename _CharT>
468  std::ostream&
469  _Scanner<_CharT>::
470  _M_print(std::ostream& ostr)
471  {
472  switch (_M_token)
473  {
474  case _S_token_anychar:
475  ostr << "any-character\n";
476  break;
477  case _S_token_backref:
478  ostr << "backref\n";
479  break;
480  case _S_token_bracket_begin:
481  ostr << "bracket-begin\n";
482  break;
483  case _S_token_bracket_neg_begin:
484  ostr << "bracket-neg-begin\n";
485  break;
486  case _S_token_bracket_end:
487  ostr << "bracket-end\n";
488  break;
489  case _S_token_char_class_name:
490  ostr << "char-class-name \"" << _M_value << "\"\n";
491  break;
492  case _S_token_closure0:
493  ostr << "closure0\n";
494  break;
495  case _S_token_closure1:
496  ostr << "closure1\n";
497  break;
498  case _S_token_collsymbol:
499  ostr << "collsymbol \"" << _M_value << "\"\n";
500  break;
501  case _S_token_comma:
502  ostr << "comma\n";
503  break;
504  case _S_token_dup_count:
505  ostr << "dup count: " << _M_value << "\n";
506  break;
507  case _S_token_eof:
508  ostr << "EOF\n";
509  break;
510  case _S_token_equiv_class_name:
511  ostr << "equiv-class-name \"" << _M_value << "\"\n";
512  break;
513  case _S_token_interval_begin:
514  ostr << "interval begin\n";
515  break;
516  case _S_token_interval_end:
517  ostr << "interval end\n";
518  break;
519  case _S_token_line_begin:
520  ostr << "line begin\n";
521  break;
522  case _S_token_line_end:
523  ostr << "line end\n";
524  break;
525  case _S_token_opt:
526  ostr << "opt\n";
527  break;
528  case _S_token_or:
529  ostr << "or\n";
530  break;
531  case _S_token_ord_char:
532  ostr << "ordinary character: \"" << _M_value << "\"\n";
533  break;
534  case _S_token_subexpr_begin:
535  ostr << "subexpr begin\n";
536  break;
537  case _S_token_subexpr_no_group_begin:
538  ostr << "no grouping subexpr begin\n";
539  break;
540  case _S_token_subexpr_lookahead_begin:
541  ostr << "lookahead subexpr begin\n";
542  break;
543  case _S_token_subexpr_end:
544  ostr << "subexpr end\n";
545  break;
546  case _S_token_unknown:
547  ostr << "-- unknown token --\n";
548  break;
549  case _S_token_oct_num:
550  ostr << "oct number " << _M_value << "\n";
551  break;
552  case _S_token_hex_num:
553  ostr << "hex number " << _M_value << "\n";
554  break;
555  case _S_token_quoted_class:
556  ostr << "quoted class " << "\\" << _M_value << "\n";
557  break;
558  default:
559  _GLIBCXX_DEBUG_ASSERT(false);
560  }
561  return ostr;
562  }
563 #endif
564 
565 _GLIBCXX_END_NAMESPACE_VERSION
566 } // namespace __detail
567 } // namespace
constexpr error_type error_escape(_S_error_escape)
const _Facet & use_facet(const locale &__loc)
Return a facet. use_facet looks for and returns a reference to a facet of type Facet where Facet is th...
constexpr error_type error_ctype(_S_error_ctype)
constexpr error_type error_collate(_S_error_collate)
constexpr error_type error_brack(_S_error_brack)
constexpr error_type error_brace(_S_error_brace)
constexpr error_type error_badbrace(_S_error_badbrace)
Container class for localization functionality. The locale class is first a class wrapper for C librar...
constexpr error_type error_paren(_S_error_paren)