libstdc++
regex_compiler.tcc
Go to the documentation of this file.
00001 // class template regex -*- C++ -*-
00002 
00003 // Copyright (C) 2013-2014 Free Software Foundation, Inc.
00004 //
00005 // This file is part of the GNU ISO C++ Library.  This library is free
00006 // software; you can redistribute it and/or modify it under the
00007 // terms of the GNU General Public License as published by the
00008 // Free Software Foundation; either version 3, or (at your option)
00009 // any later version.
00010 
00011 // This library is distributed in the hope that it will be useful,
00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 // GNU General Public License for more details.
00015 
00016 // Under Section 7 of GPL version 3, you are granted additional
00017 // permissions described in the GCC Runtime Library Exception, version
00018 // 3.1, as published by the Free Software Foundation.
00019 
00020 // You should have received a copy of the GNU General Public License and
00021 // a copy of the GCC Runtime Library Exception along with this program;
00022 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
00023 // <http://www.gnu.org/licenses/>.
00024 
00025 /**
00026  *  @file bits/regex_compiler.tcc
00027  *  This is an internal header file, included by other library headers.
00028  *  Do not attempt to use it directly. @headername{regex}
00029  */
00030 
00031 // FIXME make comments doxygen format.
00032 
00033 // This compiler refers to "Regular Expression Matching Can Be Simple And Fast"
00034 // (http://swtch.com/~rsc/regexp/regexp1.html"),
00035 // but doesn't strictly follow it.
00036 //
00037 // When compiling, states are *chained* instead of tree- or graph-constructed.
00038 // It's more like structured programs: there's if statement and loop statement.
00039 //
00040 // For alternative structure(say "a|b"), aka "if statement", two branchs should
00041 // be constructed. However, these two shall merge to an "end_tag" at the end of
00042 // this operator:
00043 //
00044 //                branch1
00045 //              /        \
00046 // => begin_tag            end_tag =>
00047 //              \        /
00048 //                branch2
00049 //
00050 // This is the difference between this implementation and that in Russ's
00051 // article.
00052 //
00053 // That's why we introduced dummy node here ------ "end_tag" is a dummy node.
00054 // All dummy node will be eliminated at the end of compiling process.
00055 
00056 namespace std _GLIBCXX_VISIBILITY(default)
00057 {
00058 namespace __detail
00059 {
00060 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00061 
00062   template<typename _TraitsT>
00063     _Compiler<_TraitsT>::
00064     _Compiler(_IterT __b, _IterT __e,
00065           const _TraitsT& __traits, _FlagT __flags)
00066     : _M_flags((__flags
00067         & (regex_constants::ECMAScript
00068            | regex_constants::basic
00069            | regex_constants::extended
00070            | regex_constants::grep
00071            | regex_constants::egrep
00072            | regex_constants::awk))
00073            ? __flags
00074            : __flags | regex_constants::ECMAScript),
00075     _M_traits(__traits),
00076     _M_ctype(std::use_facet<_CtypeT>(_M_traits.getloc())),
00077     _M_scanner(__b, __e, _M_flags, _M_traits.getloc()),
00078     _M_nfa(_M_flags)
00079     {
00080       _StateSeqT __r(_M_nfa, _M_nfa._M_start());
00081       __r._M_append(_M_nfa._M_insert_subexpr_begin());
00082       this->_M_disjunction();
00083       if (!_M_match_token(_ScannerT::_S_token_eof))
00084     __throw_regex_error(regex_constants::error_paren);
00085       __r._M_append(_M_pop());
00086       _GLIBCXX_DEBUG_ASSERT(_M_stack.empty());
00087       __r._M_append(_M_nfa._M_insert_subexpr_end());
00088       __r._M_append(_M_nfa._M_insert_accept());
00089       _M_nfa._M_eliminate_dummy();
00090     }
00091 
00092   template<typename _TraitsT>
00093     void
00094     _Compiler<_TraitsT>::
00095     _M_disjunction()
00096     {
00097       this->_M_alternative();
00098       while (_M_match_token(_ScannerT::_S_token_or))
00099     {
00100       _StateSeqT __alt1 = _M_pop();
00101       this->_M_alternative();
00102       _StateSeqT __alt2 = _M_pop();
00103       auto __end = _M_nfa._M_insert_dummy();
00104       __alt1._M_append(__end);
00105       __alt2._M_append(__end);
00106       _M_stack.push(_StateSeqT(_M_nfa,
00107                    _M_nfa._M_insert_alt(__alt1._M_start,
00108                             __alt2._M_start, false),
00109                    __end));
00110     }
00111     }
00112 
00113   template<typename _TraitsT>
00114     void
00115     _Compiler<_TraitsT>::
00116     _M_alternative()
00117     {
00118       if (this->_M_term())
00119     {
00120       _StateSeqT __re = _M_pop();
00121       this->_M_alternative();
00122       __re._M_append(_M_pop());
00123       _M_stack.push(__re);
00124     }
00125       else
00126     _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy()));
00127     }
00128 
00129   template<typename _TraitsT>
00130     bool
00131     _Compiler<_TraitsT>::
00132     _M_term()
00133     {
00134       if (this->_M_assertion())
00135     return true;
00136       if (this->_M_atom())
00137     {
00138       while (this->_M_quantifier());
00139       return true;
00140     }
00141       return false;
00142     }
00143 
00144   template<typename _TraitsT>
00145     bool
00146     _Compiler<_TraitsT>::
00147     _M_assertion()
00148     {
00149       if (_M_match_token(_ScannerT::_S_token_line_begin))
00150     _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_line_begin()));
00151       else if (_M_match_token(_ScannerT::_S_token_line_end))
00152     _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_line_end()));
00153       else if (_M_match_token(_ScannerT::_S_token_word_bound))
00154     // _M_value[0] == 'n' means it's negtive, say "not word boundary".
00155     _M_stack.push(_StateSeqT(_M_nfa, _M_nfa.
00156           _M_insert_word_bound(_M_value[0] == 'n')));
00157       else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin))
00158     {
00159       auto __neg = _M_value[0] == 'n';
00160       this->_M_disjunction();
00161       if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
00162         __throw_regex_error(regex_constants::error_paren);
00163       auto __tmp = _M_pop();
00164       __tmp._M_append(_M_nfa._M_insert_accept());
00165       _M_stack.push(
00166           _StateSeqT(
00167         _M_nfa,
00168         _M_nfa._M_insert_lookahead(__tmp._M_start, __neg)));
00169     }
00170       else
00171     return false;
00172       return true;
00173     }
00174 
00175   template<typename _TraitsT>
00176     bool
00177     _Compiler<_TraitsT>::
00178     _M_quantifier()
00179     {
00180       bool __neg = (_M_flags & regex_constants::ECMAScript);
00181       auto __init = [this, &__neg]()
00182     {
00183       if (_M_stack.empty())
00184         __throw_regex_error(regex_constants::error_badrepeat);
00185       __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
00186     };
00187       if (_M_match_token(_ScannerT::_S_token_closure0))
00188     {
00189       __init();
00190       auto __e = _M_pop();
00191       _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id,
00192                               __e._M_start, __neg));
00193       __e._M_append(__r);
00194       _M_stack.push(__r);
00195     }
00196       else if (_M_match_token(_ScannerT::_S_token_closure1))
00197     {
00198       __init();
00199       auto __e = _M_pop();
00200       __e._M_append(_M_nfa._M_insert_alt(_S_invalid_state_id, __e._M_start,
00201                          __neg));
00202       _M_stack.push(__e);
00203     }
00204       else if (_M_match_token(_ScannerT::_S_token_opt))
00205     {
00206       __init();
00207       auto __e = _M_pop();
00208       auto __end = _M_nfa._M_insert_dummy();
00209       _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id,
00210                               __e._M_start, __neg));
00211       __e._M_append(__end);
00212       __r._M_append(__end);
00213       _M_stack.push(__r);
00214     }
00215       else if (_M_match_token(_ScannerT::_S_token_interval_begin))
00216     {
00217       if (_M_stack.empty())
00218         __throw_regex_error(regex_constants::error_badrepeat);
00219       if (!_M_match_token(_ScannerT::_S_token_dup_count))
00220         __throw_regex_error(regex_constants::error_badbrace);
00221       _StateSeqT __r(_M_pop());
00222       _StateSeqT __e(_M_nfa, _M_nfa._M_insert_dummy());
00223       long __min_rep = _M_cur_int_value(10);
00224       bool __infi = false;
00225       long __n;
00226 
00227       // {3
00228       if (_M_match_token(_ScannerT::_S_token_comma))
00229         if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7}
00230           __n = _M_cur_int_value(10) - __min_rep;
00231         else
00232           __infi = true;
00233       else
00234         __n = 0;
00235       if (!_M_match_token(_ScannerT::_S_token_interval_end))
00236         __throw_regex_error(regex_constants::error_brace);
00237 
00238       __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
00239 
00240       for (long __i = 0; __i < __min_rep; ++__i)
00241         __e._M_append(__r._M_clone());
00242 
00243       if (__infi)
00244         {
00245           auto __tmp = __r._M_clone();
00246           _StateSeqT __s(_M_nfa,
00247                  _M_nfa._M_insert_alt(_S_invalid_state_id,
00248                           __tmp._M_start, __neg));
00249           __tmp._M_append(__s);
00250           __e._M_append(__s);
00251         }
00252       else
00253         {
00254           if (__n < 0)
00255         __throw_regex_error(regex_constants::error_badbrace);
00256           auto __end = _M_nfa._M_insert_dummy();
00257           // _M_alt is the "match more" branch, and _M_next is the
00258           // "match less" one. Switch _M_alt and _M_next of all created
00259           // nodes. This is a hacking but IMO works well.
00260           std::stack<_StateIdT> __stack;
00261           for (long __i = 0; __i < __n; ++__i)
00262         {
00263           auto __tmp = __r._M_clone();
00264           auto __alt = _M_nfa._M_insert_alt(__tmp._M_start,
00265                             __end, __neg);
00266           __stack.push(__alt);
00267           __e._M_append(_StateSeqT(_M_nfa, __alt, __tmp._M_end));
00268         }
00269           __e._M_append(__end);
00270           while (!__stack.empty())
00271         {
00272           auto& __tmp = _M_nfa[__stack.top()];
00273           __stack.pop();
00274           std::swap(__tmp._M_next, __tmp._M_alt);
00275         }
00276         }
00277       _M_stack.push(__e);
00278     }
00279       else
00280     return false;
00281       return true;
00282     }
00283 
00284 #define __INSERT_REGEX_MATCHER(__func, args...)\
00285     do\
00286       if (!(_M_flags & regex_constants::icase))\
00287         if (!(_M_flags & regex_constants::collate))\
00288           __func<false, false>(args);\
00289         else\
00290           __func<false, true>(args);\
00291       else\
00292         if (!(_M_flags & regex_constants::collate))\
00293           __func<true, false>(args);\
00294         else\
00295           __func<true, true>(args);\
00296     while (false)
00297 
00298   template<typename _TraitsT>
00299     bool
00300     _Compiler<_TraitsT>::
00301     _M_atom()
00302     {
00303       if (_M_match_token(_ScannerT::_S_token_anychar))
00304     {
00305       if (!(_M_flags & regex_constants::ECMAScript))
00306         __INSERT_REGEX_MATCHER(_M_insert_any_matcher_posix);
00307       else
00308         __INSERT_REGEX_MATCHER(_M_insert_any_matcher_ecma);
00309     }
00310       else if (_M_try_char())
00311     __INSERT_REGEX_MATCHER(_M_insert_char_matcher);
00312       else if (_M_match_token(_ScannerT::_S_token_backref))
00313     _M_stack.push(_StateSeqT(_M_nfa, _M_nfa.
00314                  _M_insert_backref(_M_cur_int_value(10))));
00315       else if (_M_match_token(_ScannerT::_S_token_quoted_class))
00316     __INSERT_REGEX_MATCHER(_M_insert_character_class_matcher);
00317       else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin))
00318     {
00319       _StateSeqT __r(_M_nfa, _M_nfa._M_insert_dummy());
00320       this->_M_disjunction();
00321       if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
00322         __throw_regex_error(regex_constants::error_paren);
00323       __r._M_append(_M_pop());
00324       _M_stack.push(__r);
00325     }
00326       else if (_M_match_token(_ScannerT::_S_token_subexpr_begin))
00327     {
00328       _StateSeqT __r(_M_nfa, _M_nfa._M_insert_subexpr_begin());
00329       this->_M_disjunction();
00330       if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
00331         __throw_regex_error(regex_constants::error_paren);
00332       __r._M_append(_M_pop());
00333       __r._M_append(_M_nfa._M_insert_subexpr_end());
00334       _M_stack.push(__r);
00335     }
00336       else if (!_M_bracket_expression())
00337     return false;
00338       return true;
00339     }
00340 
00341   template<typename _TraitsT>
00342     bool
00343     _Compiler<_TraitsT>::
00344     _M_bracket_expression()
00345     {
00346       bool __neg =
00347     _M_match_token(_ScannerT::_S_token_bracket_neg_begin);
00348       if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin)))
00349     return false;
00350       __INSERT_REGEX_MATCHER(_M_insert_bracket_matcher, __neg);
00351       return true;
00352     }
00353 #undef __INSERT_REGEX_MATCHER
00354 
00355   template<typename _TraitsT>
00356   template<bool __icase, bool __collate>
00357     void
00358     _Compiler<_TraitsT>::
00359     _M_insert_any_matcher_ecma()
00360     {
00361       _M_stack.push(_StateSeqT(_M_nfa,
00362     _M_nfa._M_insert_matcher
00363       (_AnyMatcher<_TraitsT, true, __icase, __collate>
00364         (_M_traits))));
00365     }
00366 
00367   template<typename _TraitsT>
00368   template<bool __icase, bool __collate>
00369     void
00370     _Compiler<_TraitsT>::
00371     _M_insert_any_matcher_posix()
00372     {
00373       _M_stack.push(_StateSeqT(_M_nfa,
00374     _M_nfa._M_insert_matcher
00375       (_AnyMatcher<_TraitsT, false, __icase, __collate>
00376         (_M_traits))));
00377     }
00378 
00379   template<typename _TraitsT>
00380   template<bool __icase, bool __collate>
00381     void
00382     _Compiler<_TraitsT>::
00383     _M_insert_char_matcher()
00384     {
00385       _M_stack.push(_StateSeqT(_M_nfa,
00386     _M_nfa._M_insert_matcher
00387       (_CharMatcher<_TraitsT, __icase, __collate>
00388         (_M_value[0], _M_traits))));
00389     }
00390 
00391   template<typename _TraitsT>
00392   template<bool __icase, bool __collate>
00393     void
00394     _Compiler<_TraitsT>::
00395     _M_insert_character_class_matcher()
00396     {
00397       _GLIBCXX_DEBUG_ASSERT(_M_value.size() == 1);
00398       _BracketMatcher<_TraitsT, __icase, __collate> __matcher
00399     (_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits);
00400       __matcher._M_add_character_class(_M_value, false);
00401       __matcher._M_ready();
00402       _M_stack.push(_StateSeqT(_M_nfa,
00403     _M_nfa._M_insert_matcher(std::move(__matcher))));
00404     }
00405 
00406   template<typename _TraitsT>
00407   template<bool __icase, bool __collate>
00408     void
00409     _Compiler<_TraitsT>::
00410     _M_insert_bracket_matcher(bool __neg)
00411     {
00412       _BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits);
00413       pair<bool, _CharT> __last_char; // Optional<_CharT>
00414       __last_char.first = false;
00415       if (!(_M_flags & regex_constants::ECMAScript))
00416     if (_M_try_char())
00417       {
00418         __matcher._M_add_char(_M_value[0]);
00419         __last_char.first = true;
00420         __last_char.second = _M_value[0];
00421       }
00422       while (_M_expression_term(__last_char, __matcher));
00423       __matcher._M_ready();
00424       _M_stack.push(_StateSeqT(
00425               _M_nfa,
00426               _M_nfa._M_insert_matcher(std::move(__matcher))));
00427     }
00428 
00429   template<typename _TraitsT>
00430   template<bool __icase, bool __collate>
00431     bool
00432     _Compiler<_TraitsT>::
00433     _M_expression_term(pair<bool, _CharT>& __last_char,
00434                _BracketMatcher<_TraitsT, __icase, __collate>& __matcher)
00435 
00436     {
00437       if (_M_match_token(_ScannerT::_S_token_bracket_end))
00438     return false;
00439 
00440       if (_M_match_token(_ScannerT::_S_token_collsymbol))
00441     {
00442       auto __symbol = __matcher._M_add_collate_element(_M_value);
00443       if (__symbol.size() == 1)
00444         {
00445           __last_char.first = true;
00446           __last_char.second = __symbol[0];
00447         }
00448     }
00449       else if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
00450     __matcher._M_add_equivalence_class(_M_value);
00451       else if (_M_match_token(_ScannerT::_S_token_char_class_name))
00452     __matcher._M_add_character_class(_M_value, false);
00453       // POSIX doesn't allow '-' as a start-range char (say [a-z--0]),
00454       // except when the '-' is the first or last character in the bracket
00455       // expression ([--0]). ECMAScript treats all '-' after a range as a
00456       // normal character. Also see above, where _M_expression_term gets called.
00457       //
00458       // As a result, POSIX rejects [-----], but ECMAScript doesn't.
00459       // Boost (1.57.0) always uses POSIX style even in its ECMAScript syntax.
00460       // Clang (3.5) always uses ECMAScript style even in its POSIX syntax.
00461       //
00462       // It turns out that no one reads BNFs ;)
00463       else if (_M_try_char())
00464     {
00465       if (!__last_char.first)
00466         {
00467           __matcher._M_add_char(_M_value[0]);
00468           if (_M_value[0] == '-'
00469           && !(_M_flags & regex_constants::ECMAScript))
00470         {
00471           if (_M_match_token(_ScannerT::_S_token_bracket_end))
00472             return false;
00473           __throw_regex_error(regex_constants::error_range);
00474         }
00475           __last_char.first = true;
00476           __last_char.second = _M_value[0];
00477         }
00478       else
00479         {
00480           if (_M_value[0] == '-')
00481         {
00482           if (_M_try_char())
00483             {
00484               __matcher._M_make_range(__last_char.second , _M_value[0]);
00485               __last_char.first = false;
00486             }
00487           else
00488             {
00489               if (_M_scanner._M_get_token()
00490               != _ScannerT::_S_token_bracket_end)
00491             __throw_regex_error(regex_constants::error_range);
00492               __matcher._M_add_char(_M_value[0]);
00493             }
00494         }
00495           else
00496         {
00497           __matcher._M_add_char(_M_value[0]);
00498           __last_char.second = _M_value[0];
00499         }
00500         }
00501     }
00502       else if (_M_match_token(_ScannerT::_S_token_quoted_class))
00503     __matcher._M_add_character_class(_M_value,
00504                      _M_ctype.is(_CtypeT::upper,
00505                              _M_value[0]));
00506       else
00507     __throw_regex_error(regex_constants::error_brack);
00508 
00509       return true;
00510     }
00511 
00512   template<typename _TraitsT>
00513     bool
00514     _Compiler<_TraitsT>::
00515     _M_try_char()
00516     {
00517       bool __is_char = false;
00518       if (_M_match_token(_ScannerT::_S_token_oct_num))
00519     {
00520       __is_char = true;
00521       _M_value.assign(1, _M_cur_int_value(8));
00522     }
00523       else if (_M_match_token(_ScannerT::_S_token_hex_num))
00524     {
00525       __is_char = true;
00526       _M_value.assign(1, _M_cur_int_value(16));
00527     }
00528       else if (_M_match_token(_ScannerT::_S_token_ord_char))
00529     __is_char = true;
00530       return __is_char;
00531     }
00532 
00533   template<typename _TraitsT>
00534     bool
00535     _Compiler<_TraitsT>::
00536     _M_match_token(_TokenT token)
00537     {
00538       if (token == _M_scanner._M_get_token())
00539     {
00540       _M_value = _M_scanner._M_get_value();
00541       _M_scanner._M_advance();
00542       return true;
00543     }
00544       return false;
00545     }
00546 
00547   template<typename _TraitsT>
00548     int
00549     _Compiler<_TraitsT>::
00550     _M_cur_int_value(int __radix)
00551     {
00552       long __v = 0;
00553       for (typename _StringT::size_type __i = 0;
00554        __i < _M_value.length(); ++__i)
00555     __v =__v * __radix + _M_traits.value(_M_value[__i], __radix);
00556       return __v;
00557     }
00558 
00559   template<typename _TraitsT, bool __icase, bool __collate>
00560     bool
00561     _BracketMatcher<_TraitsT, __icase, __collate>::
00562     _M_apply(_CharT __ch, false_type) const
00563     {
00564       bool __ret = false;
00565       if (std::find(_M_char_set.begin(), _M_char_set.end(),
00566             _M_translator._M_translate(__ch))
00567       != _M_char_set.end())
00568     __ret = true;
00569       else
00570     {
00571       auto __s = _M_translator._M_transform(__ch);
00572       for (auto& __it : _M_range_set)
00573         if (__it.first <= __s && __s <= __it.second)
00574           {
00575         __ret = true;
00576         break;
00577           }
00578       if (_M_traits.isctype(__ch, _M_class_set))
00579         __ret = true;
00580       else if (std::find(_M_equiv_set.begin(), _M_equiv_set.end(),
00581                  _M_traits.transform_primary(&__ch, &__ch+1))
00582            != _M_equiv_set.end())
00583         __ret = true;
00584       else
00585         {
00586           for (auto& __it : _M_neg_class_set)
00587         if (!_M_traits.isctype(__ch, __it))
00588           {
00589             __ret = true;
00590             break;
00591           }
00592         }
00593     }
00594       if (_M_is_non_matching)
00595     return !__ret;
00596       else
00597     return __ret;
00598     }
00599 
00600 _GLIBCXX_END_NAMESPACE_VERSION
00601 } // namespace __detail
00602 } // namespace