libstdc++
regex_compiler.tcc
Go to the documentation of this file.
00001 // class template regex -*- C++ -*-
00002 
00003 // Copyright (C) 2013-2015 Free Software Foundation, Inc.
00004 //
00005 // This file is part of the GNU ISO C++ Library.  This library is free
00006 // software; you can redistribute it and/or modify it under the
00007 // terms of the GNU General Public License as published by the
00008 // Free Software Foundation; either version 3, or (at your option)
00009 // any later version.
00010 
00011 // This library is distributed in the hope that it will be useful,
00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 // GNU General Public License for more details.
00015 
00016 // Under Section 7 of GPL version 3, you are granted additional
00017 // permissions described in the GCC Runtime Library Exception, version
00018 // 3.1, as published by the Free Software Foundation.
00019 
00020 // You should have received a copy of the GNU General Public License and
00021 // a copy of the GCC Runtime Library Exception along with this program;
00022 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
00023 // <http://www.gnu.org/licenses/>.
00024 
00025 /**
00026  *  @file bits/regex_compiler.tcc
00027  *  This is an internal header file, included by other library headers.
00028  *  Do not attempt to use it directly. @headername{regex}
00029  */
00030 
00031 // FIXME make comments doxygen format.
00032 
00033 // This compiler refers to "Regular Expression Matching Can Be Simple And Fast"
00034 // (http://swtch.com/~rsc/regexp/regexp1.html"),
00035 // but doesn't strictly follow it.
00036 //
00037 // When compiling, states are *chained* instead of tree- or graph-constructed.
00038 // It's more like structured programs: there's if statement and loop statement.
00039 //
00040 // For alternative structure (say "a|b"), aka "if statement", two branches
00041 // should be constructed. However, these two shall merge to an "end_tag" at
00042 // the end of this operator:
00043 //
00044 //                branch1
00045 //              /        \
00046 // => begin_tag            end_tag =>
00047 //              \        /
00048 //                branch2
00049 //
00050 // This is the difference between this implementation and that in Russ's
00051 // article.
00052 //
00053 // That's why we introduced dummy node here ------ "end_tag" is a dummy node.
00054 // All dummy node will be eliminated at the end of compiling process.
00055 
00056 namespace std _GLIBCXX_VISIBILITY(default)
00057 {
00058 namespace __detail
00059 {
00060 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00061 
00062   template<typename _TraitsT>
00063     _Compiler<_TraitsT>::
00064     _Compiler(_IterT __b, _IterT __e,
00065               const typename _TraitsT::locale_type& __loc, _FlagT __flags)
00066     : _M_flags((__flags
00067                 & (regex_constants::ECMAScript
00068                    | regex_constants::basic
00069                    | regex_constants::extended
00070                    | regex_constants::grep
00071                    | regex_constants::egrep
00072                    | regex_constants::awk))
00073                ? __flags
00074                : __flags | regex_constants::ECMAScript),
00075       _M_scanner(__b, __e, _M_flags, __loc),
00076       _M_nfa(make_shared<_RegexT>(__loc, _M_flags)),
00077       _M_traits(_M_nfa->_M_traits),
00078       _M_ctype(std::use_facet<_CtypeT>(__loc))
00079     {
00080       _StateSeqT __r(*_M_nfa, _M_nfa->_M_start());
00081       __r._M_append(_M_nfa->_M_insert_subexpr_begin());
00082       this->_M_disjunction();
00083       if (!_M_match_token(_ScannerT::_S_token_eof))
00084         __throw_regex_error(regex_constants::error_paren);
00085       __r._M_append(_M_pop());
00086       _GLIBCXX_DEBUG_ASSERT(_M_stack.empty());
00087       __r._M_append(_M_nfa->_M_insert_subexpr_end());
00088       __r._M_append(_M_nfa->_M_insert_accept());
00089       _M_nfa->_M_eliminate_dummy();
00090     }
00091 
00092   template<typename _TraitsT>
00093     void
00094     _Compiler<_TraitsT>::
00095     _M_disjunction()
00096     {
00097       this->_M_alternative();
00098       while (_M_match_token(_ScannerT::_S_token_or))
00099         {
00100           _StateSeqT __alt1 = _M_pop();
00101           this->_M_alternative();
00102           _StateSeqT __alt2 = _M_pop();
00103           auto __end = _M_nfa->_M_insert_dummy();
00104           __alt1._M_append(__end);
00105           __alt2._M_append(__end);
00106           // __alt2 is state._M_next, __alt1 is state._M_alt. The executor
00107           // executes _M_alt before _M_next, as well as executing left
00108           // alternative before right one.
00109           _M_stack.push(_StateSeqT(*_M_nfa,
00110                                    _M_nfa->_M_insert_alt(
00111                                      __alt2._M_start, __alt1._M_start, false),
00112                                    __end));
00113         }
00114     }
00115 
00116   template<typename _TraitsT>
00117     void
00118     _Compiler<_TraitsT>::
00119     _M_alternative()
00120     {
00121       if (this->_M_term())
00122         {
00123           _StateSeqT __re = _M_pop();
00124           this->_M_alternative();
00125           __re._M_append(_M_pop());
00126           _M_stack.push(__re);
00127         }
00128       else
00129         _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_dummy()));
00130     }
00131 
00132   template<typename _TraitsT>
00133     bool
00134     _Compiler<_TraitsT>::
00135     _M_term()
00136     {
00137       if (this->_M_assertion())
00138         return true;
00139       if (this->_M_atom())
00140         {
00141           while (this->_M_quantifier());
00142           return true;
00143         }
00144       return false;
00145     }
00146 
00147   template<typename _TraitsT>
00148     bool
00149     _Compiler<_TraitsT>::
00150     _M_assertion()
00151     {
00152       if (_M_match_token(_ScannerT::_S_token_line_begin))
00153         _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_begin()));
00154       else if (_M_match_token(_ScannerT::_S_token_line_end))
00155         _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_end()));
00156       else if (_M_match_token(_ScannerT::_S_token_word_bound))
00157         // _M_value[0] == 'n' means it's negative, say "not word boundary".
00158         _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->
00159               _M_insert_word_bound(_M_value[0] == 'n')));
00160       else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin))
00161         {
00162           auto __neg = _M_value[0] == 'n';
00163           this->_M_disjunction();
00164           if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
00165             __throw_regex_error(regex_constants::error_paren);
00166           auto __tmp = _M_pop();
00167           __tmp._M_append(_M_nfa->_M_insert_accept());
00168           _M_stack.push(
00169               _StateSeqT(
00170                 *_M_nfa,
00171                 _M_nfa->_M_insert_lookahead(__tmp._M_start, __neg)));
00172         }
00173       else
00174         return false;
00175       return true;
00176     }
00177 
00178   template<typename _TraitsT>
00179     bool
00180     _Compiler<_TraitsT>::
00181     _M_quantifier()
00182     {
00183       bool __neg = (_M_flags & regex_constants::ECMAScript);
00184       auto __init = [this, &__neg]()
00185         {
00186           if (_M_stack.empty())
00187             __throw_regex_error(regex_constants::error_badrepeat);
00188           __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
00189         };
00190       if (_M_match_token(_ScannerT::_S_token_closure0))
00191         {
00192           __init();
00193           auto __e = _M_pop();
00194           _StateSeqT __r(*_M_nfa,
00195                          _M_nfa->_M_insert_repeat(_S_invalid_state_id,
00196                                                   __e._M_start, __neg));
00197           __e._M_append(__r);
00198           _M_stack.push(__r);
00199         }
00200       else if (_M_match_token(_ScannerT::_S_token_closure1))
00201         {
00202           __init();
00203           auto __e = _M_pop();
00204           __e._M_append(_M_nfa->_M_insert_repeat(_S_invalid_state_id,
00205                                                  __e._M_start, __neg));
00206           _M_stack.push(__e);
00207         }
00208       else if (_M_match_token(_ScannerT::_S_token_opt))
00209         {
00210           __init();
00211           auto __e = _M_pop();
00212           auto __end = _M_nfa->_M_insert_dummy();
00213           _StateSeqT __r(*_M_nfa,
00214                          _M_nfa->_M_insert_repeat(_S_invalid_state_id,
00215                                                   __e._M_start, __neg));
00216           __e._M_append(__end);
00217           __r._M_append(__end);
00218           _M_stack.push(__r);
00219         }
00220       else if (_M_match_token(_ScannerT::_S_token_interval_begin))
00221         {
00222           if (_M_stack.empty())
00223             __throw_regex_error(regex_constants::error_badrepeat);
00224           if (!_M_match_token(_ScannerT::_S_token_dup_count))
00225             __throw_regex_error(regex_constants::error_badbrace);
00226           _StateSeqT __r(_M_pop());
00227           _StateSeqT __e(*_M_nfa, _M_nfa->_M_insert_dummy());
00228           long __min_rep = _M_cur_int_value(10);
00229           bool __infi = false;
00230           long __n;
00231 
00232           // {3
00233           if (_M_match_token(_ScannerT::_S_token_comma))
00234             if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7}
00235               __n = _M_cur_int_value(10) - __min_rep;
00236             else
00237               __infi = true;
00238           else
00239             __n = 0;
00240           if (!_M_match_token(_ScannerT::_S_token_interval_end))
00241             __throw_regex_error(regex_constants::error_brace);
00242 
00243           __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
00244 
00245           for (long __i = 0; __i < __min_rep; ++__i)
00246             __e._M_append(__r._M_clone());
00247 
00248           if (__infi)
00249             {
00250               auto __tmp = __r._M_clone();
00251               _StateSeqT __s(*_M_nfa,
00252                              _M_nfa->_M_insert_repeat(_S_invalid_state_id,
00253                                                       __tmp._M_start, __neg));
00254               __tmp._M_append(__s);
00255               __e._M_append(__s);
00256             }
00257           else
00258             {
00259               if (__n < 0)
00260                 __throw_regex_error(regex_constants::error_badbrace);
00261               auto __end = _M_nfa->_M_insert_dummy();
00262               // _M_alt is the "match more" branch, and _M_next is the
00263               // "match less" one. Switch _M_alt and _M_next of all created
00264               // nodes. This is a hack but IMO works well.
00265               std::stack<_StateIdT> __stack;
00266               for (long __i = 0; __i < __n; ++__i)
00267                 {
00268                   auto __tmp = __r._M_clone();
00269                   auto __alt = _M_nfa->_M_insert_repeat(__tmp._M_start,
00270                                                         __end, __neg);
00271                   __stack.push(__alt);
00272                   __e._M_append(_StateSeqT(*_M_nfa, __alt, __tmp._M_end));
00273                 }
00274               __e._M_append(__end);
00275               while (!__stack.empty())
00276                 {
00277                   auto& __tmp = (*_M_nfa)[__stack.top()];
00278                   __stack.pop();
00279                   std::swap(__tmp._M_next, __tmp._M_alt);
00280                 }
00281             }
00282           _M_stack.push(__e);
00283         }
00284       else
00285         return false;
00286       return true;
00287     }
00288 
00289 #define __INSERT_REGEX_MATCHER(__func, args...)\
00290         do\
00291           if (!(_M_flags & regex_constants::icase))\
00292             if (!(_M_flags & regex_constants::collate))\
00293               __func<false, false>(args);\
00294             else\
00295               __func<false, true>(args);\
00296           else\
00297             if (!(_M_flags & regex_constants::collate))\
00298               __func<true, false>(args);\
00299             else\
00300               __func<true, true>(args);\
00301         while (false)
00302 
00303   template<typename _TraitsT>
00304     bool
00305     _Compiler<_TraitsT>::
00306     _M_atom()
00307     {
00308       if (_M_match_token(_ScannerT::_S_token_anychar))
00309         {
00310           if (!(_M_flags & regex_constants::ECMAScript))
00311             __INSERT_REGEX_MATCHER(_M_insert_any_matcher_posix);
00312           else
00313             __INSERT_REGEX_MATCHER(_M_insert_any_matcher_ecma);
00314         }
00315       else if (_M_try_char())
00316         __INSERT_REGEX_MATCHER(_M_insert_char_matcher);
00317       else if (_M_match_token(_ScannerT::_S_token_backref))
00318         _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->
00319                                  _M_insert_backref(_M_cur_int_value(10))));
00320       else if (_M_match_token(_ScannerT::_S_token_quoted_class))
00321         __INSERT_REGEX_MATCHER(_M_insert_character_class_matcher);
00322       else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin))
00323         {
00324           _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_dummy());
00325           this->_M_disjunction();
00326           if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
00327             __throw_regex_error(regex_constants::error_paren);
00328           __r._M_append(_M_pop());
00329           _M_stack.push(__r);
00330         }
00331       else if (_M_match_token(_ScannerT::_S_token_subexpr_begin))
00332         {
00333           _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_subexpr_begin());
00334           this->_M_disjunction();
00335           if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
00336             __throw_regex_error(regex_constants::error_paren);
00337           __r._M_append(_M_pop());
00338           __r._M_append(_M_nfa->_M_insert_subexpr_end());
00339           _M_stack.push(__r);
00340         }
00341       else if (!_M_bracket_expression())
00342         return false;
00343       return true;
00344     }
00345 
00346   template<typename _TraitsT>
00347     bool
00348     _Compiler<_TraitsT>::
00349     _M_bracket_expression()
00350     {
00351       bool __neg =
00352         _M_match_token(_ScannerT::_S_token_bracket_neg_begin);
00353       if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin)))
00354         return false;
00355       __INSERT_REGEX_MATCHER(_M_insert_bracket_matcher, __neg);
00356       return true;
00357     }
00358 #undef __INSERT_REGEX_MATCHER
00359 
00360   template<typename _TraitsT>
00361   template<bool __icase, bool __collate>
00362     void
00363     _Compiler<_TraitsT>::
00364     _M_insert_any_matcher_ecma()
00365     {
00366       _M_stack.push(_StateSeqT(*_M_nfa,
00367         _M_nfa->_M_insert_matcher
00368           (_AnyMatcher<_TraitsT, true, __icase, __collate>
00369             (_M_traits))));
00370     }
00371 
00372   template<typename _TraitsT>
00373   template<bool __icase, bool __collate>
00374     void
00375     _Compiler<_TraitsT>::
00376     _M_insert_any_matcher_posix()
00377     {
00378       _M_stack.push(_StateSeqT(*_M_nfa,
00379         _M_nfa->_M_insert_matcher
00380           (_AnyMatcher<_TraitsT, false, __icase, __collate>
00381             (_M_traits))));
00382     }
00383 
00384   template<typename _TraitsT>
00385   template<bool __icase, bool __collate>
00386     void
00387     _Compiler<_TraitsT>::
00388     _M_insert_char_matcher()
00389     {
00390       _M_stack.push(_StateSeqT(*_M_nfa,
00391         _M_nfa->_M_insert_matcher
00392           (_CharMatcher<_TraitsT, __icase, __collate>
00393             (_M_value[0], _M_traits))));
00394     }
00395 
00396   template<typename _TraitsT>
00397   template<bool __icase, bool __collate>
00398     void
00399     _Compiler<_TraitsT>::
00400     _M_insert_character_class_matcher()
00401     {
00402       _GLIBCXX_DEBUG_ASSERT(_M_value.size() == 1);
00403       _BracketMatcher<_TraitsT, __icase, __collate> __matcher
00404         (_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits);
00405       __matcher._M_add_character_class(_M_value, false);
00406       __matcher._M_ready();
00407       _M_stack.push(_StateSeqT(*_M_nfa,
00408         _M_nfa->_M_insert_matcher(std::move(__matcher))));
00409     }
00410 
00411   template<typename _TraitsT>
00412   template<bool __icase, bool __collate>
00413     void
00414     _Compiler<_TraitsT>::
00415     _M_insert_bracket_matcher(bool __neg)
00416     {
00417       _BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits);
00418       pair<bool, _CharT> __last_char; // Optional<_CharT>
00419       __last_char.first = false;
00420       if (!(_M_flags & regex_constants::ECMAScript))
00421         if (_M_try_char())
00422           {
00423             __matcher._M_add_char(_M_value[0]);
00424             __last_char.first = true;
00425             __last_char.second = _M_value[0];
00426           }
00427       while (_M_expression_term(__last_char, __matcher));
00428       __matcher._M_ready();
00429       _M_stack.push(_StateSeqT(
00430                       *_M_nfa,
00431                       _M_nfa->_M_insert_matcher(std::move(__matcher))));
00432     }
00433 
00434   template<typename _TraitsT>
00435   template<bool __icase, bool __collate>
00436     bool
00437     _Compiler<_TraitsT>::
00438     _M_expression_term(pair<bool, _CharT>& __last_char,
00439                        _BracketMatcher<_TraitsT, __icase, __collate>& __matcher)
00440     {
00441       if (_M_match_token(_ScannerT::_S_token_bracket_end))
00442         return false;
00443 
00444       if (_M_match_token(_ScannerT::_S_token_collsymbol))
00445         {
00446           auto __symbol = __matcher._M_add_collate_element(_M_value);
00447           if (__symbol.size() == 1)
00448             {
00449               __last_char.first = true;
00450               __last_char.second = __symbol[0];
00451             }
00452         }
00453       else if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
00454         __matcher._M_add_equivalence_class(_M_value);
00455       else if (_M_match_token(_ScannerT::_S_token_char_class_name))
00456         __matcher._M_add_character_class(_M_value, false);
00457       // POSIX doesn't allow '-' as a start-range char (say [a-z--0]),
00458       // except when the '-' is the first or last character in the bracket
00459       // expression ([--0]). ECMAScript treats all '-' after a range as a
00460       // normal character. Also see above, where _M_expression_term gets called.
00461       //
00462       // As a result, POSIX rejects [-----], but ECMAScript doesn't.
00463       // Boost (1.57.0) always uses POSIX style even in its ECMAScript syntax.
00464       // Clang (3.5) always uses ECMAScript style even in its POSIX syntax.
00465       //
00466       // It turns out that no one reads BNFs ;)
00467       else if (_M_try_char())
00468         {
00469           if (!__last_char.first)
00470             {
00471               __matcher._M_add_char(_M_value[0]);
00472               if (_M_value[0] == '-'
00473                   && !(_M_flags & regex_constants::ECMAScript))
00474                 {
00475                   if (_M_match_token(_ScannerT::_S_token_bracket_end))
00476                     return false;
00477                   __throw_regex_error(regex_constants::error_range);
00478                 }
00479               __last_char.first = true;
00480               __last_char.second = _M_value[0];
00481             }
00482           else
00483             {
00484               if (_M_value[0] == '-')
00485                 {
00486                   if (_M_try_char())
00487                     {
00488                       __matcher._M_make_range(__last_char.second , _M_value[0]);
00489                       __last_char.first = false;
00490                     }
00491                   else
00492                     {
00493                       if (_M_scanner._M_get_token()
00494                           != _ScannerT::_S_token_bracket_end)
00495                         __throw_regex_error(regex_constants::error_range);
00496                       __matcher._M_add_char(_M_value[0]);
00497                     }
00498                 }
00499               else
00500                 {
00501                   __matcher._M_add_char(_M_value[0]);
00502                   __last_char.second = _M_value[0];
00503                 }
00504             }
00505         }
00506       else if (_M_match_token(_ScannerT::_S_token_quoted_class))
00507         __matcher._M_add_character_class(_M_value,
00508                                          _M_ctype.is(_CtypeT::upper,
00509                                                      _M_value[0]));
00510       else
00511         __throw_regex_error(regex_constants::error_brack);
00512 
00513       return true;
00514     }
00515 
00516   template<typename _TraitsT>
00517     bool
00518     _Compiler<_TraitsT>::
00519     _M_try_char()
00520     {
00521       bool __is_char = false;
00522       if (_M_match_token(_ScannerT::_S_token_oct_num))
00523         {
00524           __is_char = true;
00525           _M_value.assign(1, _M_cur_int_value(8));
00526         }
00527       else if (_M_match_token(_ScannerT::_S_token_hex_num))
00528         {
00529           __is_char = true;
00530           _M_value.assign(1, _M_cur_int_value(16));
00531         }
00532       else if (_M_match_token(_ScannerT::_S_token_ord_char))
00533         __is_char = true;
00534       return __is_char;
00535     }
00536 
00537   template<typename _TraitsT>
00538     bool
00539     _Compiler<_TraitsT>::
00540     _M_match_token(_TokenT token)
00541     {
00542       if (token == _M_scanner._M_get_token())
00543         {
00544           _M_value = _M_scanner._M_get_value();
00545           _M_scanner._M_advance();
00546           return true;
00547         }
00548       return false;
00549     }
00550 
00551   template<typename _TraitsT>
00552     int
00553     _Compiler<_TraitsT>::
00554     _M_cur_int_value(int __radix)
00555     {
00556       long __v = 0;
00557       for (typename _StringT::size_type __i = 0;
00558            __i < _M_value.length(); ++__i)
00559         __v =__v * __radix + _M_traits.value(_M_value[__i], __radix);
00560       return __v;
00561     }
00562 
00563   template<typename _TraitsT, bool __icase, bool __collate>
00564     bool
00565     _BracketMatcher<_TraitsT, __icase, __collate>::
00566     _M_apply(_CharT __ch, false_type) const
00567     {
00568       bool __ret = std::binary_search(_M_char_set.begin(), _M_char_set.end(),
00569                                       _M_translator._M_translate(__ch));
00570       if (!__ret)
00571         {
00572           auto __s = _M_translator._M_transform(__ch);
00573           for (auto& __it : _M_range_set)
00574             if (__it.first <= __s && __s <= __it.second)
00575               {
00576                 __ret = true;
00577                 break;
00578               }
00579           if (_M_traits.isctype(__ch, _M_class_set))
00580             __ret = true;
00581           else if (std::find(_M_equiv_set.begin(), _M_equiv_set.end(),
00582                              _M_traits.transform_primary(&__ch, &__ch+1))
00583                    != _M_equiv_set.end())
00584             __ret = true;
00585           else
00586             {
00587               for (auto& __it : _M_neg_class_set)
00588                 if (!_M_traits.isctype(__ch, __it))
00589                   {
00590                     __ret = true;
00591                     break;
00592                   }
00593             }
00594         }
00595       if (_M_is_non_matching)
00596         return !__ret;
00597       else
00598         return __ret;
00599     }
00600 
00601 _GLIBCXX_END_NAMESPACE_VERSION
00602 } // namespace __detail
00603 } // namespace