libstdc++
regex_automaton.h
Go to the documentation of this file.
00001 // class template regex -*- C++ -*-
00002 
00003 // Copyright (C) 2013-2015 Free Software Foundation, Inc.
00004 //
00005 // This file is part of the GNU ISO C++ Library.  This library is free
00006 // software; you can redistribute it and/or modify it under the
00007 // terms of the GNU General Public License as published by the
00008 // Free Software Foundation; either version 3, or (at your option)
00009 // any later version.
00010 
00011 // This library is distributed in the hope that it will be useful,
00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 // GNU General Public License for more details.
00015 
00016 // Under Section 7 of GPL version 3, you are granted additional
00017 // permissions described in the GCC Runtime Library Exception, version
00018 // 3.1, as published by the Free Software Foundation.
00019 
00020 // You should have received a copy of the GNU General Public License and
00021 // a copy of the GCC Runtime Library Exception along with this program;
00022 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
00023 // <http://www.gnu.org/licenses/>.
00024 
00025 /**
00026  *  @file bits/regex_automaton.h
00027  *  This is an internal header file, included by other library headers.
00028  *  Do not attempt to use it directly. @headername{regex}
00029  */
00030 
00031 // This macro defines the maximal state number a NFA can have.
00032 #ifndef _GLIBCXX_REGEX_STATE_LIMIT
00033 #define _GLIBCXX_REGEX_STATE_LIMIT 100000
00034 #endif
00035 
00036 namespace std _GLIBCXX_VISIBILITY(default)
00037 {
00038 namespace __detail
00039 {
00040 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00041 
00042   /**
00043    *  @defgroup regex-detail Base and Implementation Classes
00044    *  @ingroup regex
00045    *  @{
00046    */
00047 
00048   typedef long _StateIdT;
00049   static const _StateIdT _S_invalid_state_id  = -1;
00050 
00051   template<typename _CharT>
00052     using _Matcher = std::function<bool (_CharT)>;
00053 
00054   /// Operation codes that define the type of transitions within the base NFA
00055   /// that represents the regular expression.
00056   enum _Opcode : int
00057   {
00058       _S_opcode_unknown,
00059       _S_opcode_alternative,
00060       _S_opcode_repeat,
00061       _S_opcode_backref,
00062       _S_opcode_line_begin_assertion,
00063       _S_opcode_line_end_assertion,
00064       _S_opcode_word_boundary,
00065       _S_opcode_subexpr_lookahead,
00066       _S_opcode_subexpr_begin,
00067       _S_opcode_subexpr_end,
00068       _S_opcode_dummy,
00069       _S_opcode_match,
00070       _S_opcode_accept,
00071   };
00072 
00073   struct _State_base
00074   {
00075     _Opcode      _M_opcode;           // type of outgoing transition
00076     _StateIdT    _M_next;             // outgoing transition
00077     union // Since they are mutually exclusive.
00078     {
00079       size_t _M_subexpr;        // for _S_opcode_subexpr_*
00080       size_t _M_backref_index;  // for _S_opcode_backref
00081       struct
00082       {
00083         // for _S_opcode_alternative, _S_opcode_repeat and
00084         // _S_opcode_subexpr_lookahead
00085         _StateIdT  _M_alt;
00086         // for _S_opcode_word_boundary or _S_opcode_subexpr_lookahead or
00087         // quantifiers (ungreedy if set true)
00088         bool       _M_neg;
00089       };
00090     };
00091 
00092     explicit _State_base(_Opcode __opcode)
00093     : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
00094     { }
00095 
00096   protected:
00097     ~_State_base() = default;
00098 
00099   public:
00100 #ifdef _GLIBCXX_DEBUG
00101     std::ostream&
00102     _M_print(std::ostream& ostr) const;
00103 
00104     // Prints graphviz dot commands for state.
00105     std::ostream&
00106     _M_dot(std::ostream& __ostr, _StateIdT __id) const;
00107 #endif
00108   };
00109 
00110   template<typename _TraitsT>
00111     struct _State : _State_base
00112     {
00113       typedef _Matcher<typename _TraitsT::char_type> _MatcherT;
00114 
00115       _MatcherT      _M_matches;        // for _S_opcode_match
00116 
00117       explicit _State(_Opcode __opcode) : _State_base(__opcode) { }
00118     };
00119 
00120   struct _NFA_base
00121   {
00122     typedef size_t                              _SizeT;
00123     typedef regex_constants::syntax_option_type _FlagT;
00124 
00125     explicit
00126     _NFA_base(_FlagT __f)
00127     : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0),
00128     _M_has_backref(false)
00129     { }
00130 
00131     _NFA_base(_NFA_base&&) = default;
00132 
00133   protected:
00134     ~_NFA_base() = default;
00135 
00136   public:
00137     _FlagT
00138     _M_options() const
00139     { return _M_flags; }
00140 
00141     _StateIdT
00142     _M_start() const
00143     { return _M_start_state; }
00144 
00145     _SizeT
00146     _M_sub_count() const
00147     { return _M_subexpr_count; }
00148 
00149     std::vector<size_t>       _M_paren_stack;
00150     _FlagT                    _M_flags;
00151     _StateIdT                 _M_start_state;
00152     _SizeT                    _M_subexpr_count;
00153     bool                      _M_has_backref;
00154   };
00155 
00156   template<typename _TraitsT>
00157     struct _NFA
00158     : _NFA_base, std::vector<_State<_TraitsT>>
00159     {
00160       typedef _State<_TraitsT>                          _StateT;
00161       typedef _Matcher<typename _TraitsT::char_type>    _MatcherT;
00162 
00163       _NFA(const typename _TraitsT::locale_type& __loc, _FlagT __flags)
00164       : _NFA_base(__flags)
00165       { _M_traits.imbue(__loc); }
00166 
00167       // for performance reasons _NFA objects should only be moved not copied
00168       _NFA(const _NFA&) = delete;
00169       _NFA(_NFA&&) = default;
00170 
00171       _StateIdT
00172       _M_insert_accept()
00173       {
00174         auto __ret = _M_insert_state(_StateT(_S_opcode_accept));
00175         return __ret;
00176       }
00177 
00178       _StateIdT
00179       _M_insert_alt(_StateIdT __next, _StateIdT __alt, bool __neg)
00180       {
00181         _StateT __tmp(_S_opcode_alternative);
00182         // It labels every quantifier to make greedy comparison easier in BFS
00183         // approach.
00184         __tmp._M_next = __next;
00185         __tmp._M_alt = __alt;
00186         return _M_insert_state(std::move(__tmp));
00187       }
00188 
00189       _StateIdT
00190       _M_insert_repeat(_StateIdT __next, _StateIdT __alt, bool __neg)
00191       {
00192         _StateT __tmp(_S_opcode_repeat);
00193         // It labels every quantifier to make greedy comparison easier in BFS
00194         // approach.
00195         __tmp._M_next = __next;
00196         __tmp._M_alt = __alt;
00197         __tmp._M_neg = __neg;
00198         return _M_insert_state(std::move(__tmp));
00199       }
00200 
00201       _StateIdT
00202       _M_insert_matcher(_MatcherT __m)
00203       {
00204         _StateT __tmp(_S_opcode_match);
00205         __tmp._M_matches = std::move(__m);
00206         return _M_insert_state(std::move(__tmp));
00207       }
00208 
00209       _StateIdT
00210       _M_insert_subexpr_begin()
00211       {
00212         auto __id = this->_M_subexpr_count++;
00213         this->_M_paren_stack.push_back(__id);
00214         _StateT __tmp(_S_opcode_subexpr_begin);
00215         __tmp._M_subexpr = __id;
00216         return _M_insert_state(std::move(__tmp));
00217       }
00218 
00219       _StateIdT
00220       _M_insert_subexpr_end()
00221       {
00222         _StateT __tmp(_S_opcode_subexpr_end);
00223         __tmp._M_subexpr = this->_M_paren_stack.back();
00224         this->_M_paren_stack.pop_back();
00225         return _M_insert_state(std::move(__tmp));
00226       }
00227 
00228       _StateIdT
00229       _M_insert_backref(size_t __index);
00230 
00231       _StateIdT
00232       _M_insert_line_begin()
00233       { return _M_insert_state(_StateT(_S_opcode_line_begin_assertion)); }
00234 
00235       _StateIdT
00236       _M_insert_line_end()
00237       { return _M_insert_state(_StateT(_S_opcode_line_end_assertion)); }
00238 
00239       _StateIdT
00240       _M_insert_word_bound(bool __neg)
00241       {
00242         _StateT __tmp(_S_opcode_word_boundary);
00243         __tmp._M_neg = __neg;
00244         return _M_insert_state(std::move(__tmp));
00245       }
00246 
00247       _StateIdT
00248       _M_insert_lookahead(_StateIdT __alt, bool __neg)
00249       {
00250         _StateT __tmp(_S_opcode_subexpr_lookahead);
00251         __tmp._M_alt = __alt;
00252         __tmp._M_neg = __neg;
00253         return _M_insert_state(std::move(__tmp));
00254       }
00255 
00256       _StateIdT
00257       _M_insert_dummy()
00258       { return _M_insert_state(_StateT(_S_opcode_dummy)); }
00259 
00260       _StateIdT
00261       _M_insert_state(_StateT __s)
00262       {
00263         this->push_back(std::move(__s));
00264         if (this->size() > _GLIBCXX_REGEX_STATE_LIMIT)
00265           __throw_regex_error(regex_constants::error_space);
00266         return this->size()-1;
00267       }
00268 
00269       // Eliminate dummy node in this NFA to make it compact.
00270       void
00271       _M_eliminate_dummy();
00272 
00273 #ifdef _GLIBCXX_DEBUG
00274       std::ostream&
00275       _M_dot(std::ostream& __ostr) const;
00276 #endif
00277     public:
00278       _TraitsT                  _M_traits;
00279     };
00280 
00281   /// Describes a sequence of one or more %_State, its current start
00282   /// and end(s).  This structure contains fragments of an NFA during
00283   /// construction.
00284   template<typename _TraitsT>
00285     class _StateSeq
00286     {
00287     public:
00288       typedef _NFA<_TraitsT> _RegexT;
00289 
00290     public:
00291       _StateSeq(_RegexT& __nfa, _StateIdT __s)
00292       : _M_nfa(__nfa), _M_start(__s), _M_end(__s)
00293       { }
00294 
00295       _StateSeq(_RegexT& __nfa, _StateIdT __s, _StateIdT __end)
00296       : _M_nfa(__nfa), _M_start(__s), _M_end(__end)
00297       { }
00298 
00299       // Append a state on *this and change *this to the new sequence.
00300       void
00301       _M_append(_StateIdT __id)
00302       {
00303         _M_nfa[_M_end]._M_next = __id;
00304         _M_end = __id;
00305       }
00306 
00307       // Append a sequence on *this and change *this to the new sequence.
00308       void
00309       _M_append(const _StateSeq& __s)
00310       {
00311         _M_nfa[_M_end]._M_next = __s._M_start;
00312         _M_end = __s._M_end;
00313       }
00314 
00315       // Clones an entire sequence.
00316       _StateSeq
00317       _M_clone();
00318 
00319     public:
00320       _RegexT&  _M_nfa;
00321       _StateIdT _M_start;
00322       _StateIdT _M_end;
00323     };
00324 
00325  //@} regex-detail
00326 _GLIBCXX_END_NAMESPACE_VERSION
00327 } // namespace __detail
00328 } // namespace std
00329 
00330 #include <bits/regex_automaton.tcc>