libstdc++
regex.tcc
Go to the documentation of this file.
00001 // class template regex -*- C++ -*-
00002 
00003 // Copyright (C) 2013-2015 Free Software Foundation, Inc.
00004 //
00005 // This file is part of the GNU ISO C++ Library.  This library is free
00006 // software; you can redistribute it and/or modify it under the
00007 // terms of the GNU General Public License as published by the
00008 // Free Software Foundation; either version 3, or (at your option)
00009 // any later version.
00010 
00011 // This library is distributed in the hope that it will be useful,
00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 // GNU General Public License for more details.
00015 
00016 // Under Section 7 of GPL version 3, you are granted additional
00017 // permissions described in the GCC Runtime Library Exception, version
00018 // 3.1, as published by the Free Software Foundation.
00019 
00020 // You should have received a copy of the GNU General Public License and
00021 // a copy of the GCC Runtime Library Exception along with this program;
00022 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
00023 // <http://www.gnu.org/licenses/>.
00024 
00025 /**
00026  *  @file bits/regex.tcc
00027  *  This is an internal header file, included by other library headers.
00028  *  Do not attempt to use it directly. @headername{regex}
00029  */
00030 
00031 // A non-standard switch to let the user pick the matching algorithm.
00032 // If _GLIBCXX_REGEX_USE_THOMPSON_NFA is defined, the thompson NFA
00033 // algorithm will be used. This algorithm is not enabled by default,
00034 // and cannot be used if the regex contains back-references, but has better
00035 // (polynomial instead of exponential) worst case performance.
00036 // See __regex_algo_impl below.
00037 
00038 namespace std _GLIBCXX_VISIBILITY(default)
00039 {
00040 namespace __detail
00041 {
00042 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00043 
00044   // Result of merging regex_match and regex_search.
00045   //
00046   // __policy now can be _S_auto (auto dispatch) and _S_alternate (use
00047   // the other one if possible, for test purpose).
00048   //
00049   // That __match_mode is true means regex_match, else regex_search.
00050   template<typename _BiIter, typename _Alloc,
00051            typename _CharT, typename _TraitsT,
00052            _RegexExecutorPolicy __policy,
00053            bool __match_mode>
00054     bool
00055     __regex_algo_impl(_BiIter                              __s,
00056                       _BiIter                              __e,
00057                       match_results<_BiIter, _Alloc>&      __m,
00058                       const basic_regex<_CharT, _TraitsT>& __re,
00059                       regex_constants::match_flag_type     __flags)
00060     {
00061       if (__re._M_automaton == nullptr)
00062         return false;
00063 
00064       typename match_results<_BiIter, _Alloc>::_Base_type& __res = __m;
00065       __m._M_begin = __s;
00066       __m._M_resize(__re._M_automaton->_M_sub_count());
00067       for (auto& __it : __res)
00068         __it.matched = false;
00069 
00070       // __policy is used by testsuites so that they can use Thompson NFA
00071       // without defining a macro. Users should define
00072       // _GLIBCXX_REGEX_USE_THOMPSON_NFA if they need to use this approach.
00073       bool __ret;
00074       if (!__re._M_automaton->_M_has_backref
00075           && !(__re._M_flags & regex_constants::ECMAScript)
00076 #ifndef _GLIBCXX_REGEX_USE_THOMPSON_NFA
00077           && __policy == _RegexExecutorPolicy::_S_alternate
00078 #endif
00079           )
00080         {
00081           _Executor<_BiIter, _Alloc, _TraitsT, false>
00082             __executor(__s, __e, __m, __re, __flags);
00083           if (__match_mode)
00084             __ret = __executor._M_match();
00085           else
00086             __ret = __executor._M_search();
00087         }
00088       else
00089         {
00090           _Executor<_BiIter, _Alloc, _TraitsT, true>
00091             __executor(__s, __e, __m, __re, __flags);
00092           if (__match_mode)
00093             __ret = __executor._M_match();
00094           else
00095             __ret = __executor._M_search();
00096         }
00097       if (__ret)
00098         {
00099           for (auto& __it : __res)
00100             if (!__it.matched)
00101               __it.first = __it.second = __e;
00102           auto& __pre = __m._M_prefix();
00103           auto& __suf = __m._M_suffix();
00104           if (__match_mode)
00105             {
00106               __pre.matched = false;
00107               __pre.first = __s;
00108               __pre.second = __s;
00109               __suf.matched = false;
00110               __suf.first = __e;
00111               __suf.second = __e;
00112             }
00113           else
00114             {
00115               __pre.first = __s;
00116               __pre.second = __res[0].first;
00117               __pre.matched = (__pre.first != __pre.second);
00118               __suf.first = __res[0].second;
00119               __suf.second = __e;
00120               __suf.matched = (__suf.first != __suf.second);
00121             }
00122         }
00123       else
00124         {
00125           __m._M_resize(0);
00126           for (auto& __it : __res)
00127             {
00128               __it.matched = false;
00129               __it.first = __it.second = __e;
00130             }
00131         }
00132       return __ret;
00133     }
00134 
00135 _GLIBCXX_END_NAMESPACE_VERSION
00136 }
00137 
00138 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00139 
00140   template<typename _Ch_type>
00141   template<typename _Fwd_iter>
00142     typename regex_traits<_Ch_type>::string_type
00143     regex_traits<_Ch_type>::
00144     lookup_collatename(_Fwd_iter __first, _Fwd_iter __last) const
00145     {
00146       typedef std::ctype<char_type> __ctype_type;
00147       const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
00148 
00149       static const char* __collatenames[] =
00150         {
00151           "NUL",
00152           "SOH",
00153           "STX",
00154           "ETX",
00155           "EOT",
00156           "ENQ",
00157           "ACK",
00158           "alert",
00159           "backspace",
00160           "tab",
00161           "newline",
00162           "vertical-tab",
00163           "form-feed",
00164           "carriage-return",
00165           "SO",
00166           "SI",
00167           "DLE",
00168           "DC1",
00169           "DC2",
00170           "DC3",
00171           "DC4",
00172           "NAK",
00173           "SYN",
00174           "ETB",
00175           "CAN",
00176           "EM",
00177           "SUB",
00178           "ESC",
00179           "IS4",
00180           "IS3",
00181           "IS2",
00182           "IS1",
00183           "space",
00184           "exclamation-mark",
00185           "quotation-mark",
00186           "number-sign",
00187           "dollar-sign",
00188           "percent-sign",
00189           "ampersand",
00190           "apostrophe",
00191           "left-parenthesis",
00192           "right-parenthesis",
00193           "asterisk",
00194           "plus-sign",
00195           "comma",
00196           "hyphen",
00197           "period",
00198           "slash",
00199           "zero",
00200           "one",
00201           "two",
00202           "three",
00203           "four",
00204           "five",
00205           "six",
00206           "seven",
00207           "eight",
00208           "nine",
00209           "colon",
00210           "semicolon",
00211           "less-than-sign",
00212           "equals-sign",
00213           "greater-than-sign",
00214           "question-mark",
00215           "commercial-at",
00216           "A",
00217           "B",
00218           "C",
00219           "D",
00220           "E",
00221           "F",
00222           "G",
00223           "H",
00224           "I",
00225           "J",
00226           "K",
00227           "L",
00228           "M",
00229           "N",
00230           "O",
00231           "P",
00232           "Q",
00233           "R",
00234           "S",
00235           "T",
00236           "U",
00237           "V",
00238           "W",
00239           "X",
00240           "Y",
00241           "Z",
00242           "left-square-bracket",
00243           "backslash",
00244           "right-square-bracket",
00245           "circumflex",
00246           "underscore",
00247           "grave-accent",
00248           "a",
00249           "b",
00250           "c",
00251           "d",
00252           "e",
00253           "f",
00254           "g",
00255           "h",
00256           "i",
00257           "j",
00258           "k",
00259           "l",
00260           "m",
00261           "n",
00262           "o",
00263           "p",
00264           "q",
00265           "r",
00266           "s",
00267           "t",
00268           "u",
00269           "v",
00270           "w",
00271           "x",
00272           "y",
00273           "z",
00274           "left-curly-bracket",
00275           "vertical-line",
00276           "right-curly-bracket",
00277           "tilde",
00278           "DEL",
00279         };
00280 
00281       string __s;
00282       for (; __first != __last; ++__first)
00283         __s += __fctyp.narrow(*__first, 0);
00284 
00285       for (const auto& __it : __collatenames)
00286         if (__s == __it)
00287           return string_type(1, __fctyp.widen(
00288             static_cast<char>(&__it - __collatenames)));
00289 
00290       // TODO Add digraph support:
00291       // http://boost.sourceforge.net/libs/regex/doc/collating_names.html
00292 
00293       return string_type();
00294     }
00295 
00296   template<typename _Ch_type>
00297   template<typename _Fwd_iter>
00298     typename regex_traits<_Ch_type>::char_class_type
00299     regex_traits<_Ch_type>::
00300     lookup_classname(_Fwd_iter __first, _Fwd_iter __last, bool __icase) const
00301     {
00302       typedef std::ctype<char_type> __ctype_type;
00303       const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
00304 
00305       // Mappings from class name to class mask.
00306       static const pair<const char*, char_class_type> __classnames[] =
00307       {
00308         {"d", ctype_base::digit},
00309         {"w", {ctype_base::alnum, _RegexMask::_S_under}},
00310         {"s", ctype_base::space},
00311         {"alnum", ctype_base::alnum},
00312         {"alpha", ctype_base::alpha},
00313         {"blank", ctype_base::blank},
00314         {"cntrl", ctype_base::cntrl},
00315         {"digit", ctype_base::digit},
00316         {"graph", ctype_base::graph},
00317         {"lower", ctype_base::lower},
00318         {"print", ctype_base::print},
00319         {"punct", ctype_base::punct},
00320         {"space", ctype_base::space},
00321         {"upper", ctype_base::upper},
00322         {"xdigit", ctype_base::xdigit},
00323       };
00324 
00325       string __s;
00326       for (; __first != __last; ++__first)
00327         __s += __fctyp.narrow(__fctyp.tolower(*__first), 0);
00328 
00329       for (const auto& __it : __classnames)
00330         if (__s == __it.first)
00331           {
00332             if (__icase
00333                 && ((__it.second
00334                      & (ctype_base::lower | ctype_base::upper)) != 0))
00335               return ctype_base::alpha;
00336             return __it.second;
00337           }
00338       return 0;
00339     }
00340 
00341   template<typename _Ch_type>
00342     bool
00343     regex_traits<_Ch_type>::
00344     isctype(_Ch_type __c, char_class_type __f) const
00345     {
00346       typedef std::ctype<char_type> __ctype_type;
00347       const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
00348 
00349       return __fctyp.is(__f._M_base, __c)
00350         // [[:w:]]
00351         || ((__f._M_extended & _RegexMask::_S_under)
00352             && __c == __fctyp.widen('_'));
00353     }
00354 
00355   template<typename _Ch_type>
00356     int
00357     regex_traits<_Ch_type>::
00358     value(_Ch_type __ch, int __radix) const
00359     {
00360       std::basic_istringstream<char_type> __is(string_type(1, __ch));
00361       long __v;
00362       if (__radix == 8)
00363         __is >> std::oct;
00364       else if (__radix == 16)
00365         __is >> std::hex;
00366       __is >> __v;
00367       return __is.fail() ? -1 : __v;
00368     }
00369 
00370   template<typename _Bi_iter, typename _Alloc>
00371   template<typename _Out_iter>
00372     _Out_iter match_results<_Bi_iter, _Alloc>::
00373     format(_Out_iter __out,
00374            const match_results<_Bi_iter, _Alloc>::char_type* __fmt_first,
00375            const match_results<_Bi_iter, _Alloc>::char_type* __fmt_last,
00376            match_flag_type __flags) const
00377     {
00378       _GLIBCXX_DEBUG_ASSERT( ready() );
00379       regex_traits<char_type> __traits;
00380       typedef std::ctype<char_type> __ctype_type;
00381       const __ctype_type&
00382         __fctyp(use_facet<__ctype_type>(__traits.getloc()));
00383 
00384       auto __output = [&](size_t __idx)
00385         {
00386           auto& __sub = (*this)[__idx];
00387           if (__sub.matched)
00388             __out = std::copy(__sub.first, __sub.second, __out);
00389         };
00390 
00391       if (__flags & regex_constants::format_sed)
00392         {
00393           for (; __fmt_first != __fmt_last;)
00394             if (*__fmt_first == '&')
00395               {
00396                 __output(0);
00397                 ++__fmt_first;
00398               }
00399             else if (*__fmt_first == '\\')
00400               {
00401                 if (++__fmt_first != __fmt_last
00402                     && __fctyp.is(__ctype_type::digit, *__fmt_first))
00403                   __output(__traits.value(*__fmt_first++, 10));
00404                 else
00405                   *__out++ = '\\';
00406               }
00407             else
00408               *__out++ = *__fmt_first++;
00409         }
00410       else
00411         {
00412           while (1)
00413             {
00414               auto __next = std::find(__fmt_first, __fmt_last, '$');
00415               if (__next == __fmt_last)
00416                 break;
00417 
00418               __out = std::copy(__fmt_first, __next, __out);
00419 
00420               auto __eat = [&](char __ch) -> bool
00421                 {
00422                   if (*__next == __ch)
00423                     {
00424                       ++__next;
00425                       return true;
00426                     }
00427                   return false;
00428                 };
00429 
00430               if (++__next == __fmt_last)
00431                 *__out++ = '$';
00432               else if (__eat('$'))
00433                 *__out++ = '$';
00434               else if (__eat('&'))
00435                 __output(0);
00436               else if (__eat('`'))
00437                 {
00438                   auto& __sub = _M_prefix();
00439                   if (__sub.matched)
00440                     __out = std::copy(__sub.first, __sub.second, __out);
00441                 }
00442               else if (__eat('\''))
00443                 {
00444                   auto& __sub = _M_suffix();
00445                   if (__sub.matched)
00446                     __out = std::copy(__sub.first, __sub.second, __out);
00447                 }
00448               else if (__fctyp.is(__ctype_type::digit, *__next))
00449                 {
00450                   long __num = __traits.value(*__next, 10);
00451                   if (++__next != __fmt_last
00452                       && __fctyp.is(__ctype_type::digit, *__next))
00453                     {
00454                       __num *= 10;
00455                       __num += __traits.value(*__next++, 10);
00456                     }
00457                   if (0 <= __num && __num < this->size())
00458                     __output(__num);
00459                 }
00460               else
00461                 *__out++ = '$';
00462               __fmt_first = __next;
00463             }
00464           __out = std::copy(__fmt_first, __fmt_last, __out);
00465         }
00466       return __out;
00467     }
00468 
00469   template<typename _Out_iter, typename _Bi_iter,
00470            typename _Rx_traits, typename _Ch_type>
00471     _Out_iter
00472     regex_replace(_Out_iter __out, _Bi_iter __first, _Bi_iter __last,
00473                   const basic_regex<_Ch_type, _Rx_traits>& __e,
00474                   const _Ch_type* __fmt,
00475                   regex_constants::match_flag_type __flags)
00476     {
00477       typedef regex_iterator<_Bi_iter, _Ch_type, _Rx_traits> _IterT;
00478       _IterT __i(__first, __last, __e, __flags);
00479       _IterT __end;
00480       if (__i == __end)
00481         {
00482           if (!(__flags & regex_constants::format_no_copy))
00483             __out = std::copy(__first, __last, __out);
00484         }
00485       else
00486         {
00487           sub_match<_Bi_iter> __last;
00488           auto __len = char_traits<_Ch_type>::length(__fmt);
00489           for (; __i != __end; ++__i)
00490             {
00491               if (!(__flags & regex_constants::format_no_copy))
00492                 __out = std::copy(__i->prefix().first, __i->prefix().second,
00493                                   __out);
00494               __out = __i->format(__out, __fmt, __fmt + __len, __flags);
00495               __last = __i->suffix();
00496               if (__flags & regex_constants::format_first_only)
00497                 break;
00498             }
00499           if (!(__flags & regex_constants::format_no_copy))
00500             __out = std::copy(__last.first, __last.second, __out);
00501         }
00502       return __out;
00503     }
00504 
00505   template<typename _Bi_iter,
00506            typename _Ch_type,
00507            typename _Rx_traits>
00508     bool
00509     regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
00510     operator==(const regex_iterator& __rhs) const
00511     {
00512       return (_M_match.empty() && __rhs._M_match.empty())
00513         || (_M_begin == __rhs._M_begin
00514             && _M_end == __rhs._M_end
00515             && _M_pregex == __rhs._M_pregex
00516             && _M_flags == __rhs._M_flags
00517             && _M_match[0] == __rhs._M_match[0]);
00518     }
00519 
00520   template<typename _Bi_iter,
00521            typename _Ch_type,
00522            typename _Rx_traits>
00523     regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
00524     regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
00525     operator++()
00526     {
00527       // In all cases in which the call to regex_search returns true,
00528       // match.prefix().first shall be equal to the previous value of
00529       // match[0].second, and for each index i in the half-open range
00530       // [0, match.size()) for which match[i].matched is true,
00531       // match[i].position() shall return distance(begin, match[i].first).
00532       // [28.12.1.4.5]
00533       if (_M_match[0].matched)
00534         {
00535           auto __start = _M_match[0].second;
00536           auto __prefix_first = _M_match[0].second;
00537           if (_M_match[0].first == _M_match[0].second)
00538             {
00539               if (__start == _M_end)
00540                 {
00541                   _M_match = value_type();
00542                   return *this;
00543                 }
00544               else
00545                 {
00546                   if (regex_search(__start, _M_end, _M_match, *_M_pregex,
00547                                    _M_flags
00548                                    | regex_constants::match_not_null
00549                                    | regex_constants::match_continuous))
00550                     {
00551                       _GLIBCXX_DEBUG_ASSERT(_M_match[0].matched);
00552                       auto& __prefix = _M_match._M_prefix();
00553                       __prefix.first = __prefix_first;
00554                       __prefix.matched = __prefix.first != __prefix.second;
00555                       // [28.12.1.4.5]
00556                       _M_match._M_begin = _M_begin;
00557                       return *this;
00558                     }
00559                   else
00560                     ++__start;
00561                 }
00562             }
00563           _M_flags |= regex_constants::match_prev_avail;
00564           if (regex_search(__start, _M_end, _M_match, *_M_pregex, _M_flags))
00565             {
00566               _GLIBCXX_DEBUG_ASSERT(_M_match[0].matched);
00567               auto& __prefix = _M_match._M_prefix();
00568               __prefix.first = __prefix_first;
00569               __prefix.matched = __prefix.first != __prefix.second;
00570               // [28.12.1.4.5]
00571               _M_match._M_begin = _M_begin;
00572             }
00573           else
00574             _M_match = value_type();
00575         }
00576       return *this;
00577     }
00578 
00579   template<typename _Bi_iter,
00580            typename _Ch_type,
00581            typename _Rx_traits>
00582     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
00583     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
00584     operator=(const regex_token_iterator& __rhs)
00585     {
00586       _M_position = __rhs._M_position;
00587       _M_subs = __rhs._M_subs;
00588       _M_n = __rhs._M_n;
00589       _M_suffix = __rhs._M_suffix;
00590       _M_has_m1 = __rhs._M_has_m1;
00591       _M_normalize_result();
00592       return *this;
00593     }
00594 
00595   template<typename _Bi_iter,
00596            typename _Ch_type,
00597            typename _Rx_traits>
00598     bool
00599     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
00600     operator==(const regex_token_iterator& __rhs) const
00601     {
00602       if (_M_end_of_seq() && __rhs._M_end_of_seq())
00603         return true;
00604       if (_M_suffix.matched && __rhs._M_suffix.matched
00605           && _M_suffix == __rhs._M_suffix)
00606         return true;
00607       if (_M_end_of_seq() || _M_suffix.matched
00608           || __rhs._M_end_of_seq() || __rhs._M_suffix.matched)
00609         return false;
00610       return _M_position == __rhs._M_position
00611         && _M_n == __rhs._M_n
00612         && _M_subs == __rhs._M_subs;
00613     }
00614 
00615   template<typename _Bi_iter,
00616            typename _Ch_type,
00617            typename _Rx_traits>
00618     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
00619     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
00620     operator++()
00621     {
00622       _Position __prev = _M_position;
00623       if (_M_suffix.matched)
00624         *this = regex_token_iterator();
00625       else if (_M_n + 1 < _M_subs.size())
00626         {
00627           _M_n++;
00628           _M_result = &_M_current_match();
00629         }
00630       else
00631         {
00632           _M_n = 0;
00633           ++_M_position;
00634           if (_M_position != _Position())
00635             _M_result = &_M_current_match();
00636           else if (_M_has_m1 && __prev->suffix().length() != 0)
00637             {
00638               _M_suffix.matched = true;
00639               _M_suffix.first = __prev->suffix().first;
00640               _M_suffix.second = __prev->suffix().second;
00641               _M_result = &_M_suffix;
00642             }
00643           else
00644             *this = regex_token_iterator();
00645         }
00646       return *this;
00647     }
00648 
00649   template<typename _Bi_iter,
00650            typename _Ch_type,
00651            typename _Rx_traits>
00652     void
00653     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
00654     _M_init(_Bi_iter __a, _Bi_iter __b)
00655     {
00656       _M_has_m1 = false;
00657       for (auto __it : _M_subs)
00658         if (__it == -1)
00659           {
00660             _M_has_m1 = true;
00661             break;
00662           }
00663       if (_M_position != _Position())
00664         _M_result = &_M_current_match();
00665       else if (_M_has_m1)
00666         {
00667           _M_suffix.matched = true;
00668           _M_suffix.first = __a;
00669           _M_suffix.second = __b;
00670           _M_result = &_M_suffix;
00671         }
00672       else
00673         _M_result = nullptr;
00674     }
00675 
00676 _GLIBCXX_END_NAMESPACE_VERSION
00677 } // namespace
00678