libstdc++
regex.tcc
Go to the documentation of this file.
00001 // class template regex -*- C++ -*-
00002 
00003 // Copyright (C) 2013-2014 Free Software Foundation, Inc.
00004 //
00005 // This file is part of the GNU ISO C++ Library.  This library is free
00006 // software; you can redistribute it and/or modify it under the
00007 // terms of the GNU General Public License as published by the
00008 // Free Software Foundation; either version 3, or (at your option)
00009 // any later version.
00010 
00011 // This library is distributed in the hope that it will be useful,
00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 // GNU General Public License for more details.
00015 
00016 // Under Section 7 of GPL version 3, you are granted additional
00017 // permissions described in the GCC Runtime Library Exception, version
00018 // 3.1, as published by the Free Software Foundation.
00019 
00020 // You should have received a copy of the GNU General Public License and
00021 // a copy of the GCC Runtime Library Exception along with this program;
00022 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
00023 // <http://www.gnu.org/licenses/>.
00024 
00025 /**
00026  *  @file bits/regex.tcc
00027  *  This is an internal header file, included by other library headers.
00028  *  Do not attempt to use it directly. @headername{regex}
00029  */
00030 
00031 // See below __regex_algo_impl to get what this is talking about. The default
00032 // value 1 indicated a conservative optimization without giving up worst case
00033 // performance.
00034 #ifndef _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
00035 #define _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 1
00036 #endif
00037 
00038 namespace std _GLIBCXX_VISIBILITY(default)
00039 {
00040 namespace __detail
00041 {
00042 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00043 
00044   // Result of merging regex_match and regex_search.
00045   //
00046   // __policy now can be _S_auto (auto dispatch) and _S_alternate (use
00047   // the other one if possible, for test purpose).
00048   //
00049   // That __match_mode is true means regex_match, else regex_search.
00050   template<typename _BiIter, typename _Alloc,
00051        typename _CharT, typename _TraitsT,
00052        _RegexExecutorPolicy __policy,
00053        bool __match_mode>
00054     bool
00055     __regex_algo_impl(_BiIter                              __s,
00056               _BiIter                              __e,
00057               match_results<_BiIter, _Alloc>&      __m,
00058               const basic_regex<_CharT, _TraitsT>& __re,
00059               regex_constants::match_flag_type     __flags)
00060     {
00061       if (__re._M_automaton == nullptr)
00062     return false;
00063 
00064       typename match_results<_BiIter, _Alloc>::_Base_type& __res = __m;
00065       __m._M_begin = __s;
00066       __res.resize(__re._M_automaton->_M_sub_count() + 2);
00067       for (auto& __it : __res)
00068     __it.matched = false;
00069 
00070       // This function decide which executor to use under given circumstances.
00071       // The _S_auto policy now is the following: if a NFA has no
00072       // back-references and has more than _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
00073       // quantifiers (*, +, ?), the BFS executor will be used, other wise
00074       // DFS executor. This is because DFS executor has a exponential upper
00075       // bound, but better best-case performace. Meanwhile, BFS executor can
00076       // effectively prevent from exponential-long time matching (which must
00077       // contains many quantifiers), but it's slower in average.
00078       //
00079       // For simple regex, BFS executor could be 2 or more times slower than
00080       // DFS executor.
00081       //
00082       // Of course, BFS executor cannot handle back-references.
00083       bool __ret;
00084       if (!__re._M_automaton->_M_has_backref
00085       && (__policy == _RegexExecutorPolicy::_S_alternate
00086           || __re._M_automaton->_M_quant_count
00087         > _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT))
00088     {
00089       _Executor<_BiIter, _Alloc, _TraitsT, false>
00090         __executor(__s, __e, __m, __re, __flags);
00091       if (__match_mode)
00092         __ret = __executor._M_match();
00093       else
00094         __ret = __executor._M_search();
00095     }
00096       else
00097     {
00098       _Executor<_BiIter, _Alloc, _TraitsT, true>
00099         __executor(__s, __e, __m, __re, __flags);
00100       if (__match_mode)
00101         __ret = __executor._M_match();
00102       else
00103         __ret = __executor._M_search();
00104     }
00105       if (__ret)
00106     {
00107       for (auto __it : __res)
00108         if (!__it.matched)
00109           __it.first = __it.second = __e;
00110       auto& __pre = __res[__res.size()-2];
00111       auto& __suf = __res[__res.size()-1];
00112       if (__match_mode)
00113         {
00114           __pre.matched = false;
00115           __pre.first = __s;
00116           __pre.second = __s;
00117           __suf.matched = false;
00118           __suf.first = __e;
00119           __suf.second = __e;
00120         }
00121       else
00122         {
00123           __pre.first = __s;
00124           __pre.second = __res[0].first;
00125           __pre.matched = (__pre.first != __pre.second);
00126           __suf.first = __res[0].second;
00127           __suf.second = __e;
00128           __suf.matched = (__suf.first != __suf.second);
00129         }
00130     }
00131       return __ret;
00132     }
00133 
00134 _GLIBCXX_END_NAMESPACE_VERSION
00135 }
00136 
00137 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00138 
00139   template<typename _Ch_type>
00140   template<typename _Fwd_iter>
00141     typename regex_traits<_Ch_type>::string_type
00142     regex_traits<_Ch_type>::
00143     lookup_collatename(_Fwd_iter __first, _Fwd_iter __last) const
00144     {
00145       typedef std::ctype<char_type> __ctype_type;
00146       const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
00147 
00148       static const char* __collatenames[] =
00149     {
00150       "NUL",
00151       "SOH",
00152       "STX",
00153       "ETX",
00154       "EOT",
00155       "ENQ",
00156       "ACK",
00157       "alert",
00158       "backspace",
00159       "tab",
00160       "newline",
00161       "vertical-tab",
00162       "form-feed",
00163       "carriage-return",
00164       "SO",
00165       "SI",
00166       "DLE",
00167       "DC1",
00168       "DC2",
00169       "DC3",
00170       "DC4",
00171       "NAK",
00172       "SYN",
00173       "ETB",
00174       "CAN",
00175       "EM",
00176       "SUB",
00177       "ESC",
00178       "IS4",
00179       "IS3",
00180       "IS2",
00181       "IS1",
00182       "space",
00183       "exclamation-mark",
00184       "quotation-mark",
00185       "number-sign",
00186       "dollar-sign",
00187       "percent-sign",
00188       "ampersand",
00189       "apostrophe",
00190       "left-parenthesis",
00191       "right-parenthesis",
00192       "asterisk",
00193       "plus-sign",
00194       "comma",
00195       "hyphen",
00196       "period",
00197       "slash",
00198       "zero",
00199       "one",
00200       "two",
00201       "three",
00202       "four",
00203       "five",
00204       "six",
00205       "seven",
00206       "eight",
00207       "nine",
00208       "colon",
00209       "semicolon",
00210       "less-than-sign",
00211       "equals-sign",
00212       "greater-than-sign",
00213       "question-mark",
00214       "commercial-at",
00215       "A",
00216       "B",
00217       "C",
00218       "D",
00219       "E",
00220       "F",
00221       "G",
00222       "H",
00223       "I",
00224       "J",
00225       "K",
00226       "L",
00227       "M",
00228       "N",
00229       "O",
00230       "P",
00231       "Q",
00232       "R",
00233       "S",
00234       "T",
00235       "U",
00236       "V",
00237       "W",
00238       "X",
00239       "Y",
00240       "Z",
00241       "left-square-bracket",
00242       "backslash",
00243       "right-square-bracket",
00244       "circumflex",
00245       "underscore",
00246       "grave-accent",
00247       "a",
00248       "b",
00249       "c",
00250       "d",
00251       "e",
00252       "f",
00253       "g",
00254       "h",
00255       "i",
00256       "j",
00257       "k",
00258       "l",
00259       "m",
00260       "n",
00261       "o",
00262       "p",
00263       "q",
00264       "r",
00265       "s",
00266       "t",
00267       "u",
00268       "v",
00269       "w",
00270       "x",
00271       "y",
00272       "z",
00273       "left-curly-bracket",
00274       "vertical-line",
00275       "right-curly-bracket",
00276       "tilde",
00277       "DEL",
00278     };
00279 
00280       string __s;
00281       for (; __first != __last; ++__first)
00282     __s += __fctyp.narrow(*__first, 0);
00283 
00284       for (const auto& __it : __collatenames)
00285     if (__s == __it)
00286       return string_type(1, __fctyp.widen(
00287         static_cast<char>(&__it - __collatenames)));
00288 
00289       // TODO Add digraph support:
00290       // http://boost.sourceforge.net/libs/regex/doc/collating_names.html
00291 
00292       return string_type();
00293     }
00294 
00295   template<typename _Ch_type>
00296   template<typename _Fwd_iter>
00297     typename regex_traits<_Ch_type>::char_class_type
00298     regex_traits<_Ch_type>::
00299     lookup_classname(_Fwd_iter __first, _Fwd_iter __last, bool __icase) const
00300     {
00301       typedef std::ctype<char_type> __ctype_type;
00302       const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
00303 
00304       // Mappings from class name to class mask.
00305       static const pair<const char*, char_class_type> __classnames[] =
00306       {
00307     {"d", ctype_base::digit},
00308     {"w", {ctype_base::alnum, _RegexMask::_S_under}},
00309     {"s", ctype_base::space},
00310     {"alnum", ctype_base::alnum},
00311     {"alpha", ctype_base::alpha},
00312     {"blank", {0, _RegexMask::_S_blank}},
00313     {"cntrl", ctype_base::cntrl},
00314     {"digit", ctype_base::digit},
00315     {"graph", ctype_base::graph},
00316     {"lower", ctype_base::lower},
00317     {"print", ctype_base::print},
00318     {"punct", ctype_base::punct},
00319     {"space", ctype_base::space},
00320     {"upper", ctype_base::upper},
00321     {"xdigit", ctype_base::xdigit},
00322       };
00323 
00324       string __s;
00325       for (; __first != __last; ++__first)
00326     __s += __fctyp.narrow(__fctyp.tolower(*__first), 0);
00327 
00328       for (const auto& __it : __classnames)
00329     if (__s == __it.first)
00330       {
00331         if (__icase
00332         && ((__it.second
00333              & (ctype_base::lower | ctype_base::upper)) != 0))
00334           return ctype_base::alpha;
00335         return __it.second;
00336       }
00337       return 0;
00338     }
00339 
00340   template<typename _Ch_type>
00341     bool
00342     regex_traits<_Ch_type>::
00343     isctype(_Ch_type __c, char_class_type __f) const
00344     {
00345       typedef std::ctype<char_type> __ctype_type;
00346       const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
00347 
00348       return __fctyp.is(__f._M_base, __c)
00349     // [[:w:]]
00350     || ((__f._M_extended & _RegexMask::_S_under)
00351         && __c == __fctyp.widen('_'))
00352     // [[:blank:]]
00353     || ((__f._M_extended & _RegexMask::_S_blank)
00354         && (__c == __fctyp.widen(' ')
00355         || __c == __fctyp.widen('\t')));
00356     }
00357 
00358   template<typename _Ch_type>
00359     int
00360     regex_traits<_Ch_type>::
00361     value(_Ch_type __ch, int __radix) const
00362     {
00363       std::basic_istringstream<char_type> __is(string_type(1, __ch));
00364       long __v;
00365       if (__radix == 8)
00366     __is >> std::oct;
00367       else if (__radix == 16)
00368     __is >> std::hex;
00369       __is >> __v;
00370       return __is.fail() ? -1 : __v;
00371     }
00372 
00373   template<typename _Bi_iter, typename _Alloc>
00374   template<typename _Out_iter>
00375     _Out_iter match_results<_Bi_iter, _Alloc>::
00376     format(_Out_iter __out,
00377        const match_results<_Bi_iter, _Alloc>::char_type* __fmt_first,
00378        const match_results<_Bi_iter, _Alloc>::char_type* __fmt_last,
00379        match_flag_type __flags) const
00380     {
00381       _GLIBCXX_DEBUG_ASSERT( ready() );
00382       regex_traits<char_type> __traits;
00383       typedef std::ctype<char_type> __ctype_type;
00384       const __ctype_type&
00385     __fctyp(use_facet<__ctype_type>(__traits.getloc()));
00386 
00387       auto __output = [&](size_t __idx)
00388     {
00389       auto& __sub = _Base_type::operator[](__idx);
00390       if (__sub.matched)
00391         __out = std::copy(__sub.first, __sub.second, __out);
00392     };
00393 
00394       if (__flags & regex_constants::format_sed)
00395     {
00396       for (; __fmt_first != __fmt_last;)
00397         if (*__fmt_first == '&')
00398           {
00399         __output(0);
00400         ++__fmt_first;
00401           }
00402         else if (*__fmt_first == '\\')
00403           {
00404         if (++__fmt_first != __fmt_last
00405             && __fctyp.is(__ctype_type::digit, *__fmt_first))
00406           __output(__traits.value(*__fmt_first++, 10));
00407         else
00408           *__out++ = '\\';
00409           }
00410         else
00411           *__out++ = *__fmt_first++;
00412     }
00413       else
00414     {
00415       while (1)
00416         {
00417           auto __next = std::find(__fmt_first, __fmt_last, '$');
00418           if (__next == __fmt_last)
00419         break;
00420 
00421           __out = std::copy(__fmt_first, __next, __out);
00422 
00423           auto __eat = [&](char __ch) -> bool
00424         {
00425           if (*__next == __ch)
00426             {
00427               ++__next;
00428               return true;
00429             }
00430           return false;
00431         };
00432 
00433           if (++__next == __fmt_last)
00434         *__out++ = '$';
00435           else if (__eat('$'))
00436         *__out++ = '$';
00437           else if (__eat('&'))
00438         __output(0);
00439           else if (__eat('`'))
00440         __output(_Base_type::size()-2);
00441           else if (__eat('\''))
00442         __output(_Base_type::size()-1);
00443           else if (__fctyp.is(__ctype_type::digit, *__next))
00444         {
00445           long __num = __traits.value(*__next, 10);
00446           if (++__next != __fmt_last
00447               && __fctyp.is(__ctype_type::digit, *__next))
00448             {
00449               __num *= 10;
00450               __num += __traits.value(*__next++, 10);
00451             }
00452           if (0 <= __num && __num < this->size())
00453             __output(__num);
00454         }
00455           else
00456         *__out++ = '$';
00457           __fmt_first = __next;
00458         }
00459       __out = std::copy(__fmt_first, __fmt_last, __out);
00460     }
00461       return __out;
00462     }
00463 
00464   template<typename _Out_iter, typename _Bi_iter,
00465        typename _Rx_traits, typename _Ch_type>
00466     _Out_iter
00467     regex_replace(_Out_iter __out, _Bi_iter __first, _Bi_iter __last,
00468           const basic_regex<_Ch_type, _Rx_traits>& __e,
00469           const _Ch_type* __fmt,
00470           regex_constants::match_flag_type __flags)
00471     {
00472       typedef regex_iterator<_Bi_iter, _Ch_type, _Rx_traits> _IterT;
00473       _IterT __i(__first, __last, __e, __flags);
00474       _IterT __end;
00475       if (__i == __end)
00476     {
00477       if (!(__flags & regex_constants::format_no_copy))
00478         __out = std::copy(__first, __last, __out);
00479     }
00480       else
00481     {
00482       sub_match<_Bi_iter> __last;
00483       auto __len = char_traits<_Ch_type>::length(__fmt);
00484       for (; __i != __end; ++__i)
00485         {
00486           if (!(__flags & regex_constants::format_no_copy))
00487         __out = std::copy(__i->prefix().first, __i->prefix().second,
00488                   __out);
00489           __out = __i->format(__out, __fmt, __fmt + __len, __flags);
00490           __last = __i->suffix();
00491           if (__flags & regex_constants::format_first_only)
00492         break;
00493         }
00494       if (!(__flags & regex_constants::format_no_copy))
00495         __out = std::copy(__last.first, __last.second, __out);
00496     }
00497       return __out;
00498     }
00499 
00500   template<typename _Bi_iter,
00501        typename _Ch_type,
00502        typename _Rx_traits>
00503     bool
00504     regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
00505     operator==(const regex_iterator& __rhs) const
00506     {
00507       return (_M_match.empty() && __rhs._M_match.empty())
00508     || (_M_begin == __rhs._M_begin
00509         && _M_end == __rhs._M_end
00510         && _M_pregex == __rhs._M_pregex
00511         && _M_flags == __rhs._M_flags
00512         && _M_match[0] == __rhs._M_match[0]);
00513     }
00514 
00515   template<typename _Bi_iter,
00516        typename _Ch_type,
00517        typename _Rx_traits>
00518     regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
00519     regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
00520     operator++()
00521     {
00522       // In all cases in which the call to regex_search returns true,
00523       // match.prefix().first shall be equal to the previous value of
00524       // match[0].second, and for each index i in the half-open range
00525       // [0, match.size()) for which match[i].matched is true,
00526       // match[i].position() shall return distance(begin, match[i].first).
00527       // [28.12.1.4.5]
00528       if (_M_match[0].matched)
00529     {
00530       auto __start = _M_match[0].second;
00531       auto __prefix_first = _M_match[0].second;
00532       if (_M_match[0].first == _M_match[0].second)
00533         {
00534           if (__start == _M_end)
00535         {
00536           _M_match = value_type();
00537           return *this;
00538         }
00539           else
00540         {
00541           if (regex_search(__start, _M_end, _M_match, *_M_pregex,
00542                    _M_flags
00543                    | regex_constants::match_not_null
00544                    | regex_constants::match_continuous))
00545             {
00546               _GLIBCXX_DEBUG_ASSERT(_M_match[0].matched);
00547               auto& __prefix = _M_match.at(_M_match.size());
00548               __prefix.first = __prefix_first;
00549               __prefix.matched = __prefix.first != __prefix.second;
00550               // [28.12.1.4.5]
00551               _M_match._M_begin = _M_begin;
00552               return *this;
00553             }
00554           else
00555             ++__start;
00556         }
00557         }
00558       _M_flags |= regex_constants::match_prev_avail;
00559       if (regex_search(__start, _M_end, _M_match, *_M_pregex, _M_flags))
00560         {
00561           _GLIBCXX_DEBUG_ASSERT(_M_match[0].matched);
00562           auto& __prefix = _M_match.at(_M_match.size());
00563           __prefix.first = __prefix_first;
00564           __prefix.matched = __prefix.first != __prefix.second;
00565           // [28.12.1.4.5]
00566           _M_match._M_begin = _M_begin;
00567         }
00568       else
00569         _M_match = value_type();
00570     }
00571       return *this;
00572     }
00573 
00574   template<typename _Bi_iter,
00575        typename _Ch_type,
00576        typename _Rx_traits>
00577     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
00578     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
00579     operator=(const regex_token_iterator& __rhs)
00580     {
00581       _M_position = __rhs._M_position;
00582       _M_subs = __rhs._M_subs;
00583       _M_n = __rhs._M_n;
00584       _M_suffix = __rhs._M_suffix;
00585       _M_has_m1 = __rhs._M_has_m1;
00586       _M_normalize_result();
00587       return *this;
00588     }
00589 
00590   template<typename _Bi_iter,
00591        typename _Ch_type,
00592        typename _Rx_traits>
00593     bool
00594     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
00595     operator==(const regex_token_iterator& __rhs) const
00596     {
00597       if (_M_end_of_seq() && __rhs._M_end_of_seq())
00598     return true;
00599       if (_M_suffix.matched && __rhs._M_suffix.matched
00600       && _M_suffix == __rhs._M_suffix)
00601     return true;
00602       if (_M_end_of_seq() || _M_suffix.matched
00603       || __rhs._M_end_of_seq() || __rhs._M_suffix.matched)
00604     return false;
00605       return _M_position == __rhs._M_position
00606     && _M_n == __rhs._M_n
00607     && _M_subs == __rhs._M_subs;
00608     }
00609 
00610   template<typename _Bi_iter,
00611        typename _Ch_type,
00612        typename _Rx_traits>
00613     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
00614     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
00615     operator++()
00616     {
00617       _Position __prev = _M_position;
00618       if (_M_suffix.matched)
00619     *this = regex_token_iterator();
00620       else if (_M_n + 1 < _M_subs.size())
00621     {
00622       _M_n++;
00623       _M_result = &_M_current_match();
00624     }
00625       else
00626     {
00627       _M_n = 0;
00628       ++_M_position;
00629       if (_M_position != _Position())
00630         _M_result = &_M_current_match();
00631       else if (_M_has_m1 && __prev->suffix().length() != 0)
00632         {
00633           _M_suffix.matched = true;
00634           _M_suffix.first = __prev->suffix().first;
00635           _M_suffix.second = __prev->suffix().second;
00636           _M_result = &_M_suffix;
00637         }
00638       else
00639         *this = regex_token_iterator();
00640     }
00641       return *this;
00642     }
00643 
00644   template<typename _Bi_iter,
00645        typename _Ch_type,
00646        typename _Rx_traits>
00647     void
00648     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
00649     _M_init(_Bi_iter __a, _Bi_iter __b)
00650     {
00651       _M_has_m1 = false;
00652       for (auto __it : _M_subs)
00653     if (__it == -1)
00654       {
00655         _M_has_m1 = true;
00656         break;
00657       }
00658       if (_M_position != _Position())
00659     _M_result = &_M_current_match();
00660       else if (_M_has_m1)
00661     {
00662       _M_suffix.matched = true;
00663       _M_suffix.first = __a;
00664       _M_suffix.second = __b;
00665       _M_result = &_M_suffix;
00666     }
00667       else
00668     _M_result = nullptr;
00669     }
00670 
00671 _GLIBCXX_END_NAMESPACE_VERSION
00672 } // namespace
00673