The data structure _BracketMatcher (storing regex like [123a-z]) could be quite slow mainly because of regex_traits. So a result of cache for small range of inputs (char) sounds reasonable. It iterates all 256 inputs and calculate them at regex compile time.
Booted and tested under -m64 and -m32. Thanks! -- Regards, Tim Shen
commit de1e43989be9c9dffebf488e118549cfe2987b9e Author: tim <timshe...@gmail.com> Date: Fri Jan 3 11:43:12 2014 -0500 2014-01-03 Tim Shen <timshe...@gmail.com> * include/bits/regex_compiler.h (_AnyMatcher<>::_AnyMatcher(), _AnyMatcher<>::operator(), _AnyMatcher<>::_M_apply(), _CharMatcher<>::_CharMatcher(), _CharMatcher<>::_M_translate(), _BracketMatcher<>::_BracketMatcher(), _BracketMatcher<>::operator(), _BracketMatcher<>::_M_add_char(), _BracketMatcher<>::_M_add_collating_element(), _BracketMatcher<>::_M_add_equivalence_class(), _BracketMatcher<>::_M_add_character_class(), _BracketMatcher<>::_M_make_range(), _BracketMatcher<>::_M_ready(), _BracketMatcher<>::_M_apply(), _BracketMatcher<>::_M_make_cache()): Fix _AnyMatcher behavior of POSIX style and move _M_flags to template parameter; Add cache for _BracketMatcher. Adjust declarations from here... * include/bits/regex.h (basic_regex<>::imbue()): ...to here. Also, imbuing a regex will trigger a recompilation to rebuild the cache. * include/bits/regex_compiler.tcc (_Compiler<>::_M_atom(), _Compiler<>::_M_bracket_expression()): Adjust matchers' caller for different template bool parameters. * include/bits/regex_executor.h: Remove unnecessary declarations. * include/std/regex: Adjust including orders. * testsuite/28_regex/traits/char/user_defined.cc: New. * testsuite/28_regex/traits/wchar_t/user_defined.cc: New. diff --git a/libstdc++-v3/include/bits/regex.h b/libstdc++-v3/include/bits/regex.h index 4e091e0..ae8e1f5 100644 --- a/libstdc++-v3/include/bits/regex.h +++ b/libstdc++-v3/include/bits/regex.h @@ -30,6 +30,15 @@ namespace std _GLIBCXX_VISIBILITY(default) { +_GLIBCXX_BEGIN_NAMESPACE_VERSION + template<typename, typename> + class basic_regex; + + template<typename, typename> + class match_results; + +_GLIBCXX_END_NAMESPACE_VERSION + namespace __detail { _GLIBCXX_BEGIN_NAMESPACE_VERSION @@ -48,6 +57,56 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION const basic_regex<_CharT, _TraitsT>& __re, regex_constants::match_flag_type __flags); + template<typename, typename, typename, bool> + class _Executor; + + template<typename _Tp> + struct __has_contiguous_iter : std::false_type { }; + + template<typename _Ch, typename _Tr, typename _Alloc> + struct __has_contiguous_iter<std::basic_string<_Ch, _Tr, _Alloc>> + : std::true_type // string<Ch> storage is contiguous + { }; + + template<typename _Tp, typename _Alloc> + struct __has_contiguous_iter<std::vector<_Tp, _Alloc>> + : std::true_type // vector<Tp> storage is contiguous + { }; + + template<typename _Alloc> + struct __has_contiguous_iter<std::vector<bool, _Alloc>> + : std::false_type // vector<bool> storage is not contiguous + { }; + + template<typename _Tp> + struct __is_contiguous_normal_iter : std::false_type { }; + + template<typename _Tp, typename _Cont> + struct + __is_contiguous_normal_iter<__gnu_cxx::__normal_iterator<_Tp, _Cont>> + : __has_contiguous_iter<_Cont>::type + { }; + + template<typename _Iter, typename _TraitsT> + using __enable_if_contiguous_normal_iter + = typename enable_if< __is_contiguous_normal_iter<_Iter>::value, + std::shared_ptr<_NFA<_TraitsT>> >::type; + + template<typename _Iter, typename _TraitsT> + using __disable_if_contiguous_normal_iter + = typename enable_if< !__is_contiguous_normal_iter<_Iter>::value, + std::shared_ptr<_NFA<_TraitsT>> >::type; + + template<typename _FwdIter, typename _TraitsT> + __disable_if_contiguous_normal_iter<_FwdIter, _TraitsT> + __compile_nfa(_FwdIter __first, _FwdIter __last, const _TraitsT& __traits, + regex_constants::syntax_option_type __flags); + + template<typename _Iter, typename _TraitsT> + __enable_if_contiguous_normal_iter<_Iter, _TraitsT> + __compile_nfa(_Iter __first, _Iter __last, const _TraitsT& __traits, + regex_constants::syntax_option_type __flags); + _GLIBCXX_END_NAMESPACE_VERSION } @@ -501,6 +560,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION basic_regex(_FwdIter __first, _FwdIter __last, flag_type __f = ECMAScript) : _M_flags(__f), + _M_original_str(__first, __last), _M_automaton(__detail::__compile_nfa(__first, __last, _M_traits, _M_flags)) { } @@ -637,6 +697,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION flag_type __flags = ECMAScript) { _M_flags = __flags; + _M_original_str.assign(__s.begin(), __s.end()); _M_automaton = __detail::__compile_nfa(__s.begin(), __s.end(), _M_traits, _M_flags); return *this; @@ -701,7 +762,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION */ locale_type imbue(locale_type __loc) - { return _M_traits.imbue(__loc); } + { + auto __ret = _M_traits.imbue(__loc); + this->assign(_M_original_str, _M_flags); + return __ret; + } /** * @brief Gets the locale currently imbued in the regular expression @@ -744,9 +809,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION template<typename, typename, typename, bool> friend class __detail::_Executor; - flag_type _M_flags; - _Rx_traits _M_traits; - _AutomatonPtr _M_automaton; + flag_type _M_flags; + _Rx_traits _M_traits; + basic_string<_Ch_type> _M_original_str; + _AutomatonPtr _M_automaton; }; /** @brief Standard regular expressions. */ diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h index c380131..4ac67df 100644 --- a/libstdc++-v3/include/bits/regex_compiler.h +++ b/libstdc++-v3/include/bits/regex_compiler.h @@ -129,43 +129,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _StackT _M_stack; }; - template<typename _Tp> - struct __has_contiguous_iter : std::false_type { }; - - template<typename _Ch, typename _Tr, typename _Alloc> - struct __has_contiguous_iter<std::basic_string<_Ch, _Tr, _Alloc>> - : std::true_type // string<Ch> storage is contiguous - { }; - - template<typename _Tp, typename _Alloc> - struct __has_contiguous_iter<std::vector<_Tp, _Alloc>> - : std::true_type // vector<Tp> storage is contiguous - { }; - - template<typename _Alloc> - struct __has_contiguous_iter<std::vector<bool, _Alloc>> - : std::false_type // vector<bool> storage is not contiguous - { }; - - template<typename _Tp> - struct __is_contiguous_normal_iter : std::false_type { }; - - template<typename _Tp, typename _Cont> - struct - __is_contiguous_normal_iter<__gnu_cxx::__normal_iterator<_Tp, _Cont>> - : __has_contiguous_iter<_Cont>::type - { }; - - template<typename _Iter, typename _TraitsT> - using __enable_if_contiguous_normal_iter - = typename enable_if< __is_contiguous_normal_iter<_Iter>::value, - std::shared_ptr<_NFA<_TraitsT>> >::type; - - template<typename _Iter, typename _TraitsT> - using __disable_if_contiguous_normal_iter - = typename enable_if< !__is_contiguous_normal_iter<_Iter>::value, - std::shared_ptr<_NFA<_TraitsT>> >::type; - template<typename _FwdIter, typename _TraitsT> inline __disable_if_contiguous_normal_iter<_FwdIter, _TraitsT> __compile_nfa(_FwdIter __first, _FwdIter __last, const _TraitsT& __traits, @@ -185,7 +148,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return __compile_nfa(__cfirst, __cfirst + __len, __traits, __flags); } - template<typename _TraitsT> + template<typename _TraitsT, bool __is_ecma> struct _AnyMatcher { typedef typename _TraitsT::char_type _CharT; @@ -197,25 +160,55 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION bool operator()(_CharT __ch) const + { return _M_apply(__ch, typename is_same<_CharT, char>::type()); } + + bool + _M_apply(_CharT __ch, true_type) const { - return _M_traits.translate(__ch) != '\n' - && _M_traits.translate(__ch) != '\r' - && _M_traits.translate(__ch) != u'\u2028' - && _M_traits.translate(__ch) != u'\u2029'; + auto __c = _M_traits.translate(__ch); + if (__is_ecma) + { + static auto __n = _M_traits.translate('\n'); + static auto __r = _M_traits.translate('\r'); + return __c != __n && __c != __r; + } + else + { + static auto __nul = _M_traits.translate('\0'); + return __c != __nul; + } + } + + bool + _M_apply(_CharT __ch, false_type) const + { + auto __c = _M_traits.translate(__ch); + if (__is_ecma) + { + static auto __n = _M_traits.translate('\n'); + static auto __r = _M_traits.translate('\r'); + static auto __u2028 = _M_traits.translate(u'\u2028'); + static auto __u2029 = _M_traits.translate(u'\u2029'); + return __c != __n && __c != __r && __c != __u2028 + && __c != __u2029; + } + else + { + static auto __nul = _M_traits.translate('\0'); + return __c != __nul; + } } const _TraitsT& _M_traits; }; - template<typename _TraitsT> + template<typename _TraitsT, bool __icase> struct _CharMatcher { typedef typename _TraitsT::char_type _CharT; - typedef regex_constants::syntax_option_type _FlagT; - explicit - _CharMatcher(_CharT __ch, const _TraitsT& __traits, _FlagT __flags) - : _M_traits(__traits), _M_flags(__flags), _M_ch(_M_translate(__ch)) + _CharMatcher(_CharT __ch, const _TraitsT& __traits) + : _M_traits(__traits), _M_ch(_M_translate(__ch)) { } bool @@ -225,44 +218,56 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _CharT _M_translate(_CharT __ch) const { - if (_M_flags & regex_constants::icase) + if (__icase) return _M_traits.translate_nocase(__ch); else return _M_traits.translate(__ch); } const _TraitsT& _M_traits; - _FlagT _M_flags; _CharT _M_ch; }; /// Matches a character range (bracket expression) // TODO: Convert used _M_flags fields to template parameters, including // collate and icase. Avoid using std::set, could use flat_set - // (sorted vector and binary search) instead; use an fixed sized (256) - // vector<bool> for char specialization if necessary. + // (sorted vector and binary search) instead. template<typename _TraitsT> struct _BracketMatcher { + public: typedef typename _TraitsT::char_type _CharT; typedef typename _TraitsT::char_class_type _CharClassT; typedef typename _TraitsT::string_type _StringT; typedef regex_constants::syntax_option_type _FlagT; - explicit + public: _BracketMatcher(bool __is_non_matching, const _TraitsT& __traits, _FlagT __flags) - : _M_traits(__traits), _M_class_set(0), _M_flags(__flags), + : +#ifdef _GLIBCXX_DEBUG + _M_is_ready(false), +#endif + _M_traits(__traits), _M_class_set(0), _M_flags(__flags), _M_is_non_matching(__is_non_matching) { } bool - operator()(_CharT) const; + operator()(_CharT __ch) const + { + _GLIBCXX_DEBUG_ASSERT(_M_is_ready); + return _M_apply(__ch, _IsChar()); + } void _M_add_char(_CharT __c) - { _M_char_set.insert(_M_translate(__c)); } + { + _M_char_set.insert(_M_translate(__c)); +#ifdef _GLIBCXX_DEBUG + _M_is_ready = false; +#endif + } void _M_add_collating_element(const _StringT& __s) @@ -272,6 +277,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION if (__st.empty()) __throw_regex_error(regex_constants::error_collate); _M_char_set.insert(_M_translate(__st[0])); +#ifdef _GLIBCXX_DEBUG + _M_is_ready = false; +#endif } void @@ -284,6 +292,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION __st = _M_traits.transform_primary(__st.data(), __st.data() + __st.size()); _M_equiv_set.insert(__st); +#ifdef _GLIBCXX_DEBUG + _M_is_ready = false; +#endif } void @@ -295,6 +306,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION if (__mask == 0) __throw_regex_error(regex_constants::error_ctype); _M_class_set |= __mask; +#ifdef _GLIBCXX_DEBUG + _M_is_ready = false; +#endif } void @@ -306,8 +320,36 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_get_str(_M_translate(__r)))); else _M_range_set.insert(make_pair(_M_get_str(__l), _M_get_str(__r))); +#ifdef _GLIBCXX_DEBUG + _M_is_ready = false; +#endif } + void + _M_ready() + { + _M_make_cache(_IsChar()); +#ifdef _GLIBCXX_DEBUG + _M_is_ready = true; +#endif + } + + private: + typedef typename is_same<_CharT, char>::type _IsChar; + struct _Dummy { }; + typedef typename conditional<_IsChar::value, + std::bitset<1 << (8 * sizeof(_CharT))>, + _Dummy>::type _CacheT; + typedef typename make_unsigned<_CharT>::type _UnsignedCharT; + + private: + bool + _M_apply(_CharT __ch, false_type) const; + + bool + _M_apply(_CharT __ch, true_type) const + { return _M_cache[static_cast<_UnsignedCharT>(__ch)]; } + _CharT _M_translate(_CharT __c) const { @@ -328,6 +370,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return _M_traits.transform(__s.begin(), __s.end()); } + void + _M_make_cache(true_type) + { + for (int __i = 0; __i < _M_cache.size(); __i++) + _M_cache[static_cast<_UnsignedCharT>(__i)] = + _M_apply(__i, false_type()); + } + + void + _M_make_cache(false_type) + { } + + private: + _CacheT _M_cache; std::set<_CharT> _M_char_set; std::set<_StringT> _M_equiv_set; std::set<pair<_StringT, _StringT>> _M_range_set; @@ -335,6 +391,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _CharClassT _M_class_set; _FlagT _M_flags; bool _M_is_non_matching; +#ifdef _GLIBCXX_DEBUG + bool _M_is_ready; +#endif }; //@} regex-detail diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc index 0d90681..191643a 100644 --- a/libstdc++-v3/include/bits/regex_compiler.tcc +++ b/libstdc++-v3/include/bits/regex_compiler.tcc @@ -284,15 +284,33 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_atom() { if (_M_match_token(_ScannerT::_S_token_anychar)) - _M_stack.push(_StateSeqT(_M_nfa, - _M_nfa._M_insert_matcher - (_AnyMatcher<_TraitsT>(_M_traits)))); + { + if (_M_flags && regex_constants::ECMAScript) + _M_stack.push(_StateSeqT(_M_nfa, + _M_nfa._M_insert_matcher + (_AnyMatcher<_TraitsT, + true>(_M_traits)))); + else + _M_stack.push(_StateSeqT(_M_nfa, + _M_nfa._M_insert_matcher + (_AnyMatcher<_TraitsT, + false>(_M_traits)))); + } else if (_M_try_char()) - _M_stack.push(_StateSeqT(_M_nfa, - _M_nfa._M_insert_matcher - (_CharMatcher<_TraitsT>(_M_value[0], - _M_traits, - _M_flags)))); + { + if (_M_flags && regex_constants::icase) + _M_stack.push(_StateSeqT(_M_nfa, + _M_nfa._M_insert_matcher + (_CharMatcher<_TraitsT, + true>(_M_value[0], + _M_traits)))); + else + _M_stack.push(_StateSeqT(_M_nfa, + _M_nfa._M_insert_matcher + (_CharMatcher<_TraitsT, + false>(_M_value[0], + _M_traits)))); + } else if (_M_match_token(_ScannerT::_S_token_backref)) _M_stack.push(_StateSeqT(_M_nfa, _M_nfa. _M_insert_backref(_M_cur_int_value(10)))); @@ -302,6 +320,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _BMatcherT __matcher(_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits, _M_flags); __matcher._M_add_character_class(_M_value); + __matcher._M_ready(); _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_matcher(std::move(__matcher)))); } @@ -341,6 +360,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _BMatcherT __matcher(__neg, _M_traits, _M_flags); while (!_M_match_token(_ScannerT::_S_token_bracket_end)) _M_expression_term(__matcher); + __matcher._M_ready(); _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_matcher(std::move(__matcher)))); return true; @@ -432,7 +452,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION template<typename _TraitsT> bool - _BracketMatcher<_TraitsT>::operator()(_CharT __ch) const + _BracketMatcher<_TraitsT>::_M_apply(_CharT __ch, false_type) const { bool __ret = false; if (_M_traits.isctype(__ch, _M_class_set) diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h index 980c6de..bed9014 100644 --- a/libstdc++-v3/include/bits/regex_executor.h +++ b/libstdc++-v3/include/bits/regex_executor.h @@ -32,17 +32,6 @@ namespace std _GLIBCXX_VISIBILITY(default) { -_GLIBCXX_BEGIN_NAMESPACE_VERSION - template<typename, typename> - class basic_regex; - - template<typename> - class sub_match; - - template<typename, typename> - class match_results; -_GLIBCXX_END_NAMESPACE_VERSION - namespace __detail { _GLIBCXX_BEGIN_NAMESPACE_VERSION diff --git a/libstdc++-v3/include/std/regex b/libstdc++-v3/include/std/regex index 442a095..9395f50 100644 --- a/libstdc++-v3/include/std/regex +++ b/libstdc++-v3/include/std/regex @@ -56,11 +56,11 @@ #include <bits/regex_constants.h> #include <bits/regex_error.h> -#include <bits/regex_scanner.h> #include <bits/regex_automaton.h> +#include <bits/regex.h> +#include <bits/regex_scanner.h> #include <bits/regex_compiler.h> #include <bits/regex_executor.h> -#include <bits/regex.h> #endif // C++11 diff --git a/libstdc++-v3/testsuite/28_regex/traits/char/user_defined.cc b/libstdc++-v3/testsuite/28_regex/traits/char/user_defined.cc new file mode 100644 index 0000000..4fcbbbf --- /dev/null +++ b/libstdc++-v3/testsuite/28_regex/traits/char/user_defined.cc @@ -0,0 +1,60 @@ +// { dg-options "-std=gnu++11" } +// { dg-do run } + +// +// 2014-01-03 Tim Shen <timshe...@gmail.com> +// +// Copyright (C) 2010-2014 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + +// 28.3 Requirements [re.req] +// 28.2 (4) Table 127 - Regular expression traits class requirements +// 28.7 Class template regex_traits [re.traits] + +#include <regex> +#include <string> +#include <testsuite_hooks.h> + +using namespace std; + +template<typename CharT> + class MyRegexTraits + : public regex_traits<CharT> + { + public: + CharT + translate(CharT c) const + { + return c+1; + } + }; + +void +test01() +{ + bool test __attribute__((unused)) = true; + + basic_regex<char, MyRegexTraits<char>> re("."); + VERIFY(!regex_match("\n", re)); + VERIFY(!regex_match("\r", re)); +} + +int main() +{ + test01(); + return 0; +} diff --git a/libstdc++-v3/testsuite/28_regex/traits/wchar_t/user_defined.cc b/libstdc++-v3/testsuite/28_regex/traits/wchar_t/user_defined.cc new file mode 100644 index 0000000..ba38d0e --- /dev/null +++ b/libstdc++-v3/testsuite/28_regex/traits/wchar_t/user_defined.cc @@ -0,0 +1,62 @@ +// { dg-options "-std=gnu++11" } +// { dg-do run } + +// +// 2014-01-03 Tim Shen <timshe...@gmail.com> +// +// Copyright (C) 2010-2014 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + +// 28.3 Requirements [re.req] +// 28.2 (4) Table 127 - Regular expression traits class requirements +// 28.7 Class template regex_traits [re.traits] + +#include <regex> +#include <string> +#include <testsuite_hooks.h> + +using namespace std; + +template<typename CharT> + class MyRegexTraits + : public regex_traits<CharT> + { + public: + CharT + translate(CharT c) const + { + return c+1; + } + }; + +void +test01() +{ + bool test __attribute__((unused)) = true; + + basic_regex<wchar_t, MyRegexTraits<wchar_t>> re(L"."); + VERIFY(!regex_match(L"\n", re)); + VERIFY(!regex_match(L"\r", re)); + VERIFY(!regex_match(L"\u2028", re)); + VERIFY(!regex_match(L"\u2029", re)); +} + +int main() +{ + test01(); + return 0; +}