std::regex builds a cache of equivalence classes by calling std::regex_traits<char>::transform_primary(c) for every char, which then calls std::collate<char>::transform, which in turn calls strxfrm. On several targets strxfrm fails for non-ASCII characters. Because strxfrm has no return value reserved to indicate an error, some implementations return INT_MAX or SIZE_MAX on failure. This causes std::collate::transform to try to allocate a huge buffer, which is either very slow or throws std::bad_alloc. We should check errno after calling strxfrm to detect errors and then throw a more appropriate exception instead of trying to allocate a huge buffer.
Unfortunately the std::collate<C>::_M_transform function has a non-throwing exception specifier, so we can't do the error handling there; it has to be done in its caller, std::collate::do_transform, instead. As well as checking errno, this patch changes std::collate::do_transform to use __builtin_alloca for small inputs, and to use RAII to deallocate the buffers used for large inputs. This change isn't sufficient to fix the three std::regex bugs caused by the lack of error handling in std::collate::do_transform; we also need to make std::regex_traits::transform_primary handle exceptions. This change also attempts to make transform_primary closer to the effects described in the standard, by not even attempting to use std::collate if the locale's std::collate facet has been replaced (see PR 118105). Arguably, we should not even try to call transform_primary for any char values over 127, since they're never valid as characters on their own in locales that use UTF-8 or 7-bit ASCII, and probably for other charsets too. Handling 128 exceptions for every std::regex compilation is very inefficient, but at least it now works instead of failing with std::bad_alloc, and no longer allocates 128 x 2GB. Maybe for C++26 we could check the locale's std::text_encoding and use that to decide whether to cache equivalence classes for char values over 127. I'm unsure if std::regex_traits<C>::transform_primary is supposed to convert the string to lower case or not. The general regex traits requirements ([re.req] p20) do say "when character case is not considered" but the specification for the std::regex_traits<char> and std::regex_traits<wchar_t> specializations ([re.traits] p7) doesn't say anything about that. libstdc++-v3/ChangeLog: PR libstdc++/85824 PR libstdc++/94409 PR libstdc++/98723 PR libstdc++/118105 * include/bits/locale_classes.tcc (collate::do_transform): Check errno after calling _M_transform. Use RAII type to manage the buffer and to restore errno. 
* include/bits/regex.h (regex_traits::transform_primary): Handle exceptions from std::collate::transform and do not try to use std::collate for user-defined facets. --- Tested x86_64-linux. libstdc++-v3/include/bits/locale_classes.tcc | 94 ++++++++++++++------ libstdc++-v3/include/bits/regex.h | 43 ++++++--- 2 files changed, 96 insertions(+), 41 deletions(-) diff --git a/libstdc++-v3/include/bits/locale_classes.tcc b/libstdc++-v3/include/bits/locale_classes.tcc index 2b78008e9ae..6e8f27bf0d9 100644 --- a/libstdc++-v3/include/bits/locale_classes.tcc +++ b/libstdc++-v3/include/bits/locale_classes.tcc @@ -37,6 +37,9 @@ #ifdef _GLIBCXX_SYSHDR #pragma GCC system_header #endif + +#include <cerrno> + #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wc++11-extensions" // extern template #pragma GCC diagnostic ignored "-Wvariadic-macros" @@ -295,43 +298,76 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION size_t __len = (__hi - __lo) * 2; - _CharT* __c = new _CharT[__len]; + struct _Buf + { + _Buf(size_t __n, void* __buf, int __e) + : _M_c(__buf ? (_CharT*)__buf : new _CharT[__n]), + _M_stackbuf(__buf), + _M_errno(__e) + { } - __try + ~_Buf() { - // strxfrm stops when it sees a nul character so we break - // the string into zero-terminated substrings and pass those - // to strxfrm. - for (;;) + if (_M_c != _M_stackbuf) + delete[] _M_c; + if (errno == 0) + errno = _M_errno; + } + + void _M_realloc(size_t __len) + { + _CharT* __p = new _CharT[__len]; + if (_M_c != _M_stackbuf) + delete[] _M_c; + _M_c = __p; + } + + _CharT* _M_c; + void* const _M_stackbuf; + int _M_errno; + }; + + const size_t __bytes = __len * sizeof(_CharT); + _Buf __buf(__len, __bytes <= 256 ? __builtin_alloca(__bytes) : 0, errno); + errno = 0; + + // strxfrm stops when it sees a nul character so we break + // the string into zero-terminated substrings and pass those + // to strxfrm. + for (;;) + { + // First try a buffer perhaps big enough. 
+ size_t __res = _M_transform(__buf._M_c, __p, __len); + // If the buffer was not large enough, try again with the + // correct size. + if (__res >= __len) { - // First try a buffer perhaps big enough. - size_t __res = _M_transform(__c, __p, __len); - // If the buffer was not large enough, try again with the - // correct size. - if (__res >= __len) + if (__builtin_expect(errno, 0)) { - __len = __res + 1; - delete [] __c, __c = 0; - __c = new _CharT[__len]; - __res = _M_transform(__c, __p, __len); +#if __cpp_exceptions + __throw_system_error(errno); +#else + // std::regex can call this function internally with + // char values that always fail, so we don't want to + // use _GLIBCXX_THROW_OR_ABORT here. + __ret.clear(); + break; +#endif } - __ret.append(__c, __res); - __p += char_traits<_CharT>::length(__p); - if (__p == __pend) - break; - - __p++; - __ret.push_back(_CharT()); + __len = __res + 1; + __buf._M_realloc(__len); + __res = _M_transform(__buf._M_c, __p, __len); } - } - __catch(...) - { - delete [] __c; - __throw_exception_again; - } - delete [] __c; + __ret.append(__buf._M_c, __res); + __p += char_traits<_CharT>::length(__p); + if (__p == __pend) + break; + + __p++; + __ret.push_back(_CharT()); + } return __ret; } diff --git a/libstdc++-v3/include/bits/regex.h b/libstdc++-v3/include/bits/regex.h index 68ff479c905..57ea68e7ee9 100644 --- a/libstdc++-v3/include/bits/regex.h +++ b/libstdc++-v3/include/bits/regex.h @@ -253,9 +253,9 @@ _GLIBCXX_BEGIN_NAMESPACE_CXX11 * @param __first beginning of the character sequence. * @param __last one-past-the-end of the character sequence. 
* - * Effects: if typeid(use_facet<collate<_Ch_type> >) == - * typeid(collate_byname<_Ch_type>) and the form of the sort key - * returned by collate_byname<_Ch_type>::transform(__first, __last) + * Effects: if `typeid(use_facet<collate<_Ch_type>>(getloc())) == + * typeid(collate_byname<_Ch_type>)` and the form of the sort key + * returned by `collate_byname<_Ch_type>::transform(__first, __last)` * is known and can be converted into a primary sort key * then returns that key, otherwise returns an empty string. * @@ -265,17 +265,36 @@ _GLIBCXX_BEGIN_NAMESPACE_CXX11 string_type transform_primary(_Fwd_iter __first, _Fwd_iter __last) const { + string_type __ret; +#if __cpp_rtti + const auto& __fclt = use_facet<collate<char_type>>(_M_locale); + if (typeid(__fclt) != typeid(collate<char_type>)) // FIXME: PR 118110 + return __ret; + // TODO : this is not entirely correct. // This function requires extra support from the platform. - // - // Read http://gcc.gnu.org/ml/libstdc++/2013-09/msg00117.html and - // http://www.open-std.org/Jtc1/sc22/wg21/docs/papers/2003/n1429.htm - // for details. - typedef std::ctype<char_type> __ctype_type; - const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale)); - _GLIBCXX_STD_C::vector<char_type> __s(__first, __last); - __fctyp.tolower(__s.data(), __s.data() + __s.size()); - return this->transform(__s.data(), __s.data() + __s.size()); + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=118105 + + const auto& __fctyp(use_facet<ctype<char_type>>(_M_locale)); + basic_string<char_type> __s(__first, __last); + const auto __p = const_cast<char_type*>(__s.c_str()); + const auto __pend = __p + __s.size(); + // XXX: should we use tolower here? The regex traits requirements + // say that transform_primary ignores case, but the specification + // for the std::regex_traits<char> and std::regex_traits<wchar_t> + // specializations don't, they seem to suggest just using the + // collate::transform function to get a primary sort key. 
+ __fctyp.tolower(__p, __pend); + + __try + { + __ret = __fclt.transform(__p, __pend); + } + __catch (const exception&) + { + } +#endif + return __ret; } /** -- 2.47.1