On 27/08/15 22:18 -0700, Tim Shen wrote:
Bootstrapped and tested.

Thanks!


--
Regards,
Tim Shen

commit 53c1caff442e97a18652ec8b1d984341168fd98d
Author: Tim Shen <tims...@google.com>
Date:   Thu Aug 27 21:42:40 2015 -0700

        PR libstdc++/67361
        * include/bits/regex_error.h: Add __throw_regex_error that
        supports string.
        * include/bits/regex_automaton.h: Add more specific exception
        messages.
        * include/bits/regex_automaton.tcc: Likewise.
        * include/bits/regex_compiler.h: Likewise.
        * include/bits/regex_compiler.tcc: Likewise.
        * include/bits/regex_scanner.h: Likewise.
        * include/bits/regex_scanner.tcc: Likewise.

Nice, thanks for doing this!

@@ -158,10 +159,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      // _M_paren_stack is {1, 3}, for incomplete "(a.." and "(c..". At this
      // time, "\\2" is valid, but "\\1" and "\\3" are not.
      if (__index >= _M_subexpr_count)
-       __throw_regex_error(regex_constants::error_backref);
+       __throw_regex_error(
+         regex_constants::error_backref,
+         "Back-reference index exceeds current sub-expression count.");
      for (auto __it : this->_M_paren_stack)
        if (__index == __it)
-         __throw_regex_error(regex_constants::error_backref);
+         __throw_regex_error(
+           regex_constants::error_backref,
+           "Back-reference refered to an opened sub-expression.");

Should be "referred".

And one of the other strings in another throw says "befoer".


diff --git a/libstdc++-v3/include/bits/regex_compiler.h 
b/libstdc++-v3/include/bits/regex_compiler.h
index 0cb0c04..12ffabe 100644
--- a/libstdc++-v3/include/bits/regex_compiler.h
+++ b/libstdc++-v3/include/bits/regex_compiler.h
@@ -397,7 +397,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        auto __st = _M_traits.lookup_collatename(__s.data(),
                                                 __s.data() + __s.size());
        if (__st.empty())
-         __throw_regex_error(regex_constants::error_collate);
+         __throw_regex_error(regex_constants::error_collate,
+                             string("Invalid collate element: "));
        _M_char_set.push_back(_M_translator._M_translate(__st[0]));
#ifdef _GLIBCXX_DEBUG
        _M_is_ready = false;

There seems to be no need to construct a std::string here, just pass a
const char* (see below).

Also, this string ends in a colon, whereas most end in a period. Any
reason for the difference?


@@ -411,7 +412,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        auto __st = _M_traits.lookup_collatename(__s.data(),
                                                 __s.data() + __s.size());
        if (__st.empty())
-         __throw_regex_error(regex_constants::error_collate);
+         __throw_regex_error(regex_constants::error_collate,
+                             string("Invalid equivalence class."));
        __st = _M_traits.transform_primary(__st.data(),
                                           __st.data() + __st.size());
        _M_equiv_set.push_back(__st);

Just pass const char*.

@@ -428,7 +430,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
                                                 __s.data() + __s.size(),
                                                 __icase);
        if (__mask == 0)
-         __throw_regex_error(regex_constants::error_ctype);
+         __throw_regex_error(regex_constants::error_collate,
+                             string("Invalid character class."));
        if (!__neg)
          _M_class_set |= __mask;
        else

Ditto.

@@ -442,7 +445,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      _M_make_range(_CharT __l, _CharT __r)
      {
        if (__l > __r)
-         __throw_regex_error(regex_constants::error_range);
+         __throw_regex_error(regex_constants::error_range,
+                             string("Invalid range in bracket expression."));
        _M_range_set.push_back(make_pair(_M_translator._M_transform(__l),
                                         _M_translator._M_transform(__r)));
#ifdef _GLIBCXX_DEBUG

Ditto.

diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc 
b/libstdc++-v3/include/bits/regex_compiler.tcc
index 9a62311..019ca42 100644
--- a/libstdc++-v3/include/bits/regex_compiler.tcc
+++ b/libstdc++-v3/include/bits/regex_compiler.tcc
@@ -77,16 +77,26 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      _M_traits(_M_nfa->_M_traits),
      _M_ctype(std::use_facet<_CtypeT>(__loc))
    {
-      _StateSeqT __r(*_M_nfa, _M_nfa->_M_start());
-      __r._M_append(_M_nfa->_M_insert_subexpr_begin());
-      this->_M_disjunction();
-      if (!_M_match_token(_ScannerT::_S_token_eof))
-       __throw_regex_error(regex_constants::error_paren);
-      __r._M_append(_M_pop());
-      _GLIBCXX_DEBUG_ASSERT(_M_stack.empty());
-      __r._M_append(_M_nfa->_M_insert_subexpr_end());
-      __r._M_append(_M_nfa->_M_insert_accept());
-      _M_nfa->_M_eliminate_dummy();
+      __try
+       {
+         _StateSeqT __r(*_M_nfa, _M_nfa->_M_start());
+         __r._M_append(_M_nfa->_M_insert_subexpr_begin());
+         this->_M_disjunction();
+         if (!_M_match_token(_ScannerT::_S_token_eof))
+           __throw_regex_error(regex_constants::error_paren,
+                               "Unexpected end of regex.");
+         __r._M_append(_M_pop());
+         _GLIBCXX_DEBUG_ASSERT(_M_stack.empty());
+         __r._M_append(_M_nfa->_M_insert_subexpr_end());
+         __r._M_append(_M_nfa->_M_insert_accept());
+         _M_nfa->_M_eliminate_dummy();
+       }
+      __catch(std::regex_error __e)
+       {
+         __throw_regex_error(__e.code(),
+                             string(__e.what()) + " Location: \""
+                             + _M_scanner._M_get_location_string() + "\"");
+       }
    }

  template<typename _TraitsT>

I wonder if we want to make this more efficient by adding a private
member to regex_error that would allow information to be appended to
the string, rather then creating a new regex_error with a new string.

diff --git a/libstdc++-v3/include/bits/regex_error.h 
b/libstdc++-v3/include/bits/regex_error.h
index 778edd5..0dd1fdf 100644
--- a/libstdc++-v3/include/bits/regex_error.h
+++ b/libstdc++-v3/include/bits/regex_error.h
@@ -155,6 +155,12 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
    regex_constants::error_type
    code() const
    { return _M_code; }
+
+  private:
+    regex_error(regex_constants::error_type __ecode, const string& __what)
+    : std::runtime_error(__what), _M_code(__ecode) { }
+
+    friend void __throw_regex_error(regex_constants::error_type, const 
string&);
  };

  //@} // group regex
@@ -162,5 +168,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
  void
  __throw_regex_error(regex_constants::error_type __ecode);

+  inline void
+  __throw_regex_error(regex_constants::error_type __ecode, const string& 
__what)
+  { _GLIBCXX_THROW_OR_ABORT(regex_error(__ecode, __what)); }
+
_GLIBCXX_END_NAMESPACE_VERSION
} // namespace std

I suggest adding another overload that takes a const char* rather than
std::string. The reason is that when using the new ABI this function
will take a std::__cxx11::string, so calling it will allocate memory
for the string data, then that string is passed to the regex_error
constructor which has to convert it internally to an old std::string,
which has to allocate a second time.

If there is an overload taking a const char* then that can be passed
to the regex_error constructor and only one allocation will be done.

(I have considered making it possible for exceptions to move the
memory from a new string into an their old string member, but that
isn't done currently.)


diff --git a/libstdc++-v3/include/bits/regex_scanner.h 
b/libstdc++-v3/include/bits/regex_scanner.h
index b47103e..7795dd2 100644
--- a/libstdc++-v3/include/bits/regex_scanner.h
+++ b/libstdc++-v3/include/bits/regex_scanner.h
@@ -220,6 +220,22 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      _M_get_value() const
      { return _M_value; }

+      string
+      _M_get_location_string() const
+      {
+       auto __left = std::max(_M_begin, _M_current - 2);
+       auto __right = std::min(_M_end, _M_current + 3);
+       static constexpr char __here[] = ">>><<<";

I don't think there's any advantage to using a static here, it doesn't
need to be a global symbol, and with optimisation enabled we get the
same code from just const char __here[] = ">>><<<";


@@ -247,6 +263,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      void
      _M_eat_class(char);

+      _IterT                        _M_begin;
      _IterT                        _M_current;
      _IterT                        _M_end;
      _CtypeT&                      _M_ctype;

This looks like an ABI change, as the size of the type changes.

If I understand correctly this is only needed for the location info,
we could still have nice human readable text in the exceptions without
this, right?


Reply via email to