pammon created this revision.
Herald added a reviewer: EricWF.

In regex, lookahead assertions like '(?=stuff)' and '(?!stuff)' are
implemented by constructing a child regular expression 'stuff' and
matching that. If the child regular expression contained a
backreference, matching would either fail a runtime assertion or
reference the wrong capture group, because the child was ignorant of
the capture groups of its parent. For example, matching against
/(x)(?=\1)/ would fail a runtime assertion.

Address this by propagating the parent's submatches into the child, so
that backreferences in the child refer to the correct capture groups of
the parent. This also allows us to eliminate the __mexp_ field of
__lookahead, because the child expression now shares the entire
submatch array with the parent.


https://reviews.llvm.org/D32635

Files:
  include/regex
  test/std/re/re.alg/re.alg.match/ecma.pass.cpp

Index: test/std/re/re.alg/re.alg.match/ecma.pass.cpp
===================================================================
--- test/std/re/re.alg/re.alg.match/ecma.pass.cpp
+++ test/std/re/re.alg/re.alg.match/ecma.pass.cpp
@@ -637,6 +637,22 @@
         assert(m.str(0) == s);
     }
     {
+      std::cmatch m;
+      const char s[] = "abcabc";
+      assert(std::regex_match(s, m, std::regex("(.+)(?=\\1)(\\1)")));
+      assert(m.size() == 3);
+      assert(m.str(1) == "abc");
+      assert(m.str(2) == "abc");
+    }
+    {
+      std::cmatch m;
+      const char s[] = "aa";
+      assert(std::regex_match(s, m, std::regex("(a+)(?!\\1)(a*)")));
+      assert(m.size() == 3);
+      assert(m.str(1) == "aa");
+      assert(m.str(2) == "");
+    }
+    {
         std::cmatch m;
         const char s[] = "foobar";
         assert(std::regex_match(s, m, std::regex("[^\\0]*")));
Index: include/regex
===================================================================
--- include/regex
+++ include/regex
@@ -2826,7 +2826,7 @@
     void __push_end_marked_subexpression(unsigned);
     void __push_empty();
     void __push_word_boundary(bool);
-    void __push_lookahead(const basic_regex&, bool, unsigned);
+    void __push_lookahead(basic_regex, bool);
 
     template <class _Allocator>
         bool
@@ -2843,6 +2843,7 @@
         bool
         __match_at_start_ecma(const _CharT* __first, const _CharT* __last,
                  match_results<const _CharT*, _Allocator>& __m,
+                 const vector<sub_match<const _CharT *>> &incoming_sub_matches,
                  regex_constants::match_flag_type __flags, bool) const;
     template <class _Allocator>
         bool
@@ -2964,17 +2965,16 @@
     typedef __owns_one_state<_CharT> base;
 
     basic_regex<_CharT, _Traits> __exp_;
-    unsigned __mexp_;
     bool __invert_;
 
     __lookahead(const __lookahead&);
     __lookahead& operator=(const __lookahead&);
 public:
     typedef _VSTD::__state<_CharT> __state;
 
     _LIBCPP_INLINE_VISIBILITY
-    __lookahead(const basic_regex<_CharT, _Traits>& __exp, bool __invert, __node<_CharT>* __s, unsigned __mexp)
-        : base(__s), __exp_(__exp), __mexp_(__mexp), __invert_(__invert) {}
+    __lookahead(basic_regex<_CharT, _Traits> __exp, bool __invert, __node<_CharT>* __s)
+        : base(__s), __exp_(move(__exp)), __invert_(__invert) {}
 
     virtual void __exec(__state&) const;
 };
@@ -2987,16 +2987,18 @@
     __m.__init(1 + __exp_.mark_count(), __s.__current_, __s.__last_);
     bool __matched = __exp_.__match_at_start_ecma(
         __s.__current_, __s.__last_,
-        __m,
+        __m, __s.__sub_matches_,
         (__s.__flags_ | regex_constants::match_continuous) &
         ~regex_constants::__full_match,
         __s.__at_first_ && __s.__current_ == __s.__first_);
     if (__matched != __invert_)
     {
         __s.__do_ = __state::__accept_but_not_consume;
         __s.__node_ = this->first();
-        for (unsigned __i = 1; __i < __m.size(); ++__i) {
-            __s.__sub_matches_[__mexp_ + __i - 1] = __m.__matches_[__i];
+        if (__matched) {
+            for (unsigned __i = 1; __i < __m.size(); ++__i) {
+                __s.__sub_matches_[__i - 1] = __m.__matches_[__i];
+            }
         }
     }
     else
@@ -4168,26 +4170,16 @@
                         switch (*__temp)
                         {
                         case '=':
-                            {
-                                basic_regex __exp;
-                                __exp.__flags_ = __flags_;
-                                __temp = __exp.__parse(++__temp, __last);
-                                unsigned __mexp = __exp.__marked_count_;
-                                __push_lookahead(_VSTD::move(__exp), false, __marked_count_);
-                                __marked_count_ += __mexp;
-                                if (__temp == __last || *__temp != ')')
-                                    __throw_regex_error<regex_constants::error_paren>();
-                                __first = ++__temp;
-                            }
-                            break;
                         case '!':
                             {
+                                bool __invert = (*__temp == '!');
                                 basic_regex __exp;
                                 __exp.__flags_ = __flags_;
+                                __exp.__marked_count_ = __marked_count_;
                                 __temp = __exp.__parse(++__temp, __last);
                                 unsigned __mexp = __exp.__marked_count_;
-                                __push_lookahead(_VSTD::move(__exp), true, __marked_count_);
-                                __marked_count_ += __mexp;
+                                __push_lookahead(_VSTD::move(__exp), __invert);
+                                __marked_count_ = __mexp;
                                 if (__temp == __last || *__temp != ')')
                                     __throw_regex_error<regex_constants::error_paren>();
                                 __first = ++__temp;
@@ -4749,12 +4741,11 @@
 
 template <class _CharT, class _Traits>
 void
-basic_regex<_CharT, _Traits>::__push_lookahead(const basic_regex& __exp,
-                                               bool __invert,
-                                               unsigned __mexp)
+basic_regex<_CharT, _Traits>::__push_lookahead(basic_regex __exp,
+                                               bool __invert)
 {
-    __end_->first() = new __lookahead<_CharT, _Traits>(__exp, __invert,
-                                                           __end_->first(), __mexp);
+    __end_->first() = new __lookahead<_CharT, _Traits>(move(__exp), __invert,
+                                                           __end_->first());
     __end_ = static_cast<__owns_one_state<_CharT>*>(__end_->first());
 }
 
@@ -5529,6 +5520,7 @@
 basic_regex<_CharT, _Traits>::__match_at_start_ecma(
         const _CharT* __first, const _CharT* __last,
         match_results<const _CharT*, _Allocator>& __m,
+        const vector<sub_match<const _CharT *>> &incoming_sub_matches,
         regex_constants::match_flag_type __flags, bool __at_first) const
 {
     vector<__state> __states;
@@ -5539,13 +5531,15 @@
         __unmatched.first   = __last;
         __unmatched.second  = __last;
         __unmatched.matched = false;
+        vector<sub_match<const _CharT *>> sub_matches = incoming_sub_matches;
+        sub_matches.resize(mark_count(), __unmatched);
 
         __states.push_back(__state());
         __states.back().__do_ = 0;
         __states.back().__first_ = __first;
         __states.back().__current_ = __first;
         __states.back().__last_ = __last;
-        __states.back().__sub_matches_.resize(mark_count(), __unmatched);
+        __states.back().__sub_matches_ = move(sub_matches);
         __states.back().__loop_data_.resize(__loop_count());
         __states.back().__node_ = __st;
         __states.back().__flags_ = __flags;
@@ -5798,7 +5792,7 @@
         regex_constants::match_flag_type __flags, bool __at_first) const
 {
     if ((__flags_ & 0x1F0) == ECMAScript)
-        return __match_at_start_ecma(__first, __last, __m, __flags, __at_first);
+        return __match_at_start_ecma(__first, __last, __m, {}, __flags, __at_first);
     if (mark_count() == 0)
         return __match_at_start_posix_nosubs(__first, __last, __m, __flags, __at_first);
     return __match_at_start_posix_subs(__first, __last, __m, __flags, __at_first);
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
  • [PATCH] D32635: [libcxx] regex... Peter Ammon via Phabricator via cfe-commits

Reply via email to