bug#20957: 2.21 bug in handling at least one -P regular expression

Paul Eggert Fri, 03 Jul 2015 08:24:55 -0700

vampyre...@gmail.com wrote:

grep 2.21 incorrectly handles a -P regular expression that 2.20 handled 
correctly.

Thanks for reporting that. I installed the attached patches. The first fixes the bug; the second is a minor cleanup.

>From bffb51cfda75eeb1d99c34973d5a45fc1b784d89 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Fri, 3 Jul 2015 08:10:54 -0700
Subject: [PATCH 1/2] grep: don't mishandle left context in -P

http://bugs.gnu.org/20957
* src/pcresearch.c (jit_exec): New arg SEARCH_OFFSET.
Caller changed.
(Pexecute): Pass the left context to pcre_exec, so that PCRE
regular-expression matching can see it.
* tests/pcre-context: New file, to test for this bug.
* tests/Makefile.am (TESTS): Add it.
---
 src/pcresearch.c   | 55 +++++++++++++++++++++++++++++++++---------------------
 tests/Makefile.am  |  1 +
 tests/pcre-context | 38 +++++++++++++++++++++++++++++++++++++
 3 files changed, 73 insertions(+), 21 deletions(-)
 create mode 100755 tests/pcre-context

diff --git a/src/pcresearch.c b/src/pcresearch.c
index aa05e20..b1f8310 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -43,16 +43,18 @@ static pcre_extra *extra;
 static int jit_stack_size;
 # endif
 
-/* Match the already-compiled PCRE pattern against the data in P, of
-   size SEARCH_BYTES, with options OPTIONS, and storing resulting
-   matches into SUB.  Return the (nonnegative) match location or a
-   (negative) error number.  */
+/* Match the already-compiled PCRE pattern against the data in SUBJECT,
+   of size SEARCH_BYTES and starting with offset SEARCH_OFFSET, with
+   options OPTIONS, and storing resulting matches into SUB.  Return
+   the (nonnegative) match location or a (negative) error number.  */
 static int
-jit_exec (char const *p, int search_bytes, int options, int *sub)
+jit_exec (char const *subject, int search_bytes, int search_offset,
+          int options, int *sub)
 {
   while (true)
     {
-      int e = pcre_exec (cre, extra, p, search_bytes, 0, options, sub, NSUB);
+      int e = pcre_exec (cre, extra, subject, search_bytes, search_offset,
+                         options, sub, NSUB);
 
 # if PCRE_STUDY_JIT_COMPILE
       if (e == PCRE_ERROR_JIT_STACKLIMIT
@@ -187,6 +189,11 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
   int e = PCRE_ERROR_NOMATCH;
   char const *line_end;
 
+  /* The search address to pass to pcre_exec.  This is the start of
+     the buffer, or just past the most-recently discovered encoding
+     error.  */
+  char const *subject = buf;
+
   /* If the input type is unknown, the caller is still testing the
      input, which means the current buffer cannot contain encoding
      errors and a multiline search is typically more efficient.
@@ -226,12 +233,13 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
               bol = false;
             }
 
+          int search_offset = p - subject;
+
           /* Check for an empty match; this is faster than letting
              pcre_exec do it.  */
-          int search_bytes = line_end - p;
-          if (search_bytes == 0)
+          if (p == line_end)
             {
-              sub[0] = sub[1] = 0;
+              sub[0] = sub[1] = search_offset;
               e = empty_match[bol];
               break;
             }
@@ -242,17 +250,18 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
           if (multiline)
             options |= PCRE_NO_UTF8_CHECK;
 
-          e = jit_exec (p, search_bytes, options, sub);
+          e = jit_exec (subject, line_end - subject, search_offset,
+                        options, sub);
           if (e != PCRE_ERROR_BADUTF8)
             {
               if (0 < e && multiline && sub[1] - sub[0] != 0)
                 {
-                  char const *nl = memchr (p + sub[0], eolbyte,
+                  char const *nl = memchr (subject + sub[0], eolbyte,
                                            sub[1] - sub[0]);
                   if (nl)
                     {
                       /* This match crosses a line boundary; reject it.  */
-                      p += sub[0];
+                      p = subject + sub[0];
                       line_end = nl;
                       continue;
                     }
@@ -261,22 +270,26 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
             }
           int valid_bytes = sub[0];
 
-          /* Try to match the string before the encoding error.
-             Again, handle the empty-match case specially, for speed.  */
-          if (valid_bytes == 0)
+          /* Try to match the string before the encoding error.  */
+          if (valid_bytes < search_offset)
+            e = PCRE_ERROR_NOMATCH;
+          else if (valid_bytes == 0)
             {
+              /* Handle the empty-match case specially, for speed.
+                 This optimization is valid if VALID_BYTES is zero,
+                 which means SEARCH_OFFSET is also zero.  */
               sub[1] = 0;
               e = empty_match[bol];
             }
           else
-            e = pcre_exec (cre, extra, p, valid_bytes, 0,
-                           options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL,
-                           sub, NSUB);
+            e = jit_exec (subject, valid_bytes, search_offset,
+                          options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL, sub);
+
           if (e != PCRE_ERROR_NOMATCH)
             break;
 
           /* Treat the encoding error as data that cannot match.  */
-          p += valid_bytes + 1;
+          p = subject += valid_bytes + 1;
           bol = false;
         }
 
@@ -315,8 +328,8 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
     }
   else
     {
-      char const *matchbeg = p + sub[0];
-      char const *matchend = p + sub[1];
+      char const *matchbeg = subject + sub[0];
+      char const *matchend = subject + sub[1];
       char const *beg;
       char const *end;
       if (start_ptr)
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 2d7ebf6..7bceac7 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -92,6 +92,7 @@ TESTS =						\
   options					\
   pcre						\
   pcre-abort					\
+  pcre-context					\
   pcre-infloop					\
   pcre-invalid-utf8-input			\
   pcre-jitstack					\
diff --git a/tests/pcre-context b/tests/pcre-context
new file mode 100755
index 0000000..f0c96e0
--- /dev/null
+++ b/tests/pcre-context
@@ -0,0 +1,38 @@
+#!/bin/sh
+# Test Perl regex with context
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+require_pcre_
+
+cat >in <<'EOF'
+Preceded by 0 empty lines.
+
+Preceded by 1 empty line.
+
+
+Preceded by 2 empty lines.
+
+
+
+Preceded by 3 empty lines.
+
+
+
+
+Preceded by 4 empty lines.
+
+EOF
+test $? -eq 0 || framework_failure_
+
+cat >exp <<'EOF'
+Preceded by 2 empty lines.
+Preceded by 3 empty lines.
+Preceded by 4 empty lines.
+EOF
+test $? -eq 0 || framework_failure_
+
+fail=0
+
+grep -Pzo '(?<=\n\n\n).*' in >out || fail_ 'grep -Pzo failed'
+compare exp out || fail=1
+
+Exit $fail
-- 
2.1.0

>From 36f8a291f87368072fd382cdcd9255b4163d6e1b Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Fri, 3 Jul 2015 08:11:53 -0700
Subject: [PATCH 2/2] grep: simplify print_line_middle slightly

* src/grep.c (print_line_middle): Simplify.
---
 src/grep.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/grep.c b/src/grep.c
index d1581e3..778dbcb 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -1022,8 +1022,8 @@ print_line_middle (const char *beg, const char *lim,
   const char *mid = NULL;
 
   while (cur < lim
-         && ((match_offset = execute (beg, lim - beg, &match_size,
-                                      beg + (cur - beg))) != (size_t) -1))
+         && ((match_offset = execute (beg, lim - beg, &match_size, cur))
+             != (size_t) -1))
     {
       char const *b = beg + match_offset;
 
-- 
2.1.0

bug#20957: 2.21 bug in handling at least one -P regular expression

Reply via email to