bug#22655: grep -Pz '^' now fails!

Paul Eggert Sat, 19 Nov 2016 00:37:31 -0800

Stephane Chazelas wrote:

one can
use (?m) if he wants ^ to match the beginning of each line in
the NUL-delimited record instead of just the beginning of the
record.

I think the intent is that ^ and $ should match only the line-terminator specified by -z (or by -z's absence). So the sort of usage you describe is unspecified and not supported. That being said, it does make sense to match tricky regular expressions like that line by line, even if this hurts performance. Otherwise, I suspect there are even trickier regular expressions that could reject a buffer full of lines even though it contains matching lines. When in doubt we should avoid optimization, so I installed the attached patch into the master branch. Please give it a try.

From 0e00fe0fc34184b1cdcea92a671eb9ffebb4899b Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sat, 19 Nov 2016 00:25:46 -0800
Subject: [PATCH] grep: -Pz no longer rejects ^, $

Problem reported by Stephane Chazelas (Bug#22655).
* NEWS: Document this.
* doc/grep.texi (grep Programs): Warn about -Pz.
* src/pcresearch.c (reflags): New static var.
(multibyte_locale): Remove static var; now local to Pcompile.
(Pcompile): Check for (? and (* too.  Set reflags instead of
dying when problematic operators are found.
(Pexecute): Use reflags to decide whether searches should
be multiline.
* tests/pcre: Test new behavior.
---
 NEWS             |  4 ++++
 doc/grep.texi    |  4 +++-
 src/pcresearch.c | 34 +++++++++++++++++++++-------------
 tests/pcre       |  3 ++-
 4 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/NEWS b/NEWS
index b3b5049..a95c875 100644
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,10 @@ GNU grep NEWS                                    -*- outline -*-
 
 ** Bug fixes
 
+  grep -Pz no longer rejects patterns containing ^ and $, and is
+  more cautious about special patterns like (?-m) and (*FAIL).
+  [bug introduced in grep-2.23]
+
   grep's use of getprogname no longer causes a build failure on HP-UX.
 
 
diff --git a/doc/grep.texi b/doc/grep.texi
index fcfad42..ac821b4 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -1125,8 +1125,10 @@ expressions), separated by newlines, any of which is to be matched.
 @opindex --perl-regexp
 @cindex matching Perl-compatible regular expressions
 Interpret the pattern as a Perl-compatible regular expression (PCRE).
-This is highly experimental and
+This is highly experimental, particularly when combined with the
+@option{-z} (@option{--null-data}) option, and
 @samp{grep@ -P} may warn of unimplemented features.
+@xref{Other Options}.
 
 @end table
 
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 928c22c..9a13d97 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -32,6 +32,9 @@ enum { NSUB = 300 };
 /* Compiled internal form of a Perl regular expression.  */
 static pcre *cre;
 
+/* PCRE options used to compile the pattern.  */
+static int reflags;
+
 /* Additional information about the pattern.  */
 static pcre_extra *extra;
 
@@ -85,8 +88,6 @@ jit_exec (char const *subject, int search_bytes, int search_offset,
 /* Table, indexed by ! (flag & PCRE_NOTBOL), of whether the empty
    string matches when that flag is used.  */
 static int empty_match[2];
-
-static bool multibyte_locale;
 #endif
 
 void
@@ -112,18 +113,19 @@ Pcompile (char const *pattern, size_t size)
   char *n = re;
   char const *p;
   char const *pnul;
+  bool multibyte_locale = 1 < MB_CUR_MAX;
 
-  if (1 < MB_CUR_MAX)
+  if (multibyte_locale)
     {
       if (! localeinfo.using_utf8)
         die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
-      multibyte_locale = true;
       flags |= PCRE_UTF8;
     }
 
-  /* FIXME: Remove these restrictions.  */
+  /* FIXME: Remove this restriction.  */
   if (memchr (pattern, '\n', size))
     die (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
+
   if (! eolbyte)
     {
       bool escaped = false;
@@ -133,9 +135,12 @@ Pcompile (char const *pattern, size_t size)
           escaped = after_unescaped_left_bracket = false;
         else
           {
-            if (*p == '$' || (*p == '^' && !after_unescaped_left_bracket))
-              die (EXIT_TROUBLE, 0,
-                   _("unescaped ^ or $ not supported with -Pz"));
+            if (*p == '$' || (*p == '^' && !after_unescaped_left_bracket)
+                || (*p == '(' && (p[1] == '?' || p[1] == '*')))
+              {
+                flags = (flags & ~ PCRE_MULTILINE) | PCRE_DOLLAR_ENDONLY;
+                break;
+              }
             escaped = *p == '\\';
             after_unescaped_left_bracket = *p == '[';
           }
@@ -217,12 +222,15 @@ Pexecute (char *buf, size_t size, size_t *match_size,
      error.  */
   char const *subject = buf;
 
-  /* If the input is unibyte or is free of encoding errors a multiline search is
+  /* If the pattern has no problematic operators and the input is
+     unibyte or is free of encoding errors, a multiline search is
      typically more efficient.  Otherwise, a single-line search is
-     typically faster, so that pcre_exec doesn't waste time validating
-     the entire input buffer.  */
-  bool multiline = true;
-  if (multibyte_locale)
+     either less confusing because the problematic operators are
+     interpreted more naturally, or it is typically faster because
+     pcre_exec doesn't waste time validating the entire input
+     buffer.  */
+  bool multiline = (reflags & PCRE_MULTILINE) != 0;
+  if (multiline && (reflags & PCRE_UTF8) != 0)
     {
       multiline = ! buf_has_encoding_errors (buf, size - 1);
       buf[size - 1] = eolbyte;
diff --git a/tests/pcre b/tests/pcre
index 8f3d9a4..653ef22 100755
--- a/tests/pcre
+++ b/tests/pcre
@@ -13,8 +13,9 @@ require_pcre_
 fail=0
 
 echo | grep -P '\s*$' || fail=1
-echo | returns_ 2 grep -zP '\s$' || fail=1
+echo | grep -zP '\s$' || fail=1
 echo '.ab' | returns_ 1 grep -Pwx ab || fail=1
 echo x | grep -Pz '[^a]' || fail=1
+printf 'x\n\0' | returns_ 1 grep -zP 'x$' || fail=1
 
 Exit $fail
-- 
2.7.4

bug#22655: grep -Pz '^' now fails!

Reply via email to