bug#22655: grep -Pz '^' now fails!

Paul Eggert Sat, 19 Nov 2016 23:59:02 -0800

Stephane Chazelas wrote:

I don't find a x220 factor, more like a x2.5 factor:

I think I found the factor-of-hundreds slowdown, and fixed it in the 2ndattached patch.

When I tried your benchmark with pcregrep (pcre 8.39, configured with--enable-unicode-properties), and with ./grep0 (which has the PCRE_MULTILINEimplementation, i.e., commit da94c91a81fc63275371d0580d8688b6abd85346), and with./grep (which is grep after the attached patches are installed), I got timingslike the following:


    user  sys
    1.972 0.072 LC_ALL=en_US.utf8 pcregrep -u "z.*a" k
    0.234 0.076 LC_ALL=en_US.utf8 ./grep0 -P "z.*a" k
    1.280 0.064 LC_ALL=en_US.utf8 ./grep -P "z.*a" k
    1.487 0.077 LC_ALL=C pcregrep "z.*a" k
    0.193 0.067 LC_ALL=C ./grep0 -P "z.*a" k
    0.825 0.096 LC_ALL=C ./grep -P "z.*a" k

All times are CPU seconds. This is Fedora 24 x86-64, AMD Phenom II X4 910e. Asbefore, k was created by the shell command: yes 'abcdefg hijklmn opqrstu vwxyz'| head -n 10000000 >k

So, on this benchmark using PCRE_MULTILINE gave a speedup of a factor of ~4.3 ina multibyte locale, and a speedup of ~3.5 in a unibyte locale.

On the other hand if you change the pattern to "z[^+]*a",
pcregrep still takes about one second, but GNU grep a lot longer

Yes, that example makes GNU grep -P look really bad. So installed the 1stattached patch, which mostly just reverts the January multiline patch, i.e., itgoes back to the slower "./grep -P" lines measured above.

From 1b65e6bbbc75d62e766dc6293ce042cd40fb935d Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sat, 19 Nov 2016 22:48:37 -0800
Subject: [PATCH 1/2] grep: -P no longer uses PCRE_MULTILINE

This reverts commit f6603c4e1e04dbb87a7232c4b44acc6afdf65fef,
as the extra performance is not worth the trouble for PCRE users.
Problem reported by Stephane Chazelas in:
http://bugs.gnu.org/22655#103
* NEWS: Document this and the next patch.
* src/dfasearch.c (EGexecute):
* src/grep.c (execute_fp_t):
* src/kwsearch.c (Fexecute):
* src/pcresearch.c (Pexecute):
First arg is now a const pointer again.
* src/grep.c (buf_has_encoding_errors): Now static.
* src/grep.h (buf_has_encoding_errors): Remove decl.
* src/search.h: Adjust decls.
* src/pcresearch.c (reflags): Remove.  All uses removed.
(Pcompile, Pexecute): Do not use PCRE_MULTILINE.
---
 NEWS             |   8 +++--
 src/dfasearch.c  |   2 +-
 src/grep.c       |   4 +--
 src/grep.h       |   1 -
 src/kwsearch.c   |   2 +-
 src/pcresearch.c | 101 ++++++-------------------------------------------------
 src/search.h     |   6 ++--
 7 files changed, 22 insertions(+), 102 deletions(-)

diff --git a/NEWS b/NEWS
index 4972c01..6138b48 100644
--- a/NEWS
+++ b/NEWS
@@ -10,9 +10,11 @@ GNU grep NEWS                                    -*- outline -*-
   >/dev/null" where PROGRAM dies when writing into a broken pipe.
   [bug introduced in grep-2.26]
 
-  grep -Pz no longer rejects patterns containing ^ and $, is more
-  cautious about special patterns like (?-m) and (*FAIL), and works
-  when combined with -x.  [bug introduced in grep-2.23]
+  grep -P no longer attempts multiline matches.  This works more
+  intuitively with unusual patterns, and means that grep -Pz no longer
+  rejects patterns containing ^ and $ and works when combined with -x.
+  [bugs introduced in grep-2.23] A downside is that grep -P is now
+  significantly slower, albeit typically still faster than pcregrep.
 
   grep -m0 -L PAT FILE now outputs "FILE".  [bug introduced in grep-2.5]
 
diff --git a/src/dfasearch.c b/src/dfasearch.c
index d41b6fd..ded9917 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -216,7 +216,7 @@ GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
 }
 
 size_t
-EGexecute (char *buf, size_t size, size_t *match_size,
+EGexecute (char const *buf, size_t size, size_t *match_size,
            char const *start_ptr)
 {
   char const *buflim, *beg, *end, *ptr, *match, *best_match, *mb_start;
diff --git a/src/grep.c b/src/grep.c
index f120bcf..4538f22 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -589,7 +589,7 @@ static bool seek_data_failed;
 
 /* Functions we'll use to search. */
 typedef void (*compile_fp_t) (char const *, size_t);
-typedef size_t (*execute_fp_t) (char *, size_t, size_t *, char const *);
+typedef size_t (*execute_fp_t) (char const *, size_t, size_t *, char const *);
 static compile_fp_t compile;
 static execute_fp_t execute;
 
@@ -696,7 +696,7 @@ skip_easy_bytes (char const *buf)
 /* Return true if BUF, of size SIZE, has an encoding error.
    BUF must be followed by at least sizeof (uword) bytes,
    the first of which may be modified.  */
-bool
+static bool
 buf_has_encoding_errors (char *buf, size_t size)
 {
   if (! unibyte_mask)
diff --git a/src/grep.h b/src/grep.h
index b45992f..d10145e 100644
--- a/src/grep.h
+++ b/src/grep.h
@@ -29,7 +29,6 @@ extern bool match_words;	/* -w */
 extern bool match_lines;	/* -x */
 extern char eolbyte;		/* -z */
 
-extern bool buf_has_encoding_errors (char *, size_t);
 extern char const *pattern_file_name (size_t, size_t *);
 
 #endif
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 29d140c..c3e69b3 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -78,7 +78,7 @@ Fcompile (char const *pattern, size_t size)
 }
 
 size_t
-Fexecute (char *buf, size_t size, size_t *match_size,
+Fexecute (char const *buf, size_t size, size_t *match_size,
           char const *start_ptr)
 {
   char const *beg, *try, *end, *mb_start;
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 01616c2..1948acf 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -32,9 +32,6 @@ enum { NSUB = 300 };
 /* Compiled internal form of a Perl regular expression.  */
 static pcre *cre;
 
-/* PCRE options used to compile the pattern.  */
-static int reflags;
-
 /* Additional information about the pattern.  */
 static pcre_extra *extra;
 
@@ -107,15 +104,13 @@ Pcompile (char const *pattern, size_t size)
   int fix_len_max = MAX (sizeof wprefix - 1 + sizeof wsuffix - 1,
                          sizeof xprefix - 1 + sizeof xsuffix - 1);
   char *re = xnmalloc (4, size + (fix_len_max + 4 - 1) / 4);
-  int flags = (PCRE_MULTILINE
-               | (match_icase ? PCRE_CASELESS : 0));
+  int flags = PCRE_DOLLAR_ENDONLY | (match_icase ? PCRE_CASELESS : 0);
   char const *patlim = pattern + size;
   char *n = re;
   char const *p;
   char const *pnul;
-  bool multibyte_locale = 1 < MB_CUR_MAX;
 
-  if (multibyte_locale)
+  if (1 < MB_CUR_MAX)
     {
       if (! localeinfo.using_utf8)
         die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
@@ -126,32 +121,6 @@ Pcompile (char const *pattern, size_t size)
   if (memchr (pattern, '\n', size))
     die (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
 
-  if (! eolbyte)
-    {
-      bool line_at_a_time = match_lines;
-      if (! line_at_a_time)
-        {
-          bool escaped = false;
-          bool after_unescaped_left_bracket = false;
-          for (p = pattern; *p; p++)
-            if (escaped)
-              escaped = after_unescaped_left_bracket = false;
-            else
-              {
-                if (*p == '$' || (*p == '^' && !after_unescaped_left_bracket)
-                    || (*p == '(' && (p[1] == '?' || p[1] == '*')))
-                  {
-                    line_at_a_time = true;
-                    break;
-                  }
-                escaped = *p == '\\';
-                after_unescaped_left_bracket = *p == '[';
-              }
-        }
-      if (line_at_a_time)
-        flags = (flags & ~ PCRE_MULTILINE) | PCRE_DOLLAR_ENDONLY;
-    }
-
   *n = '\0';
   if (match_words)
     strcpy (n, wprefix);
@@ -182,7 +151,6 @@ Pcompile (char const *pattern, size_t size)
   if (match_lines)
     strcpy (n, xsuffix);
 
-  reflags = flags;
   cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
   if (!cre)
     die (EXIT_TROUBLE, 0, "%s", ep);
@@ -210,7 +178,7 @@ Pcompile (char const *pattern, size_t size)
 }
 
 size_t
-Pexecute (char *buf, size_t size, size_t *match_size,
+Pexecute (char const *buf, size_t size, size_t *match_size,
           char const *start_ptr)
 {
 #if !HAVE_LIBPCRE
@@ -229,38 +197,14 @@ Pexecute (char *buf, size_t size, size_t *match_size,
      error.  */
   char const *subject = buf;
 
-  /* If the pattern has no problematic operators and the input is
-     unibyte or is free of encoding errors, a multiline search is
-     typically more efficient.  Otherwise, a single-line search is
-     either less confusing because the problematic operators are
-     interpreted more naturally, or it is typically faster because
-     pcre_exec doesn't waste time validating the entire input
-     buffer.  */
-  bool multiline = (reflags & PCRE_MULTILINE) != 0;
-  if (multiline && (reflags & PCRE_UTF8) != 0)
-    {
-      multiline = ! buf_has_encoding_errors (buf, size - 1);
-      buf[size - 1] = eolbyte;
-    }
-
   for (; p < buf + size; p = line_start = line_end + 1)
     {
-      bool too_big;
-
-      if (multiline)
-        {
-          size_t pcre_size_max = MIN (INT_MAX, SIZE_MAX - 1);
-          size_t scan_size = MIN (pcre_size_max + 1, buf + size - p);
-          line_end = memrchr (p, eolbyte, scan_size);
-          too_big = ! line_end;
-        }
-      else
-        {
-          line_end = memchr (p, eolbyte, buf + size - p);
-          too_big = INT_MAX < line_end - p;
-        }
-
-      if (too_big)
+      /* Use a single_line search.  Although this code formerly used
+         PCRE_MULTILINE for performance, the performance wasn't always
+         better and the correctness issues were too puzzling.  See
+         Bug#22655.  */
+      line_end = memchr (p, eolbyte, buf + size - p);
+      if (INT_MAX < line_end - p)
         die (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
 
       for (;;)
@@ -289,27 +233,11 @@ Pexecute (char *buf, size_t size, size_t *match_size,
           int options = 0;
           if (!bol)
             options |= PCRE_NOTBOL;
-          if (multiline)
-            options |= PCRE_NO_UTF8_CHECK;
 
           e = jit_exec (subject, line_end - subject, search_offset,
                         options, sub);
           if (e != PCRE_ERROR_BADUTF8)
-            {
-              if (0 < e && multiline && sub[1] - sub[0] != 0)
-                {
-                  char const *nl = memchr (subject + sub[0], eolbyte,
-                                           sub[1] - sub[0]);
-                  if (nl)
-                    {
-                      /* This match crosses a line boundary; reject it.  */
-                      p = subject + sub[0];
-                      line_end = nl;
-                      continue;
-                    }
-                }
-              break;
-            }
+            break;
           int valid_bytes = sub[0];
 
           if (search_offset <= valid_bytes)
@@ -382,15 +310,6 @@ Pexecute (char *buf, size_t size, size_t *match_size,
           beg = matchbeg;
           end = matchend;
         }
-      else if (multiline)
-        {
-          char const *prev_nl = memrchr (line_start - 1, eolbyte,
-                                         matchbeg - (line_start - 1));
-          char const *next_nl = memchr (matchend, eolbyte,
-                                        line_end + 1 - matchend);
-          beg = prev_nl + 1;
-          end = next_nl + 1;
-        }
       else
         {
           beg = line_start;
diff --git a/src/search.h b/src/search.h
index b6c1945..4957a63 100644
--- a/src/search.h
+++ b/src/search.h
@@ -54,15 +54,15 @@ extern wint_t mb_next_wc (char const *, char const *);
 /* dfasearch.c */
 extern struct localeinfo localeinfo;
 extern void GEAcompile (char const *, size_t, reg_syntax_t);
-extern size_t EGexecute (char *, size_t, size_t *, char const *);
+extern size_t EGexecute (char const *, size_t, size_t *, char const *);
 
 /* kwsearch.c */
 extern void Fcompile (char const *, size_t);
-extern size_t Fexecute (char *, size_t, size_t *, char const *);
+extern size_t Fexecute (char const *, size_t, size_t *, char const *);
 
 /* pcresearch.c */
 extern void Pcompile (char const *, size_t);
-extern size_t Pexecute (char *, size_t, size_t *, char const *);
+extern size_t Pexecute (char const *, size_t, size_t *, char const *);
 
 /* Return the number of bytes in the character at the start of S, which
    is of size N.  N must be positive.  MBS is the conversion state.
-- 
2.7.4

From 2a8fd4a7db5b7ce3cca6bc644e0c3643cbc96009 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sat, 19 Nov 2016 22:48:37 -0800
Subject: [PATCH 2/2] grep: further -P performance fix

Problem reported by Stephane Chazelas in:
http://bugs.gnu.org/22655#103
* src/pcresearch.c (Pexecute): Set the subject to the start of
each line as it is found.
---
 src/pcresearch.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/pcresearch.c b/src/pcresearch.c
index 1948acf..108baff 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -194,12 +194,12 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
 
   /* The search address to pass to pcre_exec.  This is the start of
      the buffer, or just past the most-recently discovered encoding
-     error.  */
+     error or line end.  */
   char const *subject = buf;
 
-  for (; p < buf + size; p = line_start = line_end + 1)
+  do
     {
-      /* Use a single_line search.  Although this code formerly used
+      /* Search line by line.  Although this code formerly used
          PCRE_MULTILINE for performance, the performance wasn't always
          better and the correctness issues were too puzzling.  See
          Bug#22655.  */
@@ -269,7 +269,9 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
       if (e != PCRE_ERROR_NOMATCH)
         break;
       bol = true;
+      p = subject = line_start = line_end + 1;
     }
+  while (p < buf + size);
 
   if (e <= 0)
     {
-- 
2.7.4

bug#22655: grep -Pz '^' now fails!

Reply via email to