On 11/3/18 9:25 PM, Norihiro Tanaka wrote:

>   $ seq -f '%040g' 0 9999 | sed '1s/$/\\(0\\)\\1/' >pat

Thanks for the test case and sorry about the delay. And thanks for spotting the
speedup opportunity. I found a few problems with the proposed patch, though:

> +        if (keys[1] == '\\')
> +          keys++;

This mishandles the case where the input byte sequence contains '\', '\', '1'
where the first '\' is the last byte of a multibyte character. Such a byte
sequence can contain backreferences but the function will say it doesn't.

> +      if (backref && prev < p)
> +        {
> +          len = p - prev;
> +          buf = xrealloc (buf, (buflen + len) * sizeof *buf);
> +          memcpy (buf + buflen, p, len);
> +          buflen += len;
> +        }

This seems to have three problems. First, the memcpy copies from P, but it
should copy from PREV. Second, this code assigns to LEN, which breaks a later
use of LEN. Third, if there are many patterns with backreferences, repeated use
of realloc will have O(N**2) behavior.

> +      for (size_t i = 0; i < dc->pcount; i++)
> +        dc->patterns[i + 1] = dc->patterns[i];

This copies dc->patterns[0] to the later values in that array, when a memmove
was intended.

So, after installing the patch, I immediately installed the attached patch,
which should address the abovementioned issues.

Thanks again. You did the hard work - I merely proofread it.
>From c2ec762dbc132d3c4a727c8e2ecab2a7286729d6 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sun, 22 Dec 2019 16:39:09 -0800
Subject: [PATCH] grep: fix some bugs in pattern-grouping speedup

This fixes some bugs in the previous commit,
and should finish the fix for Bug#33249.
* NEWS: Mention fix for Bug#33249.
* src/dfasearch.c (possible_backrefs_in_pattern, regex_compile)
(GEAcompile): In new code, prefer ptrdiff_t to size_t when either
will do, since ptrdiff_t has better error checking.  At some point
we should adjust the old code too.
(possible_backrefs_in_pattern): Rename from
find_backref_in_pattern.  New arg BS_SAFE.  All uses changed.
Fix false negative if a multibyte character ends in a single
'\\' byte, followed by the two bytes '\\', '1'.
(regex_compile): Simplify.
(GEAcompile): Avoid quadratic behavior when reallocating growing
buffers.  Fix a couple of bugs in copying pattern data involving
backreferences.  Fix another bug in copying pattern metadata
involving backreferences, by removing the need to copy it.
---
 NEWS            |   4 ++
 src/dfasearch.c | 125 ++++++++++++++++++++++++++++++------------------
 2 files changed, 82 insertions(+), 47 deletions(-)

diff --git a/NEWS b/NEWS
index 8eda865..718659c 100644
--- a/NEWS
+++ b/NEWS
@@ -21,6 +21,10 @@ GNU grep NEWS                                    -*- outline -*-
   output is /dev/null.
   [Bug#37716 introduced in grep 3.2]
 
+  A performance bug has been fixed when grep is given many patterns
+  each of which lack backreferences.
+  [Bug#33249 introduced in grep 2.5]
+
   A performance bug has been fixed for patterns like '01.2' that
   cause grep to reorder tokens internally.
   [Bug#34951 introduced in grep 3.2]
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 42d16dc..eb7732b 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -35,7 +35,7 @@ struct dfa_comp
   struct dfa *dfa;
 
   /* Regex compiled regexps. */
-  struct re_pattern_buffer* patterns;
+  struct re_pattern_buffer *patterns;
   size_t pcount;
   struct re_registers regs;
 
@@ -103,25 +103,49 @@ kwsmusts (struct dfa_comp *dc)
   dfamustfree (dm);
 }
 
-static bool
-find_backref_in_pattern (const char *keys, size_t len)
+/* Return true if KEYS, of length LEN, might contain a backreference.
+   Return false if KEYS cannot contain a backreference.
+   BS_SAFE is true of encodings where a backslash cannot appear as the
+   last byte of a multibyte character.  */
+static bool _GL_ATTRIBUTE_PURE
+possible_backrefs_in_pattern (char const *keys, ptrdiff_t len, bool bs_safe)
 {
-  for (; len; keys++, len--)
-    if (keys[0] == '\\')
-      {
-        if (1 <= keys[1] && keys[1] <= '9')
-          return true;
-
-        if (keys[1] == '\\')
-          keys++;
-      }
-
+  /* Normally a backslash, but in an unsafe encoding this is a a
+     non-char value so that the comparison below always fails, because
+     if there are two adjacent '\' bytes the first might the last byte
+     of a multibyte character.  */
+  int second_backslash = bs_safe ? '\\' : CHAR_MAX + 1;
+
+  /* This code can return true even if KEYS lacks a backreference, for
+     patterns like [\2], or for encodings where '\' appears as
+     the last byte of a multibyte character.  However, false alarms
+     should be rare and do not affect correctness.  */
+
+  /* Do not look for a backslash in the pattern's last byte, since it
+     can't be part of a backreference and this streamlines the code.  */
+  len--;
+
+  if (0 <= len)
+    {
+      char const *lim = keys + len;
+      for (char const *p = keys; (p = memchr (p, '\\', lim - p)); p++)
+        {
+          if ('1' <= p[1] && p[1] <= '9')
+            return true;
+          if (p[1] == second_backslash)
+            {
+              p++;
+              if (p == lim)
+                break;
+            }
+        }
+    }
   return false;
 }
 
 static bool
-regex_compile (struct dfa_comp *dc, const char *p, size_t len, size_t pcount,
-               size_t lineno, bool syntax_only)
+regex_compile (struct dfa_comp *dc, char const *p, ptrdiff_t len,
+               ptrdiff_t pcount, ptrdiff_t lineno, bool syntax_only)
 {
   struct re_pattern_buffer pat0;
   struct re_pattern_buffer *pat = syntax_only ? &pat0 : &dc->patterns[pcount];
@@ -129,29 +153,23 @@ regex_compile (struct dfa_comp *dc, const char *p, size_t len, size_t pcount,
   pat->allocated = 0;
 
   /* Do not use a fastmap with -i, to work around glibc Bug#20381.  */
-  pat->fastmap = (syntax_only || match_icase) ? NULL : xmalloc (UCHAR_MAX + 1);
+  pat->fastmap = syntax_only | match_icase ? NULL : xmalloc (UCHAR_MAX + 1);
 
   pat->translate = NULL;
 
   char const *err = re_compile_pattern (p, len, pat);
-
   if (!err)
     return true;
 
-  /* With patterns specified only on the command line, emit the bare
-     diagnostic.  Otherwise, include a filename:lineno: prefix.  */
-  size_t new_lineno;
-  const char *pat_filename;
-
-  if (lineno != (size_t) -1)
-    pat_filename = pattern_file_name (lineno + 1, &new_lineno);
-  else
-    pat_filename = "\0";
+  /* Emit a filename:lineno: prefix for patterns taken from files.  */
+  size_t pat_lineno = lineno;
+  char const *pat_filename
+    = lineno < 0 ? "\0" : pattern_file_name (lineno + 1, &pat_lineno);
 
   if (*pat_filename == '\0')
     error (0, 0, "%s", err);
   else
-    error (0, 0, "%s:%zu: %s", pat_filename, new_lineno, err);
+    error (0, 0, "%s:%zu: %s", pat_filename, pat_lineno, err);
 
   return false;
 }
@@ -169,6 +187,7 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits)
   re_set_syntax (syntax_bits);
   int dfaopts = eolbyte ? 0 : DFA_EOL_NUL;
   dfasyntax (dc->dfa, &localeinfo, syntax_bits, dfaopts);
+  bool bs_safe = !localeinfo.multibyte | localeinfo.using_utf8;
 
   /* For GNU regex, pass the patterns separately to detect errors like
      "[\nallo\n]\n", where the patterns are "[", "allo" and "]", and
@@ -177,12 +196,20 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits)
   char const *p = pattern;
   char const *patlim = pattern + size;
   bool compilation_failed = false;
-  size_t palloc = 0;
+
+  dc->patterns = xmalloc (sizeof *dc->patterns);
+  dc->patterns++;
+  dc->pcount = 0;
+  size_t palloc = 1;
 
   char const *prev = pattern;
+
+  /* Buffer containing backreference-free patterns.  */
   char *buf = NULL;
-  size_t buflen = 0;
-  size_t lineno = 0;
+  ptrdiff_t buflen = 0;
+  size_t bufalloc = 0;
+
+  ptrdiff_t lineno = 0;
 
   do
     {
@@ -196,18 +223,26 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits)
       else
         len = patlim - p;
 
-      bool backref = find_backref_in_pattern (p, len);
+      bool backref = possible_backrefs_in_pattern (p, len, bs_safe);
 
       if (backref && prev < p)
         {
-          len = p - prev;
-          buf = xrealloc (buf, (buflen + len) * sizeof *buf);
-          memcpy (buf + buflen, p, len);
-          buflen += len;
+          ptrdiff_t prevlen = p - prev;
+          while (bufalloc < buflen + prevlen)
+            buf = x2realloc (buf, &bufalloc);
+          memcpy (buf + buflen, prev, prevlen);
+          buflen += prevlen;
         }
 
-      if (palloc <= dc->pcount)
-        dc->patterns = x2nrealloc (dc->patterns, &palloc, sizeof *dc->patterns);
+      /* Ensure room for at least two more patterns.  The extra one is
+         for the regex_compile that may be executed after this loop
+         exits, and its (unused) slot is patterns[-1] until then.  */
+      while (palloc <= dc->pcount + 1)
+        {
+          dc->patterns = x2nrealloc (dc->patterns - 1, &palloc,
+                                     sizeof *dc->patterns);
+          dc->patterns++;
+        }
 
       if (!regex_compile (dc, p, len, dc->pcount, lineno, !backref))
         compilation_failed = true;
@@ -230,10 +265,10 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits)
     {
       if (pattern < prev)
         {
-          size_t len = patlim - prev;
-          buf = xrealloc (buf, (buflen + len) * sizeof *buf);
-          memcpy (buf + buflen, prev, len);
-          buflen += len;
+          ptrdiff_t prevlen = patlim - prev;
+          buf = xrealloc (buf, buflen + prevlen);
+          memcpy (buf + buflen, prev, prevlen);
+          buflen += prevlen;
         }
       else
         {
@@ -244,16 +279,12 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits)
 
   if (buf != NULL)
     {
-      dc->patterns = x2nrealloc (dc->patterns, &palloc, sizeof *dc->patterns);
-
-      for (size_t i = 0; i < dc->pcount; i++)
-        dc->patterns[i + 1] = dc->patterns[i];
+      dc->patterns--;
+      dc->pcount++;
 
       if (!regex_compile (dc, buf, buflen, 0, -1, false))
         abort ();
 
-      dc->pcount++;
-
       if (buf != pattern)
         free (buf);
     }
-- 
2.17.1

Reply via email to