I installed the attached patches into grep master. These fix the performance regressions noted at the start of Bug#22357. I see that the related performance problems noted in Bug#21763 seem to be fixed too, I expect because of Norihiro Tanaka's recent changes, so I'll boldly close both bug reports.

To some extent the attached patches restore the old behavior for grep -F, when grep is given two or more patterns. The patch doesn't change the underlying algorithms; it merely uses a different heuristic to decide whether to use the -F matcher. Although I wouldn't be surprised if the attached patches hurt performance in some cases, I didn't uncover any such cases in my performance testing, which I admit mostly consisted of running the examples in the abovementioned bug reports.

I'll leave Bug#22239 open, as I get the following performance figures (user+system CPU time) for the Bug#22239 benchmark, where list.txt is created by "aspell dump master | head -n 100000 >list.txt", and the grep commands all use the operands "-F -f list.txt /etc/passwd" in the en_US.utf8 locale on Fedora 24 x86-64.

  no -i       -i       grep version
   0.25      0.33      2.16
   0.26     10.95      2.21
   0.11      2.90*     current master (including attached patches)

In the C locale, the current grep master is always significantly faster than grep 2.16 or 2.21 on the benchmark, so the only significant problem is the number marked "*". I ran the benchmarks on an AMD Phenom II X4 910e.
From 75bca3ca0a6ea87a66531ab7c0f8859e1d6c4300 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Tue, 20 Dec 2016 08:56:06 -0800
Subject: [PATCH 1/3] grep: simplify line counting in patterns

* src/grep.c (n_patterns): Rename from patfile_lineno,
as it is now origin-zero.  Now size_t, not uintmax_t.
(count_nl_bytes, fl_add): Simplify to just buffer and size.
All callers changed.
---
 src/grep.c | 49 ++++++++++++++++++++-----------------------------
 1 file changed, 20 insertions(+), 29 deletions(-)

diff --git a/src/grep.c b/src/grep.c
index 30e0f54..76f83d5 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -104,46 +104,35 @@ static size_t n_fl_pair_slots;
    and any command-line argument that serves as a regular expression.  */
 static size_t n_pattern_files;
 
-/* Given the concatenation of all patterns, one per line, be they
-   specified via -e, a lone command-line argument or -f, this is the
-   number of the first line of each entity, in that concatenation.
+/* The number of patterns seen so far.
    It is advanced by fl_add and, when needed, used in pattern_file_name
    to derive a file-relative line number.  */
-static uintmax_t patfile_lineno = 1;
+static size_t n_patterns;
 
-/* Return the number of newline bytes in BUF starting at offset BEG
-   and up to and not including offset END.  */
+/* Return the number of newline bytes in BUF with size SIZE.  */
 static size_t _GL_ATTRIBUTE_PURE
-count_nl_bytes (char const *buf, size_t beg, size_t end)
+count_nl_bytes (char const *buf, size_t size)
 {
-  char const *p = buf + beg;
-  char const *end_p = buf + end;
-  uintmax_t n = 0;
-  while (true)
-    {
-      p = memchr (p, '\n', end_p - p);
-      if (!p)
-        break;
-      p++;
-      n++;
-    }
+  char const *p = buf;
+  char const *end_p = buf + size;
+  size_t n = 0;
+  while ((p = memchr (p, '\n', end_p - p)))
+    p++, n++;
   return n;
 }
 
-/* Append a FILENAME,line-number pair to FL_PAIR.  The line number we save
-   with FILENAME is the initial value of the global PATFILE_LINENO.
-   PATFILE_LINENO is then incremented by the number of newlines in BUF
-   from offset BEG up to but not including offset END.  */
+/* Append a FILENAME,line-number pair to FL_PAIR, and update
+   pattern-related counts from the contents of BUF with SIZE bytes.  */
 static void
-fl_add (char const *buf, size_t beg, size_t end, char const *filename)
+fl_add (char const *buf, size_t size, char const *filename)
 {
   if (n_fl_pair_slots <= n_pattern_files)
     fl_pair = x2nrealloc (fl_pair, &n_fl_pair_slots, sizeof *fl_pair);
 
-  fl_pair[n_pattern_files].lineno = patfile_lineno;
+  fl_pair[n_pattern_files].lineno = n_patterns + 1;
   fl_pair[n_pattern_files].filename = filename;
   n_pattern_files++;
-  patfile_lineno += count_nl_bytes (buf, beg, end);
+  n_patterns += count_nl_bytes (buf, size);
 }
 
 /* Map the line number, LINENO, of one of the input patterns to the
@@ -2521,10 +2510,11 @@ main (int argc, char **argv)
             keyalloc = keycc + cc + 1;
             keys = x2realloc (keys, &keyalloc);
           }
-        memcpy (&keys[keycc], optarg, cc);
+        oldcc = keycc;
+        memcpy (keys + oldcc, optarg, cc);
         keycc += cc;
         keys[keycc++] = '\n';
-        fl_add (keys, keycc - cc - 1, keycc, "");
+        fl_add (keys + oldcc, cc + 1, "");
         break;
 
       case 'f':
@@ -2548,7 +2538,7 @@ main (int argc, char **argv)
         /* Append final newline if file ended in non-newline. */
         if (oldcc != keycc && keys[keycc - 1] != '\n')
           keys[keycc++] = '\n';
-        fl_add (keys, oldcc, keycc, xstrdup (optarg));
+        fl_add (keys + oldcc, keycc - oldcc, optarg);
         break;
 
       case 'h':
@@ -2741,7 +2731,8 @@ main (int argc, char **argv)
       /* Make a copy so that it can be reallocated or freed later.  */
       keycc = strlen (argv[optind]);
       keys = xmemdup (argv[optind++], keycc + 1);
-      fl_add (keys, 0, keycc, "");
+      fl_add (keys, keycc, "");
+      n_patterns++;
     }
   else
     usage (EXIT_TROUBLE);
-- 
2.7.4

From 214be13d33a26ab45231afe8e983d9a174aae282 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Tue, 20 Dec 2016 08:56:06 -0800
Subject: [PATCH 2/3] grep: simplify matcher configuration

* src/grep.c (matcher, compile): Remove static vars.
(compile_fp_t): Now takes a 3rd syntax argument.
(Gcomppile, Ecompile, Acompile, GAcompile, PAcompile): Remove.
(struct matcher): Now nameless, since it is used only once.
Make 'name' a bit shorter.  New member 'syntax'.
(matchers): Initialize it, and change removed functions to GEAcompile.
(F_MATCHER_INDEX, G_MATCHER_INDEX): New constants.
(setmatcher): New arg MATCHER, and return new matcher index.
Avoid unnecessary call to strcmp.
(main): Keep matcher as a local int, not a global pointer.
* src/kwsearch.c (Fcompile):
* src/pcresearch.c (Pcompile): Ignore the 3rd syntax argument.
---
 src/grep.c       | 111 +++++++++++++++++++------------------------------------
 src/kwsearch.c   |   2 +-
 src/pcresearch.c |   2 +-
 src/search.h     |   4 +-
 4 files changed, 41 insertions(+), 78 deletions(-)

diff --git a/src/grep.c b/src/grep.c
index 76f83d5..574a380 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -491,8 +491,6 @@ bool match_words;
 bool match_lines;
 char eolbyte;
 
-static char const *matcher;
-
 /* For error messages. */
 /* The input file name, or (if standard input) null or a --label argument.  */
 static char const *filename;
@@ -577,9 +575,8 @@ static bool seek_failed;
 static bool seek_data_failed;
 
 /* Functions we'll use to search. */
-typedef void (*compile_fp_t) (char const *, size_t);
+typedef void (*compile_fp_t) (char const *, size_t, reg_syntax_t);
 typedef size_t (*execute_fp_t) (char const *, size_t, size_t *, char const *);
-static compile_fp_t compile;
 static execute_fp_t execute;
 
 static char const *
@@ -2019,70 +2016,36 @@ if any error occurs and -q is not given, the exit status is 2.\n"));
 
 /* Pattern compilers and matchers.  */
 
-static void
-Gcompile (char const *pattern, size_t size)
-{
-  GEAcompile (pattern, size, RE_SYNTAX_GREP);
-}
-
-static void
-Ecompile (char const *pattern, size_t size)
-{
-  GEAcompile (pattern, size, RE_SYNTAX_EGREP);
-}
-
-static void
-Acompile (char const *pattern, size_t size)
-{
-  GEAcompile (pattern, size, RE_SYNTAX_AWK);
-}
-
-static void
-GAcompile (char const *pattern, size_t size)
-{
-  GEAcompile (pattern, size, RE_SYNTAX_GNU_AWK);
-}
-
-static void
-PAcompile (char const *pattern, size_t size)
-{
-  GEAcompile (pattern, size, RE_SYNTAX_POSIX_AWK);
-}
-
-struct matcher
+static struct
 {
-  char const name[16];
+  char const name[12];
+  int syntax; /* used if compile == GEAcompile */
   compile_fp_t compile;
   execute_fp_t execute;
+} const matchers[] = {
+  { "grep", RE_SYNTAX_GREP, GEAcompile, EGexecute },
+  { "egrep", RE_SYNTAX_EGREP, GEAcompile, EGexecute },
+  { "fgrep", 0, Fcompile, Fexecute, },
+  { "awk", RE_SYNTAX_AWK, GEAcompile, EGexecute },
+  { "gawk", RE_SYNTAX_GNU_AWK, GEAcompile, EGexecute },
+  { "posixawk", RE_SYNTAX_POSIX_AWK, GEAcompile, EGexecute },
+  { "perl", 0, Pcompile, Pexecute, },
 };
-static struct matcher const matchers[] = {
-  { "grep",      Gcompile, EGexecute },
-  { "egrep",     Ecompile, EGexecute },
-  { "fgrep",     Fcompile,  Fexecute },
-  { "awk",       Acompile, EGexecute },
-  { "gawk",     GAcompile, EGexecute },
-  { "posixawk", PAcompile, EGexecute },
-  { "perl",      Pcompile,  Pexecute },
-  { "", NULL, NULL },
-};
+/* Keep these in sync with the 'matchers' table.  */
+enum { F_MATCHER_INDEX = 2, G_MATCHER_INDEX = 0 };
 
-/* Set the matcher to M if available.  Exit in case of conflicts or if
-   M is not available.  */
-static void
-setmatcher (char const *m)
+/* Return the index of the matcher corresponding to M if available.
+   MATCHER is the index of the previous matcher, or -1 if none.
+   Exit in case of conflicts or if M is not available.  */
+static int
+setmatcher (char const *m, int matcher)
 {
-  struct matcher const *p;
-
-  if (matcher && !STREQ (matcher, m))
-    die (EXIT_TROUBLE, 0, _("conflicting matchers specified"));
-
-  for (p = matchers; p->compile; p++)
-    if (STREQ (m, p->name))
+  for (int i = 0; i < sizeof matchers / sizeof *matchers; i++)
+    if (STREQ (m, matchers[i].name))
       {
-        matcher = p->name;
-        compile = p->compile;
-        execute = p->execute;
-        return;
+        if (0 <= matcher && matcher != i)
+          die (EXIT_TROUBLE, 0, _("conflicting matchers specified"));
+        return i;
       }
 
   die (EXIT_TROUBLE, 0, _("invalid matcher %s"), m);
@@ -2367,6 +2330,7 @@ main (int argc, char **argv)
 {
   char *keys = NULL;
   size_t keycc = 0, oldcc, keyalloc = 0;
+  int matcher = -1;
   bool with_filenames = false;
   size_t cc;
   int opt, prepended;
@@ -2409,9 +2373,6 @@ main (int argc, char **argv)
     error (0, 0, _("warning: GREP_OPTIONS is deprecated;"
                    " please use an alias or script"));
 
-  compile = matchers[0].compile;
-  execute = matchers[0].execute;
-
   while (prev_optind = optind,
          (opt = get_nondigit_option (argc, argv, &default_context)) != -1)
     switch (opt)
@@ -2440,23 +2401,23 @@ main (int argc, char **argv)
         break;
 
       case 'E':
-        setmatcher ("egrep");
+        matcher = setmatcher ("egrep", matcher);
         break;
 
       case 'F':
-        setmatcher ("fgrep");
+        matcher = setmatcher ("fgrep", matcher);
         break;
 
       case 'P':
-        setmatcher ("perl");
+        matcher = setmatcher ("perl", matcher);
         break;
 
       case 'G':
-        setmatcher ("grep");
+        matcher = setmatcher ("grep", matcher);
         break;
 
       case 'X': /* undocumented on purpose */
-        setmatcher (optarg);
+        matcher = setmatcher (optarg, matcher);
         break;
 
       case 'H':
@@ -2794,24 +2755,26 @@ main (int argc, char **argv)
 
   initialize_unibyte_mask ();
 
+  if (matcher < 0)
+    matcher = G_MATCHER_INDEX;
+
   /* In a unibyte locale, switch from fgrep to grep if
      the pattern matches words (where grep is typically faster).
      In a multibyte locale, switch from fgrep to grep if either
      (1) the pattern has an encoding error (where fgrep might not work), or
      (2) case is ignored and a fast fgrep ignore-case is not available.  */
-  if (compile == Fcompile
+  if (matcher == F_MATCHER_INDEX
       && (MB_CUR_MAX <= 1
           ? match_words
           : (contains_encoding_error (keys, keycc)
              || (match_icase && !fgrep_icase_available (keys, keycc)))))
     {
       fgrep_to_grep_pattern (&keys, &keycc);
-      matcher = "grep";
-      compile = Gcompile;
-      execute = EGexecute;
+      matcher = G_MATCHER_INDEX;
     }
 
-  compile (keys, keycc);
+  execute = matchers[matcher].execute;
+  matchers[matcher].compile (keys, keycc, matchers[matcher].syntax);
   free (keys);
   /* We need one byte prior and one after.  */
   char eolbytes[3] = { 0, eolbyte, 0 };
diff --git a/src/kwsearch.c b/src/kwsearch.c
index c3e69b3..7275973 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -34,7 +34,7 @@ wordchar (wint_t wc)
 static kwset_t kwset;
 
 void
-Fcompile (char const *pattern, size_t size)
+Fcompile (char const *pattern, size_t size, reg_syntax_t ignored)
 {
   size_t total = size;
 
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 108baff..0e34861 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -88,7 +88,7 @@ static int empty_match[2];
 #endif
 
 void
-Pcompile (char const *pattern, size_t size)
+Pcompile (char const *pattern, size_t size, reg_syntax_t ignored)
 {
 #if !HAVE_LIBPCRE
   die (EXIT_TROUBLE, 0,
diff --git a/src/search.h b/src/search.h
index 4957a63..1ff5be2 100644
--- a/src/search.h
+++ b/src/search.h
@@ -57,11 +57,11 @@ extern void GEAcompile (char const *, size_t, reg_syntax_t);
 extern size_t EGexecute (char const *, size_t, size_t *, char const *);
 
 /* kwsearch.c */
-extern void Fcompile (char const *, size_t);
+extern void Fcompile (char const *, size_t, reg_syntax_t);
 extern size_t Fexecute (char const *, size_t, size_t *, char const *);
 
 /* pcresearch.c */
-extern void Pcompile (char const *, size_t);
+extern void Pcompile (char const *, size_t, reg_syntax_t);
 extern size_t Pexecute (char const *, size_t, size_t *, char const *);
 
 /* Return the number of bytes in the character at the start of S, which
-- 
2.7.4

From 290ca116c9172d97b2b026951fac722d3bd3ced9 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Tue, 20 Dec 2016 18:04:39 -0800
Subject: [PATCH 3/3] grep: fix performance with multiple patterns

Problem reported by Jaroslav Skarvada (Bug#22357).
* NEWS: Document this and other recent performance fixes.
* src/grep.c (E_MATCHER_INDEX): New constant.
(all_single_byte_after_folding):
New function, split out from fgrep_icase_available.
(fgrep_icase_available): Use it.
(try_fgrep_pattern): New function, which also uses it.
(main): With two or more patterns, use try_fgrep_pattern to fix
performance regression.  The number "two" here is just a heuristic.
---
 NEWS       |  11 ++++++
 src/grep.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 123 insertions(+), 20 deletions(-)

diff --git a/NEWS b/NEWS
index aa0483b..2e7d1ae 100644
--- a/NEWS
+++ b/NEWS
@@ -8,6 +8,17 @@ GNU grep NEWS                                    -*- outline -*-
   /proc file system and standard output is /dev/null.
   [bug introduced in grep-2.27]
 
+** Bug fixes
+
+  Fix performance regression with multiple patterns, e.g., for -Fi in
+  a multi-byte locale, or for -Fw in a single-byte locale.
+  [bugs introduced in grep-2.19, grep-2.22 and grep-2.26]
+
+** Improvements
+
+  Improve performance for -E or -G pattern lists that are easily
+  converted to -F format.
+
 
 * Noteworthy changes in release 2.27 (2016-12-06) [stable]
 
diff --git a/src/grep.c b/src/grep.c
index 574a380..f36654c 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -2032,7 +2032,7 @@ static struct
   { "perl", 0, Pcompile, Pexecute, },
 };
 /* Keep these in sync with the 'matchers' table.  */
-enum { F_MATCHER_INDEX = 2, G_MATCHER_INDEX = 0 };
+enum { E_MATCHER_INDEX = 1, F_MATCHER_INDEX = 2, G_MATCHER_INDEX = 0 };
 
 /* Return the index of the matcher corresponding to M if available.
    MATCHER is the index of the previous matcher, or -1 if none.
@@ -2245,23 +2245,12 @@ contains_encoding_error (char const *pat, size_t patlen)
   return false;
 }
 
-/* Return true if the fgrep pattern PAT, of size PATLEN, matches only
-   single-byte characters including case folding, and so is suitable
-   to be converted to a grep pattern.  */
+/* Return true if the set of single-byte characters USED contains only
+   characters whose case counterparts are also single-byte.  */
 
 static bool
-fgrep_icase_available (char const *pat, size_t patlen)
+all_single_byte_after_folding (bool used[UCHAR_MAX + 1])
 {
-  bool used[UCHAR_MAX + 1] = { 0, };
-
-  for (size_t i = 0; i < patlen; i++)
-    {
-      unsigned char c = pat[i];
-      if (localeinfo.sbctowc[c] == WEOF)
-        return false;
-      used[c] = true;
-    }
-
   for (int c = 0; c <= UCHAR_MAX; c++)
     if (used[c])
       {
@@ -2281,6 +2270,26 @@ fgrep_icase_available (char const *pat, size_t patlen)
   return true;
 }
 
+/* Return true if the -F patterns PAT, of size PATLEN, match only
+   single-byte characters including case folding, and so can be
+   processed by the -F matcher even if -i is given.  */
+
+static bool
+fgrep_icase_available (char const *pat, size_t patlen)
+{
+  bool used[UCHAR_MAX + 1] = { 0, };
+
+  for (size_t i = 0; i < patlen; i++)
+    {
+      unsigned char c = pat[i];
+      if (localeinfo.sbctowc[c] == WEOF)
+        return false;
+      used[c] = true;
+    }
+
+  return all_single_byte_after_folding (used);
+}
+
 /* Change the pattern *KEYS_P, of size *LEN_P, from fgrep to grep style.  */
 
 static void
@@ -2325,6 +2334,84 @@ fgrep_to_grep_pattern (char **keys_p, size_t *len_p)
   *len_p = p - new_keys;
 }
 
+/* If it is easy, convert the MATCHER-style patterns KEYS (of size
+   *LEN_P) to -F style, update *LEN_P to a possibly-smaller value, and
+   return F_MATCHER_INDEX.  If not, leave KEYS and *LEN_P alone and
+   return MATCHER.  This function is conservative and sometimes misses
+   conversions, e.g., it does not convert the -E pattern "(a|a|[aa])"
+   to the -F pattern "a".  */
+
+static int
+try_fgrep_pattern (int matcher, char *keys, size_t *len_p)
+{
+  int result = matcher;
+  size_t len = *len_p;
+  char *new_keys = xmalloc (len + 1);
+  char *p = new_keys;
+  mbstate_t mb_state = { 0 };
+  size_t mblen_lim = match_icase ? 1 : -3;
+  bool used[UCHAR_MAX + 1] = { 0, };
+
+  while (len != 0)
+    {
+      switch (*keys)
+        {
+        case '$': case '*': case '.': case '[': case '^':
+          goto fail;
+
+        case '(': case '+': case '?': case '{': case '|':
+          if (matcher != G_MATCHER_INDEX)
+            goto fail;
+          break;
+
+        case '\\':
+          if (1 < len)
+            switch (keys[1])
+              {
+              case '\n':
+              case 'B': case 'S': case 'W': case'\'': case '<':
+              case 'b': case 's': case 'w': case '`': case '>':
+              case '1': case '2': case '3': case '4':
+              case '5': case '6': case '7': case '8': case '9':
+                goto fail;
+
+              case '(': case '+': case '?': case '{': case '|':
+                if (matcher == G_MATCHER_INDEX)
+                  goto fail;
+                /* Fall through.  */
+              default:
+                keys++, len--;
+                break;
+              }
+          break;
+        }
+
+      {
+        size_t n = mb_clen (keys, len, &mb_state);
+        if (mblen_lim < n)
+          goto fail;
+        used[to_uchar (keys[0])] = true;
+        p = mempcpy (p, keys, n);
+        keys += n;
+        len -= n;
+      }
+    }
+
+  if (mblen_lim == 1 && !all_single_byte_after_folding (used))
+    goto fail;
+
+  if (*len_p != p - new_keys)
+    {
+      *len_p = p - new_keys;
+      memcpy (keys, new_keys, p - new_keys);
+    }
+  result = F_MATCHER_INDEX;
+
+ fail:
+  free (new_keys);
+  return result;
+}
+
 int
 main (int argc, char **argv)
 {
@@ -2758,11 +2845,11 @@ main (int argc, char **argv)
   if (matcher < 0)
     matcher = G_MATCHER_INDEX;
 
-  /* In a unibyte locale, switch from fgrep to grep if
-     the pattern matches words (where grep is typically faster).
-     In a multibyte locale, switch from fgrep to grep if either
-     (1) the pattern has an encoding error (where fgrep might not work), or
-     (2) case is ignored and a fast fgrep ignore-case is not available.  */
+  /* In a single-byte locale, switch from -F to -G if it is a single
+     pattern that matches words, where -G is typically faster.  In a
+     multi-byte locale, switch if the patterns have an encoding error
+     (where -F does not work) or if -i and the patterns will not work
+     for -iF.  */
   if (matcher == F_MATCHER_INDEX
       && (MB_CUR_MAX <= 1
           ? match_words
@@ -2772,6 +2859,11 @@ main (int argc, char **argv)
       fgrep_to_grep_pattern (&keys, &keycc);
       matcher = G_MATCHER_INDEX;
     }
+  /* With two or more patterns, if -F works then switch from either -E
+     or -G, as -F is probably faster then.  */
+  else if ((matcher == G_MATCHER_INDEX || matcher == E_MATCHER_INDEX)
+           && 1 < n_patterns)
+    matcher = try_fgrep_pattern (matcher, keys, &keycc);
 
   execute = matchers[matcher].execute;
   matchers[matcher].compile (keys, keycc, matchers[matcher].syntax);
-- 
2.7.4

Reply via email to