Hi,

Performace for as following case is fixed in bug#43040.

  $ yes 0 | head -100000 | sed '$s/././' >pat
  $ grep -vf pat /dev/null

However, still slow and a lot of memory wasted for the following cases.

  $ grep -vf /usr/share/dict/linux.words /usr/share/dict/linux.words

This bug is introduced in commit abb7f4f2325f26f930ff59b702fe42568a8e81e7.
Though it's an optimization for patterns with backreferences, it seems
to cause performance degradation in many cases due to regex
implementation issues.

grep needs regex engine when patterns is not supported by DFA engine,
and when either given only matching (-o) or color option (--color) is
given.

In other words, if none of them are met, grep only uses regex to check
the syntax.  grep avoids compilation of regex not to check syntax by this
patch.
From 71d54acb707b5c96080d2c18c034ed0c9305a35d Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Sun, 20 Sep 2020 15:39:13 +0900
Subject: [PATCH] grep: avoid unneeded compilation of regex

grep needs regex engine when patterns is not supported by DFA engine, and
when either given only matching (-o) or color option (--color) is given.

In other words, if none of them are met, grep only uses regex to check the
syntax.  grep avoids compilation of regex avoid not to check syntax.

* src/dfasearch.c (GEAcompile): Add new argument, and avoid unneeded
compilation of regex.
* src/grep.c (compile_fp_t): Update prototype.
(main): Update caller.
* src/kwsearch.c (Fcompile): Update caller and add new argument.
* src/pcresearch.c (Pcompile): Add new argument.
* src/search.h (GEAcompile, Fcompile, Pcompile): Update prototype.
---
 src/dfasearch.c  |   30 +++++++++++++++++-------------
 src/grep.c       |    7 ++++---
 src/kwsearch.c   |    6 +++---
 src/pcresearch.c |    2 +-
 src/search.h     |    6 +++---
 5 files changed, 28 insertions(+), 23 deletions(-)

diff --git a/src/dfasearch.c b/src/dfasearch.c
index 354878f..812a0dc 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -179,7 +179,8 @@ regex_compile (struct dfa_comp *dc, char const *p, 
ptrdiff_t len,
    Return a description of the compiled pattern.  */
 
 void *
-GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits)
+GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits,
+            bool exact)
 {
   char *motif;
   struct dfa_comp *dc = xcalloc (1, sizeof (*dc));
@@ -274,18 +275,6 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t 
syntax_bits)
         }
     }
 
-  if (buf != NULL)
-    {
-      dc->patterns--;
-      dc->pcount++;
-
-      if (!regex_compile (dc, buf, buflen, 0, -1, false))
-        abort ();
-
-      if (buf != pattern)
-        free (buf);
-    }
-
   /* In the match_words and match_lines cases, we use a different pattern
      for the DFA matcher that will quickly throw out cases that won't work.
      Then if DFA succeeds we do some hairy stuff using the regex matcher
@@ -321,6 +310,21 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t 
syntax_bits)
   kwsmusts (dc);
   dfacomp (NULL, 0, dc->dfa, 1);
 
+  if (buf != NULL)
+    {
+      if (exact || !dfasupported (dc->dfa))
+        {
+          dc->patterns--;
+          dc->pcount++;
+
+          if (!regex_compile (dc, buf, buflen, 0, -1, false))
+            abort ();
+        }
+
+      if (buf != pattern)
+        free (buf);
+    }
+
   free (motif);
 
   return dc;
diff --git a/src/grep.c b/src/grep.c
index 3b40bbb..1453b14 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -635,7 +635,7 @@ static bool seek_failed;
 static bool seek_data_failed;
 
 /* Functions we'll use to search. */
-typedef void *(*compile_fp_t) (char *, size_t, reg_syntax_t);
+typedef void *(*compile_fp_t) (char *, size_t, reg_syntax_t, bool);
 typedef size_t (*execute_fp_t) (void *, char const *, size_t, size_t *,
                                 char const *);
 static execute_fp_t execute;
@@ -2973,8 +2973,9 @@ main (int argc, char **argv)
     matcher = try_fgrep_pattern (matcher, keys, &keycc);
 
   execute = matchers[matcher].execute;
-  compiled_pattern = matchers[matcher].compile (keys, keycc,
-                                                matchers[matcher].syntax);
+  compiled_pattern =
+    matchers[matcher].compile (keys, keycc, matchers[matcher].syntax,
+                               only_matching | color_option);
   /* We need one byte prior and one after.  */
   char eolbytes[3] = { 0, eolbyte, 0 };
   size_t match_size;
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 7081060..1174dbc 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -47,7 +47,7 @@ struct kwsearch
    followed by '\n'.  Return a description of the compiled pattern.  */
 
 void *
-Fcompile (char *pattern, size_t size, reg_syntax_t ignored)
+Fcompile (char *pattern, size_t size, reg_syntax_t ignored, bool exact)
 {
   kwset_t kwset;
   char *buf = NULL;
@@ -178,7 +178,7 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t 
*match_size,
             {
               fgrep_to_grep_pattern (&kwsearch->pattern, &kwsearch->size);
               kwsearch->re = GEAcompile (kwsearch->pattern, kwsearch->size,
-                                         RE_SYNTAX_GREP);
+                                         RE_SYNTAX_GREP, !!start_ptr);
             }
           return EGexecute (kwsearch->re, buf, size, match_size, start_ptr);
         }
@@ -245,7 +245,7 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t 
*match_size,
                     fgrep_to_grep_pattern (&kwsearch->pattern, 
&kwsearch->size);
                     kwsearch->re = GEAcompile (kwsearch->pattern,
                                                kwsearch->size,
-                                               RE_SYNTAX_GREP);
+                                               RE_SYNTAX_GREP, !!start_ptr);
                   }
                 if (beg + len < buf + size)
                   {
diff --git a/src/pcresearch.c b/src/pcresearch.c
index a668c45..6798967 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -113,7 +113,7 @@ jit_exec (struct pcre_comp *pc, char const *subject, int 
search_bytes,
    followed by '\n'.  Return a description of the compiled pattern.  */
 
 void *
-Pcompile (char *pattern, size_t size, reg_syntax_t ignored)
+Pcompile (char *pattern, size_t size, reg_syntax_t ignored, bool exact)
 {
   int e;
   char const *ep;
diff --git a/src/search.h b/src/search.h
index 1d85a7e..af81dcb 100644
--- a/src/search.h
+++ b/src/search.h
@@ -56,15 +56,15 @@ extern ptrdiff_t mb_goback (char const **, size_t *, char 
const *,
                             char const *);
 
 /* dfasearch.c */
-extern void *GEAcompile (char *, size_t, reg_syntax_t);
+extern void *GEAcompile (char *, size_t, reg_syntax_t, bool);
 extern size_t EGexecute (void *, char const *, size_t, size_t *, char const *);
 
 /* kwsearch.c */
-extern void *Fcompile (char *, size_t, reg_syntax_t);
+extern void *Fcompile (char *, size_t, reg_syntax_t, bool);
 extern size_t Fexecute (void *, char const *, size_t, size_t *, char const *);
 
 /* pcresearch.c */
-extern void *Pcompile (char *, size_t, reg_syntax_t);
+extern void *Pcompile (char *, size_t, reg_syntax_t, bool);
 extern size_t Pexecute (void *, char const *, size_t, size_t *, char const *);
 
 /* grep.c */
-- 
1.7.1

From ca0d0c9e79478df4645c15a5a885955d1c6221c9 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Sun, 20 Sep 2020 16:00:04 +0900
Subject: [PATCH] dfa: change dfasupported() to global function

* lib/dfa.c (dfasupported): Rename, and change it to global function.
Update caller.
* lib/dfa.h (dfasupported): Add prototype.
---
 lib/dfa.c |    6 +++---
 lib/dfa.h |    3 +++
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/lib/dfa.c b/lib/dfa.c
index c25a391..4df5b00 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -3657,8 +3657,8 @@ free_mbdata (struct dfa *d)
 }
 
 /* Return true if every construct in D is supported by this DFA matcher.  */
-static bool _GL_ATTRIBUTE_PURE
-dfa_supported (struct dfa const *d)
+bool
+dfasupported (struct dfa const *d)
 {
   for (idx_t i = 0; i < d->tindex; i++)
     {
@@ -3814,7 +3814,7 @@ dfacomp (char const *s, idx_t len, struct dfa *d, bool 
searchflag)
 
   dfassbuild (d);
 
-  if (dfa_supported (d))
+  if (dfasupported (d))
     {
       maybe_disable_superset_dfa (d);
       dfaanalyze (d, searchflag);
diff --git a/lib/dfa.h b/lib/dfa.h
index c5bff89..2f77f26 100644
--- a/lib/dfa.h
+++ b/lib/dfa.h
@@ -113,6 +113,9 @@ extern struct dfa *dfasuperset (struct dfa const *d) 
_GL_ATTRIBUTE_PURE;
 /* The DFA is likely to be fast.  */
 extern bool dfaisfast (struct dfa const *) _GL_ATTRIBUTE_PURE;
 
+/* Return true if every construct in D is supported by this DFA matcher.  */
+extern bool dfasupported (struct dfa const *) _GL_ATTRIBUTE_PURE;
+
 /* Free the storage held by the components of a struct dfa. */
 extern void dfafree (struct dfa *);
 
-- 
1.7.1

Reply via email to