When regex is called only to check a pattern's syntax, we can set RE_NO_SUB.
This brings a performance gain for patterns that produce a very large number
of epsilon nodes.


$ printf '(%020000d)\n' | sed 's/0/|/g' >pat

(before)
$ time -p env LC_ALL=C src/grep -Ef pat /dev/null
real 6.15
user 4.62
sys 1.52

(after)
$ time -p env LC_ALL=C src/grep -Ef pat /dev/null
real 0.66
user 0.19
sys 0.46

From 0ef4329c9b4a5785c54dfa1d36aac2bb72893198 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Thu, 8 Oct 2020 18:20:13 +0900
Subject: [PATCH] grep: set RE_NO_SUB for calling regex only to check syntax

* src/dfasearch.c (regex_compile): New parameter. All callers changed.
(GEAcompile): Move setting syntax for regex into regex_compile() function.
---
 src/dfasearch.c |   16 ++++++++++++----
 1 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/dfasearch.c b/src/dfasearch.c
index 812a0dc..8ede0ec 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -145,7 +145,8 @@ possible_backrefs_in_pattern (char const *keys, ptrdiff_t len, bool bs_safe)
 
 static bool
 regex_compile (struct dfa_comp *dc, char const *p, ptrdiff_t len,
-               ptrdiff_t pcount, ptrdiff_t lineno, bool syntax_only)
+               ptrdiff_t pcount, ptrdiff_t lineno, reg_syntax_t syntax_bits,
+               bool syntax_only)
 {
   struct re_pattern_buffer pat0;
   struct re_pattern_buffer *pat = syntax_only ? &pat0 : &dc->patterns[pcount];
@@ -157,6 +158,11 @@ regex_compile (struct dfa_comp *dc, char const *p, ptrdiff_t len,
 
   pat->translate = NULL;
 
+  if (syntax_only)
+    re_set_syntax (syntax_bits | RE_NO_SUB);
+  else
+    re_set_syntax (syntax_bits);
+
   char const *err = re_compile_pattern (p, len, pat);
   if (!err)
     return true;
@@ -189,7 +195,6 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits,
 
   if (match_icase)
     syntax_bits |= RE_ICASE;
-  re_set_syntax (syntax_bits);
   int dfaopts = eolbyte ? 0 : DFA_EOL_NUL;
   dfasyntax (dc->dfa, &localeinfo, syntax_bits, dfaopts);
   bool bs_safe = !localeinfo.multibyte | localeinfo.using_utf8;
@@ -242,7 +247,10 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits,
           dc->patterns++;
         }
 
-      if (!regex_compile (dc, p, len, dc->pcount, lineno, !backref))
+      re_set_syntax (syntax_bits);
+
+      if (!regex_compile (dc, p, len, dc->pcount, lineno, syntax_bits,
+                          !backref))
         compilation_failed = true;
 
       p = sep + 1;
@@ -317,7 +325,7 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t syntax_bits,
           dc->patterns--;
           dc->pcount++;
 
-          if (!regex_compile (dc, buf, buflen, 0, -1, false))
+          if (!regex_compile (dc, buf, buflen, 0, -1, syntax_bits, false))
             abort ();
         }
 
-- 
1.7.1

Reply via email to