Hi,

grep uses KWset matcher for multiple word matching.  It is very slow when
most of the parts matched to a pattern are not words.  So, if a part firstly
matched to pattern is not a word, use the grep matcher to match for its line.

By the way, if START_PTR is set, grep matcher uses regex matcher which is
very slow to match words.  Therefore, we use grep matcher when only START_PTR
is not set.

Example, although it is a very extreme case...

$ cat >pat <<EOF
0
00 0
00 00 0
00 00 00 0
00 00 00 00 0
00 00 00 00 00 0
00 00 00 00 00 00 0
00 00 00 00 00 00 00 0
00 00 00 00 00 00 00 00 0
00 00 00 00 00 00 00 00 00 0
00 00 00 00 00 00 00 00 00 00 0
00 00 00 00 00 00 00 00 00 00 00 0
00 00 00 00 00 00 00 00 00 00 00 00 0
00 00 00 00 00 00 00 00 00 00 00 00 00 0
EOF
$ yes '00 00 00 00 00 00 00 00 00 00 00 00 00' | head -1000000 >inp

$ env LC_ALL=C time -p src/grep -wf pat inp
real 5.75
user 5.67
sys 0.02

Retry after applied the patch.

$ env LC_ALL=C time -p src/grep -wf pat inp
real 0.32
user 0.31
sys 0.00

Thanks,
Norihiro
From b4f07fa0288ad68932fc606ed760fd61db9df6d0 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Sun, 13 Jan 2019 07:53:32 +0900
Subject: [PATCH] grep: fix slow for multiple word matching

grep uses KWset matcher for multiple word matching.  It is very slow when
most of the parts matched to a pattern are not words.  So, if a part firstly
matched to pattern is not a word, use the grep matcher to match for its line.

By the way, if START_PTR is set, grep matcher uses regex matcher which is
very slow to match words.  Therefore, we use grep matcher when only START_PTR
is not set.

* src/kwsearch.c (Fexecute): If a part  matched firstly to pattern is not
word, we use the grep matcher to match for its line.
---
 src/kwsearch.c |   17 +++++++++++++++++
 1 files changed, 17 insertions(+), 0 deletions(-)

diff --git a/src/kwsearch.c b/src/kwsearch.c
index 42567e9..9240ec9 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -239,6 +239,22 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t 
*match_size,
                 else
                   goto success;
               }
+            if (!start_ptr)
+              {
+                if (!kwsearch->re)
+                  {
+                    fgrep_to_grep_pattern (&kwsearch->pattern, 
&kwsearch->size);
+                    kwsearch->re = GEAcompile (kwsearch->pattern, 
kwsearch->size,
+                                               RE_SYNTAX_GREP);
+                  }
+                end = memchr (beg + len, eol, (buf + size) - (beg + len));
+                end = end ? end + 1 : buf + size;
+                if (EGexecute (kwsearch->re, beg, end - beg, match_size, NULL)
+                    != (size_t) -1)
+                  goto success_match_words;
+                beg = end - 1;
+                break;
+              }
             if (!len)
               break;
             offset = kwsexec (kwset, beg, --len, &kwsmatch, true);
@@ -259,6 +275,7 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t 
*match_size,
  success:
   end = memchr (beg + len, eol, (buf + size) - (beg + len));
   end = end ? end + 1 : buf + size;
+ success_match_words:
   beg = memrchr (buf, eol, beg - buf);
   beg = beg ? beg + 1 : buf;
   len = end - beg;
-- 
1.7.1

Reply via email to