bug#21763: bug#22239: bug#22357: grep -f not only huge memory usage, but also huge time cost

Paul Eggert Fri, 23 Dec 2016 17:39:23 -0800

Norihiro Tanaka wrote:

are you aware of extreme slowdown in the following cases after third patch?


  yes $(printf %040d 0) | head -10000000 >inp
  printf '0\n1\n' >pat
  env LC_ALL=C src/grep -w -f pat inp

No. Thanks, I hadn't considered that possibility. I looked into the slowdown and installed the attached patches, which cause 'grep' to run about as fast on this test case as grep 2.25 (though not as fast as grep 2.26). The main fix is in patch 5. On my platform:


  -------grep version------
   v2.25  v2.26  v2.27 master     locale      command
    1.21   0.69  24.95   1.22     C           grep -w -f pat inp
  207.36 203.15 202.03   1.22     en_US.utf8  grep -w -f pat inp
    1.21   0.69  25.95   0.85     C           grep -w -f pat inp -F
   66.33  68.07  67.21   1.22     en_US.utf8  grep -w -f pat inp -F

All numbers are user+system CPU seconds on Fedora 24 x86-64 (AMD Phenom II X4 910e). "master" means after the attached patches are installed.

Perhaps we can fiddle with the heuristics a bit so that v2.26 is not significantly faster than the master in the C locale.

From 969447fdebabe2046ec4261837d5e8f12f75fba9 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Fri, 23 Dec 2016 08:04:13 -0800
Subject: [PATCH 1/8] maint: rewrite to avoid some macros

These days, the dangerous powers of C macros are not needed if
constants or functions will do just as well.
* src/grep.c (SEP_CHAR_SELECTED, SEP_CHAR_REJECTED, SEP_STR_GROUP)
(INITIAL_BUFSIZE):
* src/kwset.c (DEPTH_SIZE):
Now constants, not macros.
* src/kwset.c (link): Remove macro. Instead, rename local vars
from 'link' to 'cur'.
(malloc) [GREP]: Remove macro.  All uses of malloc changed to xmalloc.
Omit double-inclusion of xalloc.h.  Do not depend on 'GREP'.
(U): Now a function, not a macro.
* src/kwset.c, src/searchutils.c (NCHAR): Move this macro to ...
* src/system.h: ... here, and make it a constant.
---
 src/grep.c        |  8 ++---
 src/kwset.c       | 95 ++++++++++++++++++++++++++-----------------------------
 src/searchutils.c |  2 --
 src/system.h      |  1 +
 4 files changed, 50 insertions(+), 56 deletions(-)

diff --git a/src/grep.c b/src/grep.c
index f36654c..3729ae0 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -50,9 +50,9 @@
 #include "xalloc.h"
 #include "xstrtol.h"
 
-#define SEP_CHAR_SELECTED ':'
-#define SEP_CHAR_REJECTED '-'
-#define SEP_STR_GROUP    "--"
+enum { SEP_CHAR_SELECTED = ':' };
+enum { SEP_CHAR_REJECTED = '-' };
+char const SEP_STR_GROUP[] = "--";
 
 #define AUTHORS \
   proper_name ("Mike Haertel"), \
@@ -797,7 +797,7 @@ skipped_file (char const *name, bool command_line, bool is_dir)
 
 static char *buffer;		/* Base of buffer. */
 static size_t bufalloc;		/* Allocated buffer size, counting slop. */
-#define INITIAL_BUFSIZE 32768	/* Initial buffer size, not counting slop. */
+enum { INITIAL_BUFSIZE = 32768 }; /* Initial buffer size, not counting slop. */
 static int bufdesc;		/* File descriptor. */
 static char *bufbeg;		/* Beginning of user-visible stuff. */
 static char *buflim;		/* Limit of user-visible stuff. */
diff --git a/src/kwset.c b/src/kwset.c
index 264ef22..506c6cd 100644
--- a/src/kwset.c
+++ b/src/kwset.c
@@ -42,19 +42,14 @@
 #include "obstack.h"
 #include "xalloc.h"
 
-#define link kwset_link
-
-#ifdef GREP
-# include "xalloc.h"
-# undef malloc
-# define malloc xmalloc
-#endif
-
-#define NCHAR (UCHAR_MAX + 1)
-#define obstack_chunk_alloc malloc
+#define obstack_chunk_alloc xmalloc
 #define obstack_chunk_free free
 
-#define U(c) to_uchar (c)
+static unsigned char
+U (char ch)
+{
+  return to_uchar (ch);
+}
 
 /* Balanced tree of edges and labels leaving a given trie node. */
 struct tree
@@ -159,7 +154,7 @@ kwsalloc (char const *trans, bool reverse)
 
 /* This upper bound is valid for CHAR_BIT >= 4 and
    exact for CHAR_BIT in { 4..11, 13, 15, 17, 19 }. */
-#define DEPTH_SIZE (CHAR_BIT + CHAR_BIT/2)
+enum { DEPTH_SIZE = CHAR_BIT + CHAR_BIT / 2 };
 
 /* Add the given string to the contents of the keyword set.  */
 void
@@ -181,46 +176,46 @@ kwsincr (kwset_t kwset, char const *text, size_t len)
       /* Descend the tree of outgoing links for this trie node,
          looking for the current character and keeping track
          of the path followed. */
-      struct tree *link = trie->links;
+      struct tree *cur = trie->links;
       struct tree *links[DEPTH_SIZE];
       enum { L, R } dirs[DEPTH_SIZE];
       links[0] = (struct tree *) &trie->links;
       dirs[0] = L;
       int depth = 1;
 
-      while (link && label != link->label)
+      while (cur && label != cur->label)
         {
-          links[depth] = link;
-          if (label < link->label)
-            dirs[depth++] = L, link = link->llink;
+          links[depth] = cur;
+          if (label < cur->label)
+            dirs[depth++] = L, cur = cur->llink;
           else
-            dirs[depth++] = R, link = link->rlink;
+            dirs[depth++] = R, cur = cur->rlink;
         }
 
       /* The current character doesn't have an outgoing link at
          this trie node, so build a new trie node and install
          a link in the current trie node's tree. */
-      if (!link)
+      if (!cur)
         {
-          link = obstack_alloc (&kwset->obstack, sizeof *link);
-          link->llink = NULL;
-          link->rlink = NULL;
-          link->trie = obstack_alloc (&kwset->obstack, sizeof *link->trie);
-          link->trie->accepting = 0;
-          link->trie->links = NULL;
-          link->trie->parent = trie;
-          link->trie->next = NULL;
-          link->trie->fail = NULL;
-          link->trie->depth = trie->depth + 1;
-          link->trie->shift = 0;
-          link->label = label;
-          link->balance = 0;
+          cur = obstack_alloc (&kwset->obstack, sizeof *cur);
+          cur->llink = NULL;
+          cur->rlink = NULL;
+          cur->trie = obstack_alloc (&kwset->obstack, sizeof *cur->trie);
+          cur->trie->accepting = 0;
+          cur->trie->links = NULL;
+          cur->trie->parent = trie;
+          cur->trie->next = NULL;
+          cur->trie->fail = NULL;
+          cur->trie->depth = trie->depth + 1;
+          cur->trie->shift = 0;
+          cur->label = label;
+          cur->balance = 0;
 
           /* Install the new tree node in its parent. */
           if (dirs[--depth] == L)
-            links[depth]->llink = link;
+            links[depth]->llink = cur;
           else
-            links[depth]->rlink = link;
+            links[depth]->rlink = cur;
 
           /* Back up the tree fixing the balance flags. */
           while (depth && !links[depth]->balance)
@@ -291,7 +286,7 @@ kwsincr (kwset_t kwset, char const *text, size_t len)
             }
         }
 
-      trie = link->trie;
+      trie = cur->trie;
     }
 
   /* Mark the node we finally reached as accepting, encoding the
@@ -326,7 +321,7 @@ static void
 treefails (struct tree const *tree, struct trie const *fail,
            struct trie *recourse, bool reverse)
 {
-  struct tree *link;
+  struct tree *cur;
 
   if (!tree)
     return;
@@ -338,16 +333,16 @@ treefails (struct tree const *tree, struct trie const *fail,
      node that has a descendant on the current label. */
   while (fail)
     {
-      link = fail->links;
-      while (link && tree->label != link->label)
-        if (tree->label < link->label)
-          link = link->llink;
+      cur = fail->links;
+      while (cur && tree->label != cur->label)
+        if (tree->label < cur->label)
+          cur = cur->llink;
         else
-          link = link->rlink;
-      if (link)
+          cur = cur->rlink;
+      if (cur)
         {
-          tree->trie->fail = link->trie;
-          if (!reverse && link->trie->accepting && !tree->trie->accepting)
+          tree->trie->fail = cur->trie;
+          if (!reverse && cur->trie->accepting && !tree->trie->accepting)
             tree->trie->accepting = SIZE_MAX;
           return;
         }
@@ -641,18 +636,18 @@ static size_t
 memoff2_kwset (char const *s, size_t n, kwset_t kwset,
                struct kwsmatch *kwsmatch)
 {
-  struct tree const *link = kwset->trie->links;
-  struct tree const *clink = link->llink ? link->llink : link->rlink;
+  struct tree const *cur = kwset->trie->links;
+  struct tree const *clink = cur->llink ? cur->llink : cur->rlink;
   char const *mch = (clink
-                     ? memchr2 (s, link->label, clink->label, n)
-                     : memchr (s, link->label, n));
+                     ? memchr2 (s, cur->label, clink->label, n)
+                     : memchr (s, cur->label, n));
   if (! mch)
     return SIZE_MAX;
   else
     {
       size_t off = mch - s;
-      if (*mch == link->label)
-        kwsmatch->index = link->trie->accepting / 2;
+      if (*mch == cur->label)
+        kwsmatch->index = cur->trie->accepting / 2;
       else
         kwsmatch->index = clink->trie->accepting / 2;
       kwsmatch->offset[0] = off;
diff --git a/src/searchutils.c b/src/searchutils.c
index 73d6c1c..deaab60 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -22,8 +22,6 @@
 #define SYSTEM_INLINE _GL_EXTERN_INLINE
 #include "search.h"
 
-#define NCHAR (UCHAR_MAX + 1)
-
 kwset_t
 kwsinit (bool mb_trans)
 {
diff --git a/src/system.h b/src/system.h
index 6f4918d..c875275 100644
--- a/src/system.h
+++ b/src/system.h
@@ -37,6 +37,7 @@
 #include <ctype.h>
 
 enum { EXIT_TROUBLE = 2 };
+enum { NCHAR = UCHAR_MAX + 1 };
 
 #include <gettext.h>
 #define N_(String) gettext_noop(String)
-- 
2.7.4

From 19227eb98bc586a5ef3c2fb993a23c1182a4dab6 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Fri, 23 Dec 2016 10:54:54 -0800
Subject: [PATCH 2/8] grep: remove C label

* src/kwsearch.c (Fexecute): Remove label.
---
 src/kwsearch.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/kwsearch.c b/src/kwsearch.c
index 7275973..d04d34c 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -103,7 +103,7 @@ Fexecute (char const *buf, size_t size, size_t *match_size,
                                buf + size - beg + match_lines, &kwsmatch,
                                longest);
       if (offset == (size_t) -1)
-        goto failure;
+        break;
       len = kwsmatch.size[0] - 2 * match_lines;
       if (mb_check && mb_goback (&mb_start, beg + offset, buf + size) != 0)
         {
@@ -157,7 +157,6 @@ Fexecute (char const *buf, size_t size, size_t *match_size,
         goto success;
     } /* for (beg in buf) */
 
- failure:
   return -1;
 
  success:
-- 
2.7.4

From f139976800435db91032f3b1d9435a166890be38 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Fri, 23 Dec 2016 11:10:23 -0800
Subject: [PATCH 3/8] grep: simplify Fexecute

* src/kwsearch.c (Fexecute): Avoid the need for a 'try' local or
for a 'goto success'.  Update mb_start to reflect newline found.
---
 src/kwsearch.c | 46 ++++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/src/kwsearch.c b/src/kwsearch.c
index d04d34c..5596ebd 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -81,7 +81,7 @@ size_t
 Fexecute (char const *buf, size_t size, size_t *match_size,
           char const *start_ptr)
 {
-  char const *beg, *try, *end, *mb_start;
+  char const *beg, *end, *mb_start;
   size_t len;
   char eol = eolbyte;
   struct kwsmatch kwsmatch;
@@ -131,30 +131,32 @@ Fexecute (char const *buf, size_t size, size_t *match_size,
           len += start_ptr == NULL;
           goto success_in_beg_and_len;
         }
-      if (match_words)
-        for (try = beg; ; )
+      if (! match_words)
+        goto success;
+
+      /* Succeed if the preceding and following characters are word
+         constituents.  If the following character is not a word
+         constituent, keep trying with shorter matches.  */
+      char const *bol = memrchr (mb_start, eol, beg - mb_start);
+      if (bol)
+        mb_start = bol + 1;
+      if (! wordchar (mb_prev_wc (mb_start, beg, buf + size)))
+        for (;;)
           {
-            char const *bol = memrchr (buf, eol, beg - buf);
-            bol = bol ? bol + 1 : buf;
-            if (wordchar (mb_prev_wc (bol, try, buf + size)))
-              break;
-            if (wordchar (mb_next_wc (try + len, buf + size)))
+            if (! wordchar (mb_next_wc (beg + len, buf + size)))
               {
-                if (!len)
-                  break;
-                offset = kwsexec (kwset, beg, --len, &kwsmatch, true);
-                if (offset == (size_t) -1)
-                  break;
-                try = beg + offset;
-                len = kwsmatch.size[0];
+                if (start_ptr)
+                  goto success_in_beg_and_len;
+                else
+                  goto success;
               }
-            else if (!start_ptr)
-              goto success;
-            else
-              goto success_in_beg_and_len;
-          } /* for (try) */
-      else
-        goto success;
+            if (!len)
+              break;
+            offset = kwsexec (kwset, beg, --len, &kwsmatch, true);
+            if (offset != 0)
+              break;
+            len = kwsmatch.size[0];
+          }
     } /* for (beg in buf) */
 
   return -1;
-- 
2.7.4

From 740048e66e7c55a8e42f4f7e4c24256a61506f70 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Fri, 23 Dec 2016 12:25:24 -0800
Subject: [PATCH 4/8] grep: specialize word-finding functions

This improves performance a bit.
* src/dfasearch.c, src/kwsearch.c (wordchar):
Remove; now in searchutils.c.
* src/grep.c (main): Call wordinit if -w.
* src/search.h: Adjust.
* src/searchutils.c: Include verify.h.
(word_start): New static var.
(wordchar): Move here from dfasearch.c and kwsearch.c.
(wordinit, wordchars_count, wordchar_next, wordchar_prev):
New functions.
(mb_prev_wc, mb_next_wc): Remove.
All callers changed to use the new functions instead.
---
 src/dfasearch.c   | 11 ++-----
 src/grep.c        |  1 +
 src/kwsearch.c    | 11 ++-----
 src/search.h      |  5 +--
 src/searchutils.c | 91 +++++++++++++++++++++++++++++++++++++++++++------------
 5 files changed, 80 insertions(+), 39 deletions(-)

diff --git a/src/dfasearch.c b/src/dfasearch.c
index 24a36cd..87e1f7e 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -26,13 +26,6 @@
 
 struct localeinfo localeinfo;
 
-/* Whether -w considers WC to be a word constituent.  */
-static bool
-wordchar (wint_t wc)
-{
-  return wc == L'_' || iswalnum (wc);
-}
-
 /* KWset compiled pattern.  For Ecompile and Gcompile, we compile
    a list of strings, at least one of which is known to occur in
    any string matching the regexp. */
@@ -394,8 +387,8 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
                 while (match <= best_match)
                   {
                     regoff_t shorter_len = 0;
-                    if (!wordchar (mb_prev_wc (beg, match, end - 1))
-                        && !wordchar (mb_next_wc (match + len, end - 1)))
+                    if (! wordchar_next (match + len, end - 1)
+                        && ! wordchar_prev (beg, match, end - 1))
                       goto assess_pattern_match;
                     if (len > 0)
                       {
diff --git a/src/grep.c b/src/grep.c
index 3729ae0..f9d1d86 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -2651,6 +2651,7 @@ main (int argc, char **argv)
         break;
 
       case 'w':
+        wordinit ();
         match_words = true;
         break;
 
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 5596ebd..b30dfd0 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -21,13 +21,6 @@
 #include <config.h>
 #include "search.h"
 
-/* Whether -w considers WC to be a word constituent.  */
-static bool
-wordchar (wint_t wc)
-{
-  return wc == L'_' || iswalnum (wc);
-}
-
 /* KWset compiled pattern.  For Ecompile and Gcompile, we compile
    a list of strings, at least one of which is known to occur in
    any string matching the regexp. */
@@ -140,10 +133,10 @@ Fexecute (char const *buf, size_t size, size_t *match_size,
       char const *bol = memrchr (mb_start, eol, beg - mb_start);
       if (bol)
         mb_start = bol + 1;
-      if (! wordchar (mb_prev_wc (mb_start, beg, buf + size)))
+      if (! wordchar_prev (mb_start, beg, buf + size))
         for (;;)
           {
-            if (! wordchar (mb_next_wc (beg + len, buf + size)))
+            if (! wordchar_next (beg + len, buf + size))
               {
                 if (start_ptr)
                   goto success_in_beg_and_len;
diff --git a/src/search.h b/src/search.h
index 1ff5be2..6fe1797 100644
--- a/src/search.h
+++ b/src/search.h
@@ -46,10 +46,11 @@ _GL_INLINE_HEADER_BEGIN
 typedef signed char mb_len_map_t;
 
 /* searchutils.c */
+extern void wordinit (void);
 extern kwset_t kwsinit (bool);
+extern size_t wordchar_next (char const *, char const *);
+extern bool wordchar_prev (char const *, char const *, char const *);
 extern ptrdiff_t mb_goback (char const **, char const *, char const *);
-extern wint_t mb_prev_wc (char const *, char const *, char const *);
-extern wint_t mb_next_wc (char const *, char const *);
 
 /* dfasearch.c */
 extern struct localeinfo localeinfo;
diff --git a/src/searchutils.c b/src/searchutils.c
index deaab60..e0a1db3 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -22,6 +22,30 @@
 #define SYSTEM_INLINE _GL_EXTERN_INLINE
 #include "search.h"
 
+#include <verify.h>
+
+/* For each byte B, word_start[B] is 1 if B is a single-byte character
+   that is a word constituent, 0 if B cannot start a word constituent,
+   and -1 if B might be or might not be the start of a word
+   constituent.  */
+static wint_t word_start[NCHAR];
+verify (WEOF != 0 && WEOF != 1);
+
+/* Whether -w considers WC to be a word constituent.  */
+static bool
+wordchar (wint_t wc)
+{
+  return wc == L'_' || iswalnum (wc);
+}
+
+void
+wordinit (void)
+{
+  for (int i = 0; i < NCHAR; i++)
+    word_start[i] = (localeinfo.sbclen[i] == -2 ? WEOF
+                     : wordchar (localeinfo.sbctowc[i]));
+}
+
 kwset_t
 kwsinit (bool mb_trans)
 {
@@ -93,27 +117,56 @@ mb_goback (char const **mb_start, char const *cur, char const *end)
   return p == cur ? 0 : cur - p0;
 }
 
-/* In the buffer BUF, return the wide character that is encoded just
-   before CUR.  The buffer ends at END.  Return WEOF if there is no
-   wide character just before CUR.  */
-wint_t
-mb_prev_wc (char const *buf, char const *cur, char const *end)
+/* Examine the start of BUF (of size SIZE) for word constituents.
+   If COUNTALL, examine as many as possible; otherwise, examine at most one.
+   Return the total number of bytes in the examined characters.  */
+static size_t
+wordchars_count (char const *buf, char const *end, bool countall)
 {
-  if (cur == buf)
-    return WEOF;
-  char const *p = buf;
-  cur--;
-  cur -= mb_goback (&p, cur, end);
-  return mb_next_wc (cur, end);
+  size_t n = 0;
+  mbstate_t mbs = { 0 };
+  while (n < end - buf)
+    {
+      wint_t ws = word_start[to_uchar (buf[n])];
+      if (ws == 0)
+        break;
+      else if (ws == 1)
+        n++;
+      else
+        {
+          wchar_t wc = 0;
+          size_t wcbytes = mbrtowc (&wc, buf + n, end - buf - n, &mbs);
+          if (!wordchar (wc))
+            break;
+          n += wcbytes + !wcbytes;
+        }
+      if (!countall)
+        break;
+    }
+  return n;
 }
 
-/* Return the wide character that is encoded at CUR.  The buffer ends
-   at END.  Return WEOF if there is no wide character encoded at CUR.  */
-wint_t
-mb_next_wc (char const *cur, char const *end)
+/* If BUF starts with a word constituent, return the number of bytes
+   used to represent it; otherwise, return zero.  The buffer ends at END.  */
+size_t
+wordchar_next (char const *buf, char const *end)
 {
-  wchar_t wc;
-  mbstate_t mbs = { 0 };
-  return (end - cur != 0 && mbrtowc (&wc, cur, end - cur, &mbs) < (size_t) -2
-          ? wc : WEOF);
+  return wordchars_count (buf, end, false);
+}
+
+/* In the buffer BUF, return true if the character whose encoding
+   contains the byte before CUR is a word constituent.  The buffer
+   ends at END.  */
+bool
+wordchar_prev (char const *buf, char const *cur, char const *end)
+{
+  if (buf == cur)
+    return false;
+  cur--;
+  wint_t ws = word_start[to_uchar (*cur)];
+  if (! localeinfo.multibyte)
+    return ws == 1;
+  char const *p = buf;
+  cur -= mb_goback (&p, cur, end);
+  return wordchar_next (cur, end) != 0;
 }
-- 
2.7.4

From e89a7e6d4be4669a8a73650c28bb1eb69399d703 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Fri, 23 Dec 2016 12:43:46 -0800
Subject: [PATCH 5/8] grep: speed up -wf in C locale

Problem reported by Norihiro Tanaka (Bug#22357#100).
This patch improves the performance on that benchmark on my
platform so that grep is now only about 2x slower than grep 2.26,
which means it is considerably faster than grep 2.25 and earlier.
* src/kwsearch.c (Fexecute):
Use wordchars_size to boost performance for this case.
* src/search.h, src/searchutils.c (wordchars_size): New function.
---
 src/kwsearch.c    | 6 ++++++
 src/search.h      | 1 +
 src/searchutils.c | 9 +++++++++
 3 files changed, 16 insertions(+)

diff --git a/src/kwsearch.c b/src/kwsearch.c
index b30dfd0..6005b60 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -150,6 +150,12 @@ Fexecute (char const *buf, size_t size, size_t *match_size,
               break;
             len = kwsmatch.size[0];
           }
+
+      /* No word match was found at BEG.  Skip past word constituents,
+         since they cannot precede the next match and not skipping
+         them could make things much slower.  */
+      beg += wordchars_size (beg, buf + size);
+      mb_start = beg;
     } /* for (beg in buf) */
 
   return -1;
diff --git a/src/search.h b/src/search.h
index 6fe1797..1def4d6 100644
--- a/src/search.h
+++ b/src/search.h
@@ -48,6 +48,7 @@ typedef signed char mb_len_map_t;
 /* searchutils.c */
 extern void wordinit (void);
 extern kwset_t kwsinit (bool);
+extern size_t wordchars_size (char const *, char const *);
 extern size_t wordchar_next (char const *, char const *);
 extern bool wordchar_prev (char const *, char const *, char const *);
 extern ptrdiff_t mb_goback (char const **, char const *, char const *);
diff --git a/src/searchutils.c b/src/searchutils.c
index e0a1db3..6f6ae0b 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -146,6 +146,15 @@ wordchars_count (char const *buf, char const *end, bool countall)
   return n;
 }
 
+/* Examine the start of BUF for the longest prefix containing just
+   word constituents.  Return the total number of bytes in the prefix.
+   The buffer ends at END.  */
+size_t
+wordchars_size (char const *buf, char const *end)
+{
+  return wordchars_count (buf, end, true);
+}
+
 /* If BUF starts with a word constituent, return the number of bytes
    used to represent it; otherwise, return zero.  The buffer ends at END.  */
 size_t
-- 
2.7.4

From 4a9bbf519f3761a70c147597bcf967a27951b376 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Fri, 23 Dec 2016 12:57:10 -0800
Subject: [PATCH 6/8] grep: standardize on localeinfo.multibyte

* src/dfasearch.c (EGexecute):
* src/grep.c (main):
* src/kwsearch.c (Fexecute):
* src/pcresearch.c (Pcompile):
Prefer localeinfo.multibyte to (MB_CUR_MAX > 1).
---
 src/dfasearch.c  | 2 +-
 src/grep.c       | 2 +-
 src/kwsearch.c   | 2 +-
 src/pcresearch.c | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/dfasearch.c b/src/dfasearch.c
index 87e1f7e..7f68907 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -270,7 +270,7 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
 
               if (exact_kwset_match)
                 {
-                  if (MB_CUR_MAX == 1 || localeinfo.using_utf8)
+                  if (!localeinfo.multibyte | localeinfo.using_utf8)
                     goto success;
                   if (mb_start < beg)
                     mb_start = beg;
diff --git a/src/grep.c b/src/grep.c
index f9d1d86..1c45286 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -2852,7 +2852,7 @@ main (int argc, char **argv)
      (where -F does not work) or if -i and the patterns will not work
      for -iF.  */
   if (matcher == F_MATCHER_INDEX
-      && (MB_CUR_MAX <= 1
+      && (! localeinfo.multibyte
           ? match_words
           : (contains_encoding_error (keys, keycc)
              || (match_icase && !fgrep_icase_available (keys, keycc)))))
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 6005b60..7d11230 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -86,7 +86,7 @@ Fexecute (char const *buf, size_t size, size_t *match_size,
     mb_check = longest = false;
   else
     {
-      mb_check = MB_CUR_MAX > 1 && !localeinfo.using_utf8;
+      mb_check = localeinfo.multibyte & !localeinfo.using_utf8;
       longest = mb_check | !!start_ptr | match_words;
     }
 
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 0e34861..245469c 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -110,7 +110,7 @@ Pcompile (char const *pattern, size_t size, reg_syntax_t ignored)
   char const *p;
   char const *pnul;
 
-  if (1 < MB_CUR_MAX)
+  if (localeinfo.multibyte)
     {
       if (! localeinfo.using_utf8)
         die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
-- 
2.7.4

From 2c84095a777c2a20e99e92f406398501242ac131 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Fri, 23 Dec 2016 16:16:01 -0800
Subject: [PATCH 7/8] grep: improve word checking with UTF-8

* src/searchutils.c: Do not include <verify.h>.
(word_start): Remove, replacing with ...
(sbwordchar): New static var.  All uses changed.
(wordchar_prev): Return size_t, not bool, as this generates
slightly better code.  Go back faster if UTF-8.
---
 src/search.h      |  2 +-
 src/searchutils.c | 85 +++++++++++++++++++++++++++++++++----------------------
 2 files changed, 52 insertions(+), 35 deletions(-)

diff --git a/src/search.h b/src/search.h
index 1def4d6..b700ed5 100644
--- a/src/search.h
+++ b/src/search.h
@@ -50,7 +50,7 @@ extern void wordinit (void);
 extern kwset_t kwsinit (bool);
 extern size_t wordchars_size (char const *, char const *);
 extern size_t wordchar_next (char const *, char const *);
-extern bool wordchar_prev (char const *, char const *, char const *);
+extern size_t wordchar_prev (char const *, char const *, char const *);
 extern ptrdiff_t mb_goback (char const **, char const *, char const *);
 
 /* dfasearch.c */
diff --git a/src/searchutils.c b/src/searchutils.c
index 6f6ae0b..3ba3cdb 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -22,14 +22,9 @@
 #define SYSTEM_INLINE _GL_EXTERN_INLINE
 #include "search.h"
 
-#include <verify.h>
-
-/* For each byte B, word_start[B] is 1 if B is a single-byte character
-   that is a word constituent, 0 if B cannot start a word constituent,
-   and -1 if B might be or might not be the start of a word
-   constituent.  */
-static wint_t word_start[NCHAR];
-verify (WEOF != 0 && WEOF != 1);
+/* For each byte B, sbwordchar[B] is true if B is a single-byte
+   character that is a word constituent, and is false otherwise.  */
+static bool sbwordchar[NCHAR];
 
 /* Whether -w considers WC to be a word constituent.  */
 static bool
@@ -42,8 +37,7 @@ void
 wordinit (void)
 {
   for (int i = 0; i < NCHAR; i++)
-    word_start[i] = (localeinfo.sbclen[i] == -2 ? WEOF
-                     : wordchar (localeinfo.sbctowc[i]));
+    sbwordchar[i] = wordchar (localeinfo.sbctowc[i]);
 }
 
 kwset_t
@@ -94,23 +88,46 @@ mb_goback (char const **mb_start, char const *cur, char const *end)
 {
   const char *p = *mb_start;
   const char *p0 = p;
-  mbstate_t cur_state;
 
-  memset (&cur_state, 0, sizeof cur_state);
+  if (cur <= p)
+    return cur - p;
 
-  while (p < cur)
+  if (localeinfo.using_utf8)
     {
-      size_t clen = mb_clen (p, end - p, &cur_state);
-
-      if ((size_t) -2 <= clen)
+      p = cur;
+
+      if (cur < end && (*cur & 0xc0) == 0x80)
+        for (int i = 1; i <= 3; i++)
+          if ((cur[-i] & 0xc0) != 0x80)
+            {
+              mbstate_t mbs = { 0 };
+              size_t clen = mb_clen (cur - i, end - (cur - i), &mbs);
+              if (i < clen && clen < (size_t) -2)
+                {
+                  p0 = cur - i;
+                  p = p0 + clen;
+                }
+              break;
+            }
+    }
+  else
+    {
+      mbstate_t mbs = { 0 };
+      do
         {
-          /* An invalid sequence, or a truncated multibyte character.
-             Treat it as a single byte character.  */
-          clen = 1;
-          memset (&cur_state, 0, sizeof cur_state);
+          size_t clen = mb_clen (p, end - p, &mbs);
+
+          if ((size_t) -2 <= clen)
+            {
+              /* An invalid sequence, or a truncated multibyte character.
+                 Treat it as a single byte character.  */
+              clen = 1;
+              memset (&mbs, 0, sizeof mbs);
+            }
+          p0 = p;
+          p += clen;
         }
-      p0 = p;
-      p += clen;
+      while (p < cur);
     }
 
   *mb_start = p;
@@ -127,11 +144,11 @@ wordchars_count (char const *buf, char const *end, bool countall)
   mbstate_t mbs = { 0 };
   while (n < end - buf)
     {
-      wint_t ws = word_start[to_uchar (buf[n])];
-      if (ws == 0)
-        break;
-      else if (ws == 1)
+      unsigned char b = buf[n];
+      if (sbwordchar[b])
         n++;
+      else if (localeinfo.sbclen[b] != -2)
+        break;
       else
         {
           wchar_t wc = 0;
@@ -163,19 +180,19 @@ wordchar_next (char const *buf, char const *end)
   return wordchars_count (buf, end, false);
 }
 
-/* In the buffer BUF, return true if the character whose encoding
+/* In the buffer BUF, return nonzero if the character whose encoding
    contains the byte before CUR is a word constituent.  The buffer
    ends at END.  */
-bool
+size_t
 wordchar_prev (char const *buf, char const *cur, char const *end)
 {
   if (buf == cur)
-    return false;
-  cur--;
-  wint_t ws = word_start[to_uchar (*cur)];
-  if (! localeinfo.multibyte)
-    return ws == 1;
+    return 0;
+  unsigned char b = *--cur;
+  if (! localeinfo.multibyte
+      || (localeinfo.using_utf8 && localeinfo.sbclen[b] != -2))
+    return sbwordchar[b];
   char const *p = buf;
   cur -= mb_goback (&p, cur, end);
-  return wordchar_next (cur, end) != 0;
+  return wordchar_next (cur, end);
 }
-- 
2.7.4

From 2353c63efb1c471c752a1b6575f81dcafb67684b Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Fri, 23 Dec 2016 17:29:54 -0800
Subject: [PATCH 8/8] grep: fix comment in searchutils.c

---
 src/searchutils.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/searchutils.c b/src/searchutils.c
index 3ba3cdb..1552ed7 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -77,7 +77,7 @@ kwsinit (bool mb_trans)
    start of a multibyte character or is an error-encoding byte.  The
    buffer ends at END (i.e., one past the address of the buffer's last
    byte).  If CUR is already at a boundary, return 0.  If *MB_START is
-   greater than or equal to CUR, return the negative value CUR - *MB_START.
+   greater than CUR, return the negative value CUR - *MB_START.
 
    When returning zero, set *MB_START to CUR.  When returning a
    positive value, set *MB_START to the next boundary after CUR, or to
-- 
2.7.4

bug#21763: bug#22239: bug#22357: grep -f not only huge memory usage, but also huge time cost

Reply via email to