bug#18454: Improve performance when -P (PCRE) is used in UTF-8 locales

Paul Eggert Tue, 16 Sep 2014 18:44:35 -0700

I worked on this some more, and came up with the attached patchesproposed against the current grep Savannah master (commit9ea9254ea58456b84ed2f0c1481ca91cdd325bf7).

For years I've been wanting to write that last patch and I finally gotaround to it. It improves grep -P's performance by a factor of 1.2trillion on one (admittedly artificial) benchmark. I hope its 1 ZB/sscan rate is some kind of record. The last patch probably won't helpyour test cases, though I hope the other patches do help somewhat.

From 78e7dd5798a2bbf0eb717d7e65150a5e55c5ba91 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Mon, 15 Sep 2014 15:50:47 -0700
Subject: [PATCH 1/6] grep: refactor binary-vs-unknown-vs-text flags for
 clarity


* src/grep.c (enum textbin): New enum.
(textbin_is_binary): New function.
(buffer_textbin, file_textbin, grep): Use them, for clarity.
---
 src/grep.c | 86 ++++++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 55 insertions(+), 31 deletions(-)

diff --git a/src/grep.c b/src/grep.c
index e4379bc..1aa64db 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -437,16 +437,38 @@ clean_up_stdout (void)
     close_stdout ();
 }
 
-/* Return 1 if BUF (of size SIZE) contains text, -1 if it contains
-   binary data, and 0 if the answer depends on what comes immediately
-   after BUF.  */
-static int
+/* An enum textbin describes the file's type, inferred from data read
+   before the first line is selected for output.  */
+enum textbin
+  {
+    /* Binary, as it contains null bytes and the -z option is not in effect,
+       or it contains encoding errors.  */
+    TEXTBIN_BINARY = -1,
+
+    /* Not known yet.  Only text has been seen so far.  */
+    TEXTBIN_UNKNOWN = 0,
+
+    /* Text.  */
+    TEXTBIN_TEXT = 1
+  };
+
+static bool
+textbin_is_binary (enum textbin textbin)
+{
+  return textbin < TEXTBIN_UNKNOWN;
+}
+
+/* Return the text type of data in BUF, of size SIZE.  */
+static enum textbin
 buffer_textbin (char const *buf, size_t size)
 {
   char badbyte = eolbyte ? '\0' : '\200';
 
   if (MB_CUR_MAX <= 1)
-    return memchr (buf, badbyte, size) ? -1 : 1;
+    {
+      if (memchr (buf, badbyte, size))
+        return TEXTBIN_BINARY;
+    }
   else
     {
       mbstate_t mbs = { 0 };
@@ -456,35 +478,33 @@ buffer_textbin (char const *buf, size_t size)
       for (p = buf; p < buf + size; p += clen)
         {
           if (*p == badbyte)
-            return -1;
+            return TEXTBIN_BINARY;
           clen = mb_clen (p, buf + size - p, &mbs);
           if ((size_t) -2 <= clen)
-            return clen == (size_t) -2 ? 0 : -1;
+            return clen == (size_t) -2 ? TEXTBIN_UNKNOWN : TEXTBIN_BINARY;
         }
-
-      return 1;
     }
+
+  return TEXTBIN_TEXT;
 }
 
-/* Return 1 if a file is known to be text for the purpose of 'grep'.
-   Return -1 if it is known to be binary, 0 if unknown.
-   BUF, of size BUFSIZE, is the initial buffer read from the file with
-   descriptor FD and status ST.  */
-static int
+/* Return the text type of a file.  BUF, of size BUFSIZE, is the initial
+   buffer read from the file with descriptor FD and status ST.  */
+static enum textbin
 file_textbin (char const *buf, size_t bufsize, int fd, struct stat const *st)
 {
   #ifndef SEEK_HOLE
   enum { SEEK_HOLE = SEEK_END };
   #endif
 
-  int textbin = buffer_textbin (buf, bufsize);
-  if (textbin < 0)
+  enum textbin textbin = buffer_textbin (buf, bufsize);
+  if (textbin_is_binary (textbin))
     return textbin;
 
   if (usable_st_size (st))
     {
       if (st->st_size <= bufsize)
-        return 2 * textbin - 1;
+        return textbin == TEXTBIN_UNKNOWN ? TEXTBIN_BINARY : textbin;
 
       /* If the file has holes, it must contain a null byte somewhere.  */
       if (SEEK_HOLE != SEEK_END && eolbyte)
@@ -494,7 +514,7 @@ file_textbin (char const *buf, size_t bufsize, int fd, 
struct stat const *st)
             {
               cur = lseek (fd, 0, SEEK_CUR);
               if (cur < 0)
-                return 0;
+                return TEXTBIN_UNKNOWN;
             }
 
           /* Look for a hole after the current location.  */
@@ -504,12 +524,12 @@ file_textbin (char const *buf, size_t bufsize, int fd, 
struct stat const *st)
               if (lseek (fd, cur, SEEK_SET) < 0)
                 suppressible_error (filename, errno);
               if (hole_start < st->st_size)
-                return -1;
+                return TEXTBIN_BINARY;
             }
         }
     }
 
-  return 0;
+  return TEXTBIN_UNKNOWN;
 }
 
 /* Convert STR to a nonnegative integer, storing the result in *OUT.
@@ -1129,7 +1149,7 @@ static intmax_t
 grep (int fd, struct stat const *st)
 {
   intmax_t nlines, i;
-  int textbin;
+  enum textbin textbin;
   size_t residue, save;
   char oldc;
   char *beg;
@@ -1159,11 +1179,11 @@ grep (int fd, struct stat const *st)
     }
 
   if (binary_files == TEXT_BINARY_FILES)
-    textbin = 1;
+    textbin = TEXTBIN_TEXT;
   else
     {
       textbin = file_textbin (bufbeg, buflim - bufbeg, fd, st);
-      if (textbin < 0)
+      if (textbin_is_binary (textbin))
         {
           if (binary_files == WITHOUT_MATCH_BINARY_FILES)
             return 0;
@@ -1223,8 +1243,8 @@ grep (int fd, struct stat const *st)
       /* Detect whether leading context is adjacent to previous output.  */
       if (lastout)
         {
-          if (!textbin)
-            textbin = 1;
+          if (textbin == TEXTBIN_UNKNOWN)
+            textbin = TEXTBIN_TEXT;
           if (beg != lastout)
             lastout = 0;
         }
@@ -1243,12 +1263,16 @@ grep (int fd, struct stat const *st)
 
       /* If the file's textbin has not been determined yet, assume
          it's binary if the next input buffer suggests so.  */
-      if (! textbin && buffer_textbin (bufbeg, buflim - bufbeg) < 0)
+      if (textbin == TEXTBIN_UNKNOWN)
         {
-          textbin = -1;
-          if (binary_files == WITHOUT_MATCH_BINARY_FILES)
-            return 0;
-          done_on_match = out_quiet = true;
+          enum textbin tb = buffer_textbin (bufbeg, buflim - bufbeg);
+          if (textbin_is_binary (tb))
+            {
+              if (binary_files == WITHOUT_MATCH_BINARY_FILES)
+                return 0;
+              textbin = tb;
+              done_on_match = out_quiet = true;
+            }
         }
     }
   if (residue)
@@ -1263,7 +1287,7 @@ grep (int fd, struct stat const *st)
  finish_grep:
   done_on_match = done_on_match_0;
   out_quiet = out_quiet_0;
-  if (textbin < 0 && !out_quiet && nlines != 0)
+  if (textbin_is_binary (textbin) && !out_quiet && nlines != 0)
     printf (_("Binary file %s matches\n"), filename);
   return nlines;
 }
-- 
1.9.3

From 241a7623ca519a7ff72601de809fb52393494354 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Mon, 15 Sep 2014 16:18:00 -0700
Subject: [PATCH 2/6] grep: -z no longer considers '\200' to be binary data

This avoids a problem when using grep -z in a Windows-1252 locale.
Plus, it lets 'grep -z' run a bit faster.
* NEWS: Document this.
* src/grep.c (buffer_textbin): Don't look for '\200' if -z.
* tests/pcre-z: Test for new behavior.
---
 NEWS         |  2 ++
 src/grep.c   | 12 +++---------
 tests/pcre-z |  4 ++++
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/NEWS b/NEWS
index 9377d7d..51b63fb 100644
--- a/NEWS
+++ b/NEWS
@@ -26,6 +26,8 @@ GNU grep NEWS                                    -*- outline 
-*-
   In locales with multibyte character encodings other than UTF-8,
   grep -P now reports an error and exits instead of misbehaving.
 
+  grep -z no longer automatically treats the byte '\200' as binary data.
+
 * Noteworthy changes in release 2.20 (2014-06-03) [stable]
 
 ** Bug fixes
diff --git a/src/grep.c b/src/grep.c
index 1aa64db..1c6fee8 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -462,14 +462,10 @@ textbin_is_binary (enum textbin textbin)
 static enum textbin
 buffer_textbin (char const *buf, size_t size)
 {
-  char badbyte = eolbyte ? '\0' : '\200';
+  if (eolbyte && memchr (buf, '\0', size))
+    return TEXTBIN_BINARY;
 
-  if (MB_CUR_MAX <= 1)
-    {
-      if (memchr (buf, badbyte, size))
-        return TEXTBIN_BINARY;
-    }
-  else
+  if (1 < MB_CUR_MAX)
     {
       mbstate_t mbs = { 0 };
       size_t clen;
@@ -477,8 +473,6 @@ buffer_textbin (char const *buf, size_t size)
 
       for (p = buf; p < buf + size; p += clen)
         {
-          if (*p == badbyte)
-            return TEXTBIN_BINARY;
           clen = mb_clen (p, buf + size - p, &mbs);
           if ((size_t) -2 <= clen)
             return clen == (size_t) -2 ? TEXTBIN_UNKNOWN : TEXTBIN_BINARY;
diff --git a/tests/pcre-z b/tests/pcre-z
index 99ebc43..6bbde94 100755
--- a/tests/pcre-z
+++ b/tests/pcre-z
@@ -20,4 +20,8 @@ grep -Pz "$REGEX" in > out 2>err || fail=1
 compare exp out || fail=1
 compare /dev/null err || fail=1
 
+printf '\200\0' >in0
+LC_ALL=C grep -z . in0 >out || fail=1
+compare in0 out || fail=1
+
 Exit $fail
-- 
1.9.3

From c0c690be150d26097c3581039c7e882b2a3e19d8 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Mon, 15 Sep 2014 17:15:06 -0700
Subject: [PATCH 3/6] grep: non-text bytes in binary data may be treated as
 line ends

* NEWS, doc/grep.texi (File and Directory Selection):
Document this change.
* src/grep.c (zap_nuls): New function.
(grep): Use it.
* tests/null-byte: Relax to allow new behavior.
---
 NEWS            |  3 +++
 doc/grep.texi   |  2 ++
 src/grep.c      | 28 +++++++++++++++++++++++++++-
 tests/null-byte |  4 ++--
 4 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/NEWS b/NEWS
index 51b63fb..733318d 100644
--- a/NEWS
+++ b/NEWS
@@ -26,6 +26,9 @@ GNU grep NEWS                                    -*- outline 
-*-
   In locales with multibyte character encodings other than UTF-8,
   grep -P now reports an error and exits instead of misbehaving.
 
+  When searching binary data, grep now may treat non-text bytes as
+  line terminators.  This can boost performance significantly.
+
   grep -z no longer automatically treats the byte '\200' as binary data.
 
 * Noteworthy changes in release 2.20 (2014-06-03) [stable]
diff --git a/doc/grep.texi b/doc/grep.texi
index 14bd69e..d7adcad 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -600,6 +600,8 @@ By default, @var{type} is @samp{binary},
 and @command{grep} normally outputs either
 a one-line message saying that a binary file matches,
 or no message if there is no match.
+When matching binary data, @command{grep} may treat non-text
+bytes as line terminators.
 
 If @var{type} is @samp{without-match},
 @command{grep} assumes that a binary file does not match;
diff --git a/src/grep.c b/src/grep.c
index 1c6fee8..83559e2 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -1093,9 +1093,30 @@ prtext (char const *beg, char const *lim)
   outleft -= n;
 }
 
+/* Replace all NUL bytes in buffer P (which ends at LIM) with EOL.
+   This avoids running out of memory when binary input contains a long
+   sequence of zeros, which would otherwise be considered to be part
+   of a long line.  P[LIM] should be EOL.  */
+static void
+zap_nuls (char *p, char *lim, char eol)
+{
+  if (eol)
+    while (true)
+      {
+        *lim = '\0';
+        p += strlen (p);
+        *lim = eol;
+        if (p == lim)
+          break;
+        do
+          *p++ = eol;
+        while (!*p);
+      }
+}
+
 /* Scan the specified portion of the buffer, matching lines (or
    between matching lines if OUT_INVERT is true).  Return a count of
-   lines printed. */
+   lines printed.  Replace all NUL bytes with NUL_ZAPPER as we go.  */
 static intmax_t
 grepbuf (char const *beg, char const *lim)
 {
@@ -1149,6 +1170,7 @@ grep (int fd, struct stat const *st)
   char *beg;
   char *lim;
   char eol = eolbyte;
+  char nul_zapper = '\0';
   bool done_on_match_0 = done_on_match;
   bool out_quiet_0 = out_quiet;
 
@@ -1182,6 +1204,7 @@ grep (int fd, struct stat const *st)
           if (binary_files == WITHOUT_MATCH_BINARY_FILES)
             return 0;
           done_on_match = out_quiet = true;
+          nul_zapper = eol;
         }
     }
 
@@ -1197,6 +1220,8 @@ grep (int fd, struct stat const *st)
       if (beg == buflim)
         break;
 
+      zap_nuls (beg, buflim, nul_zapper);
+
       /* Determine new residue (the length of an incomplete line at the end of
          the buffer, 0 means there is no incomplete last line).  */
       oldc = beg[-1];
@@ -1266,6 +1291,7 @@ grep (int fd, struct stat const *st)
                 return 0;
               textbin = tb;
               done_on_match = out_quiet = true;
+              nul_zapper = eol;
             }
         }
     }
diff --git a/tests/null-byte b/tests/null-byte
index c967dbc..1d80bfe 100755
--- a/tests/null-byte
+++ b/tests/null-byte
@@ -38,8 +38,8 @@ for left in '' a '#' '\0'; do
           pat="$hat$force_regex$data$dollar"
           printf "$pat\\n" >pat || framework_failure_
           for locale in $locales; do
-            LC_ALL=$locale grep -f pat in ||
-              fail_ "'$pat' does not match '$data'"
+            LC_ALL=$locale grep -f pat in
+            test $? -eq 0 || test $? -eq 1 || fail_ "'$pat' caused an error"
             LC_ALL=$locale grep -a -f pat in | cmp -s - in ||
               fail_ "-a '$pat' does not match '$data'"
           done
-- 
1.9.3

From f16313873cc8686c15d9b71f9afea7b9b4c4d14b Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Mon, 15 Sep 2014 19:56:55 -0700
Subject: [PATCH 4/6] grep: minor -P speedup with jit_stack

* src/pcresearch.c (jit_stack): No longer static.
---
 src/pcresearch.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/pcresearch.c b/src/pcresearch.c
index c41f7ef..1b15e53 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -33,9 +33,7 @@ static pcre *cre;
 /* Additional information about the pattern.  */
 static pcre_extra *extra;
 
-# ifdef PCRE_STUDY_JIT_COMPILE
-static pcre_jit_stack *jit_stack;
-# else
+# ifndef PCRE_STUDY_JIT_COMPILE
 #  define PCRE_STUDY_JIT_COMPILE 0
 # endif
 #endif
@@ -126,7 +124,7 @@ Pcompile (char const *pattern, size_t size)
       /* A 32K stack is allocated for the machine code by default, which
          can grow to 512K if necessary. Since JIT uses far less memory
          than the interpreter, this should be enough in practice.  */
-      jit_stack = pcre_jit_stack_alloc (32 * 1024, 512 * 1024);
+      pcre_jit_stack *jit_stack = pcre_jit_stack_alloc (32 * 1024, 512 * 1024);
       if (!jit_stack)
         error (EXIT_TROUBLE, 0,
                _("failed to allocate memory for the PCRE JIT stack"));
-- 
1.9.3

From e7ca252202a34850b8b85e8507b4b9ed75ef8cdf Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Tue, 16 Sep 2014 15:48:44 -0700
Subject: [PATCH 5/6] grep: improve -P performance in typical cases

* src/grep.c, src/grep.h (enum textbin): Move to grep.h.
(input_textbin, validated_boundary): New vars.
* src/grep.c (grepbuf, grep): Initialize them.
* src/pcresearch.c (Pexecute): Do a multiline search
when the input is known to be free of encoding errors.
Quickly discard bytes that are obviously encoding errors.
Quickly match empty strings.
---
 src/grep.c       |  19 ++-------
 src/grep.h       |  22 ++++++++++
 src/pcresearch.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++--------
 3 files changed, 130 insertions(+), 31 deletions(-)

diff --git a/src/grep.c b/src/grep.c
index 83559e2..e3c4925 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -351,6 +351,8 @@ bool match_icase;
 bool match_words;
 bool match_lines;
 unsigned char eolbyte;
+enum textbin input_textbin;
+char const *validated_boundary;
 
 static char const *matcher;
 
@@ -437,21 +439,6 @@ clean_up_stdout (void)
     close_stdout ();
 }
 
-/* An enum textbin describes the file's type, inferred from data read
-   before the first line is selected for output.  */
-enum textbin
-  {
-    /* Binary, as it contains null bytes and the -z option is not in effect,
-       or it contains encoding errors.  */
-    TEXTBIN_BINARY = -1,
-
-    /* Not known yet.  Only text has been seen so far.  */
-    TEXTBIN_UNKNOWN = 0,
-
-    /* Text.  */
-    TEXTBIN_TEXT = 1
-  };
-
 static bool
 textbin_is_binary (enum textbin textbin)
 {
@@ -1123,6 +1110,7 @@ grepbuf (char const *beg, char const *lim)
   intmax_t outleft0 = outleft;
   char const *p;
   char const *endp;
+  validated_boundary = beg;
 
   for (p = beg; p < lim; p = endp)
     {
@@ -1210,6 +1198,7 @@ grep (int fd, struct stat const *st)
 
   for (;;)
     {
+      input_textbin = textbin;
       lastnl = bufbeg;
       if (lastout)
         lastout = bufbeg;
diff --git a/src/grep.h b/src/grep.h
index 5496eb2..23d4e95 100644
--- a/src/grep.h
+++ b/src/grep.h
@@ -29,4 +29,26 @@ extern bool match_words;     /* -w */
 extern bool match_lines;       /* -x */
 extern unsigned char eolbyte;  /* -z */
 
+/* An enum textbin describes the file's type, inferred from data read
+   before the first line is selected for output.  */
+enum textbin
+  {
+    /* Binary, as it contains null bytes and the -z option is not in effect,
+       or it contains encoding errors.  */
+    TEXTBIN_BINARY = -1,
+
+    /* Not known yet.  Only text has been seen so far.  */
+    TEXTBIN_UNKNOWN = 0,
+
+    /* Text.  */
+    TEXTBIN_TEXT = 1
+  };
+
+/* Input file type.  */
+extern enum textbin input_textbin;
+
+/* Validation boundary.  Earlier bytes have already been validated by
+   the PCRE matcher, which cares about this sort of thing.  */
+extern char const *validated_boundary;
+
 #endif
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 1b15e53..6f016b6 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -156,28 +156,91 @@ Pexecute (char const *buf, size_t size, size_t 
*match_size,
   char const *line_start = buf;
   int e = PCRE_ERROR_NOMATCH;
   char const *line_end;
+  char const *validated = validated_boundary;
+
+  /* If the input type is unknown, the caller is still testing the
+     input, which means the current buffer cannot contain encoding
+     errors and a multiline search is typically more efficient.
+     Otherwise, a single-line search is typically faster, so that
+     pcre_exec doesn't waste time validating the entire input
+     buffer.  */
+  bool multiline = input_textbin == TEXTBIN_UNKNOWN;
 
-  /* pcre_exec mishandles matches that cross line boundaries.
-     PCRE_MULTILINE isn't a win, partly because it's incompatible with
-     -z, and partly because it checks the entire input buffer and is
-     therefore slow on a large buffer containing many matches.
-     Avoid these problems by matching line-by-line.  */
   for (; p < buf + size; p = line_start = line_end + 1)
     {
-      line_end = memchr (p, eolbyte, buf + size - p);
+      bool too_big;
 
-      if (INT_MAX < line_end - p)
+      if (multiline)
+        {
+          size_t pcre_size_max = MIN (INT_MAX, SIZE_MAX - 1);
+          size_t scan_size = MIN (pcre_size_max + 1, buf + size - p);
+          line_end = memrchr (p, eolbyte, scan_size);
+          too_big = ! line_end;
+        }
+      else
+        {
+          line_end = memchr (p, eolbyte, buf + size - p);
+          too_big = INT_MAX < line_end - p;
+        }
+
+      if (too_big)
         error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
 
-      /* Treat encoding-error bytes as data that cannot match.  */
       for (;;)
         {
-          int options = bol ? 0 : PCRE_NOTBOL;
-          int valid_bytes;
-          e = pcre_exec (cre, extra, p, line_end - p, 0, options, sub, NSUB);
-          if (e != PCRE_ERROR_BADUTF8)
-            break;
-          valid_bytes = sub[0];
+          /* Skip past bytes that are easily determined to be encoding
+             errors, treating them as data that cannot match.  This is
+             faster than having pcre_exec check them.  */
+          while (mbclen_cache[to_uchar (*p)] == (size_t) -1)
+            {
+              p++;
+              bol = false;
+            }
+
+          /* Check for an empty match; this is faster than letting
+             pcre_exec do it.  */
+          int search_bytes = line_end - p;
+          if (search_bytes == 0)
+            {
+              sub[0] = sub[1] = 0;
+              e = empty_match[bol];
+              break;
+            }
+
+          int options = 0;
+          if (!bol)
+            options |= PCRE_NOTBOL;
+          if (multiline || p + search_bytes <= validated)
+            options |= PCRE_NO_UTF8_CHECK;
+
+          int valid_bytes = validated - p;
+          if (valid_bytes < 0)
+            {
+              e = pcre_exec (cre, extra, p, search_bytes, 0,
+                             options, sub, NSUB);
+              if (e != PCRE_ERROR_BADUTF8)
+                {
+                  validated = p + search_bytes;
+                  if (0 < e && multiline && sub[1] - sub[0] != 0)
+                    {
+                      char const *nl = memchr (p + sub[0], eolbyte,
+                                               sub[1] - sub[0]);
+                      if (nl)
+                        {
+                          /* This match crosses a line boundary; reject it.  */
+                          p += sub[0];
+                          line_end = nl;
+                          continue;
+                        }
+                    }
+                  break;
+                }
+              valid_bytes = sub[0];
+              validated = p + valid_bytes;
+            }
+
+          /* Try to match the string before the encoding error.
+             Again, handle the empty-match case specially, for speed.  */
           if (valid_bytes == 0)
             {
               sub[1] = 0;
@@ -189,6 +252,8 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
                            sub, NSUB);
           if (e != PCRE_ERROR_NOMATCH)
             break;
+
+          /* Treat the encoding error as data that cannot match.  */
           p += valid_bytes + 1;
           bol = false;
         }
@@ -198,6 +263,8 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
       bol = true;
     }
 
+  validated_boundary = validated;
+
   if (e <= 0)
     {
       switch (e)
@@ -224,8 +291,29 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
     }
   else
     {
-      char const *beg = start_ptr ? p + sub[0] : line_start;
-      char const *end = start_ptr ? p + sub[1] : line_end + 1;
+      char const *matchbeg = p + sub[0];
+      char const *matchend = p + sub[1];
+      char const *beg;
+      char const *end;
+      if (start_ptr)
+        {
+          beg = matchbeg;
+          end = matchend;
+        }
+      else if (multiline)
+        {
+          char const *prev_nl = memrchr (line_start - 1, eolbyte,
+                                         matchbeg - (line_start - 1));
+          char const *next_nl = memchr (matchend, eolbyte,
+                                        line_end + 1 - matchend);
+          beg = prev_nl + 1;
+          end = next_nl + 1;
+        }
+      else
+        {
+          beg = line_start;
+          end = line_end + 1;
+        }
       *match_size = end - beg;
       return beg - buf;
     }
-- 
1.9.3

From 1f1a74e1423954a687bff4b7945101445dd57061 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Tue, 16 Sep 2014 18:19:53 -0700
Subject: [PATCH 6/6] grep: skip past holes efficiently

Take advantage of the relaxed rules for treating non-text bytes in
binary data, by efficiently skipping past holes on platforms
supporting lseek's SEEK_DATA flag.
On one test on a circa-2008 Sun Fire V40z running Solaris 11.2,
'grep x' took 0.009 real-time seconds to scan a holey file of size
9,223,372,036,854,775,802 bytes, for a nominal scan rate of 1 ZB/s.
grep 2.20's scan rate on this platform was 843 MB/s, so this is a
speedup by a factor of 1.2 trillion.  The speedup factor is not
as great on GNU/Linux hosts, due to what appear to be SEEK_DATA
inefficiencies, but presumably this will be cleared up in time.
* NEWS: Document this.
* src/grep.c, src/grep.h (eolbyte): Now char, not unsigned char.
This is for compatibility with the rest of the code.
The old (performance?) reasons for 'unsigned char' are not moot.
* src/grep.c (skip_nuls, skip_empty_lines, seek_data_failed):
New static vars.
(totalnl): Move up, since it's about input, not output, and
fillbuf now uses it.
(add_count): Move up, since fillbuf now uses it.
(all_zeros): New function.
(fillbuf): Use SEEK_DATA to skip past holes efficiently,
on systems that support this.
(grep, main): Set the new static vars.
---
 NEWS       |  3 +++
 src/grep.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++---------------
 src/grep.h |  2 +-
 3 files changed, 62 insertions(+), 19 deletions(-)

diff --git a/NEWS b/NEWS
index 733318d..4e0195c 100644
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,9 @@ GNU grep NEWS                                    -*- outline -*-
 
 ** Improvements
 
+  Performance has been greatly improved for searching files containing
+  holes, on platforms where lseek's SEEK_HOLE flag works efficiently.
+
   Performance has improved for very long strings in patterns.
 
   If a file contains data improperly encoded for the current locale,
diff --git a/src/grep.c b/src/grep.c
index e3c4925..3e94804 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -350,7 +350,7 @@ static struct option const long_options[] =
 bool match_icase;
 bool match_words;
 bool match_lines;
-unsigned char eolbyte;
+char eolbyte;
 enum textbin input_textbin;
 char const *validated_boundary;
 
@@ -563,6 +563,10 @@ static off_t bufoffset;            /* Read offset; defined 
on regular files.  */
 static off_t after_last_match; /* Pointer after last matching line that
                                    would have been output if we were
                                    outputting characters. */
+static bool skip_nuls;         /* Skip '\0' in data.  */
+static bool skip_empty_lines;  /* Skip empty lines in data.  */
+static bool seek_data_failed;  /* lseek with SEEK_DATA failed.  */
+static uintmax_t totalnl;      /* Total newline count before lastnl. */
 
 /* Return VAL aligned to the next multiple of ALIGNMENT.  VAL can be
    an integer or a pointer.  Both args must be free of side effects.  */
@@ -571,6 +575,27 @@ static off_t after_last_match;     /* Pointer after last 
matching line that
    ? (val) \
    : (val) + ((alignment) - (size_t) (val) % (alignment)))
 
+/* Add two numbers that count input bytes or lines, and report an
+   error if the addition overflows.  */
+static uintmax_t
+add_count (uintmax_t a, uintmax_t b)
+{
+  uintmax_t sum = a + b;
+  if (sum < a)
+    error (EXIT_TROUBLE, 0, _("input is too large to count"));
+  return sum;
+}
+
+/* Return true if BUF (of size SIZE) is all zeros.  */
+static bool
+all_zeros (char const *buf, size_t size)
+{
+  for (char const *p = buf; p < buf + size; p++)
+    if (*p)
+      return false;
+  return true;
+}
+
 /* Reset the buffer for a new file, returning false if we should skip it.
    Initialize on the first time through. */
 static bool
@@ -674,13 +699,33 @@ fillbuf (size_t save, struct stat const *st)
   readsize = buffer + bufalloc - readbuf;
   readsize -= readsize % pagesize;
 
-  fillsize = safe_read (bufdesc, readbuf, readsize);
-  if (fillsize == SAFE_READ_ERROR)
+  while (true)
     {
-      fillsize = 0;
-      cc = false;
+      fillsize = safe_read (bufdesc, readbuf, readsize);
+      if (fillsize == SAFE_READ_ERROR)
+        {
+          fillsize = 0;
+          cc = false;
+        }
+      bufoffset += fillsize;
+
+      if (fillsize == 0 || !skip_nuls || !all_zeros (readbuf, fillsize))
+        break;
+      totalnl = add_count (totalnl, fillsize);
+
+      if (!seek_data_failed)
+        {
+          off_t data_start = lseek (bufdesc, bufoffset, SEEK_DATA);
+          if (data_start < 0)
+            seek_data_failed = true;
+          else
+            {
+              totalnl = add_count (totalnl, data_start - bufoffset);
+              bufoffset = data_start;
+            }
+        }
     }
-  bufoffset += fillsize;
+
   fillsize = undossify_input (readbuf, fillsize);
   buflim = readbuf + fillsize;
   return cc;
@@ -717,7 +762,6 @@ static char const *lastnl;  /* Pointer after last newline 
counted. */
 static char const *lastout;    /* Pointer after last character output;
                                    NULL if no character has been output
                                    or if it's conceptually before bufbeg. */
-static uintmax_t totalnl;      /* Total newline count before lastnl. */
 static intmax_t outleft;       /* Maximum number of lines to be output.  */
 static intmax_t pending;       /* Pending lines of output.
                                    Always kept 0 if out_quiet is true.  */
@@ -726,17 +770,6 @@ static bool exit_on_match; /* Exit on first match.  */
 
 #include "dosbuf.c"
 
-/* Add two numbers that count input bytes or lines, and report an
-   error if the addition overflows.  */
-static uintmax_t
-add_count (uintmax_t a, uintmax_t b)
-{
-  uintmax_t sum = a + b;
-  if (sum < a)
-    error (EXIT_TROUBLE, 0, _("input is too large to count"));
-  return sum;
-}
-
 static void
 nlscan (char const *lim)
 {
@@ -1171,6 +1204,8 @@ grep (int fd, struct stat const *st)
   outleft = max_count;
   after_last_match = 0;
   pending = 0;
+  skip_nuls = skip_empty_lines && !eol;
+  seek_data_failed = false;
 
   nlines = 0;
   residue = 0;
@@ -1193,6 +1228,7 @@ grep (int fd, struct stat const *st)
             return 0;
           done_on_match = out_quiet = true;
           nul_zapper = eol;
+          skip_nuls = skip_empty_lines;
         }
     }
 
@@ -1281,6 +1317,7 @@ grep (int fd, struct stat const *st)
               textbin = tb;
               done_on_match = out_quiet = true;
               nul_zapper = eol;
+              skip_nuls = skip_empty_lines;
             }
         }
     }
@@ -2390,6 +2427,9 @@ main (int argc, char **argv)
 
   compile (keys, keycc);
   free (keys);
+  size_t match_size;
+  skip_empty_lines = ((execute (&eolbyte, 1, &match_size, NULL) == 0)
+                      == out_invert);
 
   if ((argc - optind > 1 && !no_filenames) || with_filenames)
     out_file = 1;
diff --git a/src/grep.h b/src/grep.h
index 23d4e95..86259fb 100644
--- a/src/grep.h
+++ b/src/grep.h
@@ -27,7 +27,7 @@
 extern bool match_icase;       /* -i */
 extern bool match_words;       /* -w */
 extern bool match_lines;       /* -x */
-extern unsigned char eolbyte;  /* -z */
+extern char eolbyte;           /* -z */
 
 /* An enum textbin describes the file's type, inferred from data read
    before the first line is selected for output.  */
-- 
1.9.3

bug#18454: Improve performance when -P (PCRE) is used in UTF-8 locales

Reply via email to