bug#20526: grep BUG: text file is detected as binary

Paul Eggert Wed, 06 Jan 2016 00:33:52 -0800

Paul Eggert wrote:

grep -rP 'fed.*cba' .


On my machine the above command is 125x slower with the new grep than the old
one, which suggests some tuning is in order before releasing. (It's bogged down
inside libpcre somewhere.)


I installed the attached patch, which fixed this performance bug for me.

From 6e8f5b27ab033f4551e61740c1bdd6ffa13e9047 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Wed, 6 Jan 2016 00:26:26 -0800
Subject: [PATCH] grep: restore -P PCRE_NO_UTF8_CHECK optimization

On my platform in the en_US.utf8 locale, this makes 'grep -P "z.*a" k'
220x faster, where k is created by the shell command:
yes 'abcdefg hijklmn opqrstu vwxyz' | head -n 10000000 >k
* src/dfasearch.c (EGexecute):
* src/grep.c (execute_fp_t):
* src/kwsearch.c (Fexecute):
* src/pcresearch.c (Pexecute):
First arg is now char *, not char const *, since Pexecute now
temporarily modifies the buffer that this argument points to.
* src/grep.c, src/grep.h (buf_has_encoding_errors): Now extern.
* src/pcresearch.c (Pexecute): Use it.  If the input is free of
encoding errors, use a multiline search and the PCRE_NO_UTF8_CHECK
option, as this is typically way faster.  This restores an
optimization that was removed with the recent changes for binary
file detection.
---
 src/dfasearch.c  |  2 +-
 src/grep.c       |  4 ++--
 src/grep.h       |  2 ++
 src/kwsearch.c   |  2 +-
 src/pcresearch.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++-------
 5 files changed, 56 insertions(+), 11 deletions(-)

diff --git a/src/dfasearch.c b/src/dfasearch.c
index 0205011..a330eac 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -202,7 +202,7 @@ GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
 }
 
 size_t
-EGexecute (char const *buf, size_t size, size_t *match_size,
+EGexecute (char *buf, size_t size, size_t *match_size,
            char const *start_ptr)
 {
   char const *buflim, *beg, *end, *ptr, *match, *best_match, *mb_start;
diff --git a/src/grep.c b/src/grep.c
index f6fb0bc..10aabf9 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -462,7 +462,7 @@ enum { SEEK_HOLE = SEEK_SET };
 
 /* Functions we'll use to search. */
 typedef void (*compile_fp_t) (char const *, size_t);
-typedef size_t (*execute_fp_t) (char const *, size_t, size_t *, char const *);
+typedef size_t (*execute_fp_t) (char *, size_t, size_t *, char const *);
 static compile_fp_t compile;
 static execute_fp_t execute;
 
@@ -561,7 +561,7 @@ skip_easy_bytes (char const *buf)
 /* Return true if BUF, of size SIZE, has an encoding error.
    BUF must be followed by at least sizeof (uword) bytes,
    the first of which may be modified.  */
-static bool
+bool
 buf_has_encoding_errors (char *buf, size_t size)
 {
   if (! unibyte_mask)
diff --git a/src/grep.h b/src/grep.h
index 577fb72..75b7ef7 100644
--- a/src/grep.h
+++ b/src/grep.h
@@ -29,4 +29,6 @@ extern bool match_words;	/* -w */
 extern bool match_lines;	/* -x */
 extern char eolbyte;		/* -z */
 
+extern bool buf_has_encoding_errors (char *, size_t);
+
 #endif
diff --git a/src/kwsearch.c b/src/kwsearch.c
index e33caaf..e9966d4 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -78,7 +78,7 @@ Fcompile (char const *pattern, size_t size)
 }
 
 size_t
-Fexecute (char const *buf, size_t size, size_t *match_size,
+Fexecute (char *buf, size_t size, size_t *match_size,
           char const *start_ptr)
 {
   char const *beg, *try, *end, *mb_start;
diff --git a/src/pcresearch.c b/src/pcresearch.c
index a647514..8f3d935 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -174,7 +174,7 @@ Pcompile (char const *pattern, size_t size)
 }
 
 size_t
-Pexecute (char const *buf, size_t size, size_t *match_size,
+Pexecute (char *buf, size_t size, size_t *match_size,
           char const *start_ptr)
 {
 #if !HAVE_LIBPCRE
@@ -194,13 +194,31 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
      error.  */
   char const *subject = buf;
 
+  /* If the input is free of encoding errors a multiline search is
+     typically more efficient.  Otherwise, a single-line search is
+     typically faster, so that pcre_exec doesn't waste time validating
+     the entire input buffer.  */
+  bool multiline = ! buf_has_encoding_errors (buf, size - 1);
+  buf[size - 1] = eolbyte;
+
   for (; p < buf + size; p = line_start = line_end + 1)
     {
-      /* A single-line search is typically faster, so that
-         pcre_exec doesn't waste time validating the entire input
-         buffer.  */
-      line_end = memchr (p, eolbyte, buf + size - p);
-      if (INT_MAX < line_end - p)
+      bool too_big;
+
+      if (multiline)
+        {
+          size_t pcre_size_max = MIN (INT_MAX, SIZE_MAX - 1);
+          size_t scan_size = MIN (pcre_size_max + 1, buf + size - p);
+          line_end = memrchr (p, eolbyte, scan_size);
+          too_big = ! line_end;
+        }
+      else
+        {
+          line_end = memchr (p, eolbyte, buf + size - p);
+          too_big = INT_MAX < line_end - p;
+        }
+
+      if (too_big)
         error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
 
       for (;;)
@@ -228,11 +246,27 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
           int options = 0;
           if (!bol)
             options |= PCRE_NOTBOL;
+          if (multiline)
+            options |= PCRE_NO_UTF8_CHECK;
 
           e = jit_exec (subject, line_end - subject, search_offset,
                         options, sub);
           if (e != PCRE_ERROR_BADUTF8)
-            break;
+            {
+              if (0 < e && multiline && sub[1] - sub[0] != 0)
+                {
+                  char const *nl = memchr (subject + sub[0], eolbyte,
+                                           sub[1] - sub[0]);
+                  if (nl)
+                    {
+                      /* This match crosses a line boundary; reject it.  */
+                      p = subject + sub[0];
+                      line_end = nl;
+                      continue;
+                    }
+                }
+              break;
+            }
           int valid_bytes = sub[0];
 
           /* Try to match the string before the encoding error.  */
@@ -304,6 +338,15 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
           beg = matchbeg;
           end = matchend;
         }
+      else if (multiline)
+        {
+          char const *prev_nl = memrchr (line_start - 1, eolbyte,
+                                         matchbeg - (line_start - 1));
+          char const *next_nl = memchr (matchend, eolbyte,
+                                        line_end + 1 - matchend);
+          beg = prev_nl + 1;
+          end = next_nl + 1;
+        }
       else
         {
           beg = line_start;
-- 
2.5.0

bug#20526: grep BUG: text file is detected as binary

Reply via email to