Attached are some proposed patches which should improve the performance of grep -P when applied to binary files, among other things. I have some other ideas for boosting performance further but thought I'd publish these first. Please give them a try if you have the time. I doubt whether this will "solve" the performance problem entirely with -P and encoding errors but at least it should be heading in the right direction.
From ad34b7d8556e9fc274690666ac6ded2b6576feb3 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sun, 14 Sep 2014 11:42:08 -0700
Subject: [PROPOSED PATCH 1/6] grep: remove/refactor unnecessary code about
 line splitting

* src/grep.c (do_execute): Remove.  Caller now uses 'execute'.
* src/pcresearch.c (Pexecute): Improve comment about this.
---
 src/grep.c       | 45 +--------------------------------------------
 src/pcresearch.c |  7 +++++--
 2 files changed, 6 insertions(+), 46 deletions(-)

diff --git a/src/grep.c b/src/grep.c
index 1f801e9..719dff1 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -1048,49 +1048,6 @@ prtext (char const *beg, char const *lim)
   outleft -= n;
 }
 
-/* Invoke the matcher, EXECUTE, on buffer BUF of SIZE bytes.  If there
-   is no match, return (size_t) -1.  Otherwise, set *MATCH_SIZE to the
-   length of the match and return the offset of the start of the match.  */
-static size_t
-do_execute (char const *buf, size_t size, size_t *match_size)
-{
-  size_t result;
-  const char *line_next;
-
-  /* With the current implementation, using --ignore-case with a multi-byte
-     character set is very inefficient when applied to a large buffer
-     containing many matches.  We can avoid much of the wasted effort
-     by matching line-by-line.
-
-     FIXME: this is just an ugly workaround, and it doesn't really
-     belong here.  Also, PCRE is always using this same per-line
-     matching algorithm.  Either we fix -i, or we should refactor
-     this code---for example, we could add another function pointer
-     to struct matcher to split the buffer passed to execute.  It would
-     perform the memchr if line-by-line matching is necessary, or just
-     return buf + size otherwise.  */
-  if (! (execute == Fexecute || execute == Pexecute)
-      || MB_CUR_MAX == 1 || !match_icase)
-    return execute (buf, size, match_size, NULL);
-
-  for (line_next = buf; line_next < buf + size; )
-    {
-      const char *line_buf = line_next;
-      const char *line_end = memchr (line_buf, eolbyte,
-                                     (buf + size) - line_buf);
-      if (line_end == NULL)
-        line_next = line_end = buf + size;
-      else
-        line_next = line_end + 1;
-
-      result = execute (line_buf, line_next - line_buf, match_size, NULL);
-      if (result != (size_t) -1)
-        return (line_buf - buf) + result;
-    }
-
-  return (size_t) -1;
-}
-
 /* Scan the specified portion of the buffer, matching lines (or
    between matching lines if OUT_INVERT is true).  Return a count of
    lines printed. */
@@ -1104,7 +1061,7 @@ grepbuf (char const *beg, char const *lim)
   for (p = beg; p < lim; p = endp)
     {
       size_t match_size;
-      size_t match_offset = do_execute (p, lim - p, &match_size);
+      size_t match_offset = execute (p, lim - p, &match_size, NULL);
       if (match_offset == (size_t) -1)
         {
           if (!out_invert)
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 3475d4a..0c5220d 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -149,8 +149,11 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
   int e = PCRE_ERROR_NOMATCH;
   char const *line_end;
 
-  /* PCRE can't limit the matching to single lines, therefore we have to
-     match each line in the buffer separately.  */
+  /* pcre_exec mishandles matches that cross line boundaries.
+     PCRE_MULTILINE isn't a win, partly because it's incompatible with
+     -z, and partly because it checks the entire input buffer and is
+     therefore slow on a large buffer containing many matches.
+     Avoid these problems by matching line-by-line.  */
   for (; p < buf + size; p = line_start = line_end + 1)
     {
       line_end = memchr (p, eolbyte, buf + size - p);
-- 
1.9.3

From b7b7711dd072c335a45dbf09115b1597fed2ae76 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sun, 14 Sep 2014 11:44:12 -0700
Subject: [PROPOSED PATCH 2/6] grep: speed up -P on files containing many
 multibyte errors

* src/pcresearch.c (empty_match): New var.
(Pcompile): Set it.
(Pexecute): Use it.
---
 src/pcresearch.c | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/src/pcresearch.c b/src/pcresearch.c
index 0c5220d..95877e3 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -33,6 +33,10 @@ static pcre *cre;
 /* Additional information about the pattern.  */
 static pcre_extra *extra;
 
+/* Table, indexed by ! (flag & PCRE_NOTBOL), of whether the empty
+   string matches when that flag is used.  */
+static int empty_match[2];
+
 # ifdef PCRE_STUDY_JIT_COMPILE
 static pcre_jit_stack *jit_stack;
 # else
@@ -124,6 +128,10 @@ Pcompile (char const *pattern, size_t size)
                _("failed to allocate memory for the PCRE JIT stack"));
       pcre_assign_jit_stack (extra, NULL, jit_stack);
     }
+
+  empty_match[false] = pcre_exec (cre, extra, "", 0, 0, PCRE_NOTBOL, NULL, 0);
+  empty_match[true] = pcre_exec (cre, extra, "", 0, 0, 0, NULL, 0);
+
 # endif
   free (re);
 #endif /* HAVE_LIBPCRE */
@@ -144,7 +152,7 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
   int sub[nsub];
 
   char const *p = start_ptr ? start_ptr : buf;
-  int options = p == buf || p[-1] == eolbyte ? 0 : PCRE_NOTBOL;
+  bool bol = p[-1] == eolbyte;
   char const *line_start = buf;
   int e = PCRE_ERROR_NOMATCH;
   char const *line_end;
@@ -164,23 +172,26 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
       /* Treat encoding-error bytes as data that cannot match.  */
       for (;;)
         {
+          int options = bol ? 0 : PCRE_NOTBOL;
           int valid_bytes;
           e = pcre_exec (cre, extra, p, line_end - p, 0, options, sub, nsub);
           if (e != PCRE_ERROR_BADUTF8)
             break;
           valid_bytes = sub[0];
-          e = pcre_exec (cre, extra, p, valid_bytes, 0,
-                         options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL,
-                         sub, nsub);
+          e = (valid_bytes == 0
+               ? empty_match[bol]
+               : pcre_exec (cre, extra, p, valid_bytes, 0,
+                            options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL,
+                            sub, nsub));
           if (e != PCRE_ERROR_NOMATCH)
             break;
           p += valid_bytes + 1;
-          options = PCRE_NOTBOL;
+          bol = false;
         }
 
       if (e != PCRE_ERROR_NOMATCH)
         break;
-      options = 0;
+      bol = true;
     }
 
   if (e <= 0)
@@ -188,7 +199,7 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
       switch (e)
         {
         case PCRE_ERROR_NOMATCH:
-          return -1;
+          break;
 
         case PCRE_ERROR_NOMEMORY:
           error (EXIT_TROUBLE, 0, _("memory exhausted"));
@@ -205,7 +216,6 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
           error (EXIT_TROUBLE, 0, _("internal PCRE error: %d"), e);
         }
 
-      /* NOTREACHED */
       return -1;
     }
   else
-- 
1.9.3

From f4a95dff902840826eed69fcc7205db5b3e86573 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sat, 13 Sep 2014 17:58:53 -0700
Subject: [PROPOSED PATCH 3/6] grep: use bool for boolean in grep.c

* src/grep.c (show_version, suppress_errors, only_matching)
(align_tabs, match_icase, match_words, match_lines, errseen)
(write_error_seen, is_device_mode, usable_st_size)
(file_is_binary, skipped_file, reset, fillbuf, out_quiet)
(out_line, out_byte, count_matches, no_filenames, line_buffered)
(done_on_match, exit_on_match, print_line_head, prline, grep)
(grepdirent, grepfile, grepdesc, grep_command_line_arg)
(get_nondigit_option, main): Use bool for boolean.
(print_line_head, prline): Use char for byte.
* src/grep.h: Include <stdbool.h>, and adjust decls to match
changes in grep.c.
---
 src/grep.c | 232 +++++++++++++++++++++++++++++++------------------------------
 src/grep.h |   8 ++-
 2 files changed, 124 insertions(+), 116 deletions(-)

diff --git a/src/grep.c b/src/grep.c
index 719dff1..1e0cc6d 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -65,20 +65,20 @@ static struct stat out_stat;
 /* if non-zero, display usage information and exit */
 static int show_help;
 
-/* If non-zero, print the version on standard output and exit.  */
-static int show_version;
+/* Print the version on standard output and exit.  */
+static bool show_version;
 
-/* If nonzero, suppress diagnostics for nonexistent or unreadable files.  */
-static int suppress_errors;
+/* Suppress diagnostics for nonexistent or unreadable files.  */
+static bool suppress_errors;
 
 /* If nonzero, use color markers.  */
 static int color_option;
 
-/* If nonzero, show only the part of a line matching the expression. */
-static int only_matching;
+/* Show only the part of a line matching the expression. */
+static bool only_matching;
 
 /* If nonzero, make sure first content char in a line is on a tab stop. */
-static int align_tabs;
+static bool align_tabs;
 
 /* The group separator used when context is requested. */
 static const char *group_separator = SEP_STR_GROUP;
@@ -347,9 +347,9 @@ static struct option const long_options[] =
 };
 
 /* Define flags declared in grep.h. */
-int match_icase;
-int match_words;
-int match_lines;
+bool match_icase;
+bool match_words;
+bool match_lines;
 unsigned char eolbyte;
 
 static char const *matcher;
@@ -358,8 +358,8 @@ static char const *matcher;
 /* The input file name, or (if standard input) "-" or a --label argument.  */
 static char const *filename;
 static size_t filename_prefix_len;
-static int errseen;
-static int write_error_seen;
+static bool errseen;
+static bool write_error_seen;
 
 enum directories_type
   {
@@ -392,22 +392,22 @@ static enum
     SKIP_DEVICES
   } devices = READ_COMMAND_LINE_DEVICES;
 
-static int grepfile (int, char const *, int, int);
-static int grepdesc (int, int);
+static bool grepfile (int, char const *, bool, bool);
+static bool grepdesc (int, bool);
 
 static void dos_binary (void);
 static void dos_unix_byte_offsets (void);
 static size_t undossify_input (char *, size_t);
 
-static int
+static bool
 is_device_mode (mode_t m)
 {
   return S_ISCHR (m) || S_ISBLK (m) || S_ISSOCK (m) || S_ISFIFO (m);
 }
 
-/* Return nonzero if ST->st_size is defined.  Assume the file is not a
+/* Return true if ST->st_size is defined.  Assume the file is not a
    symbolic link.  */
-static int
+static bool
 usable_st_size (struct stat const *st)
 {
   return S_ISREG (st->st_mode) || S_TYPEISSHM (st) || S_TYPEISTMO (st);
@@ -425,7 +425,7 @@ suppressible_error (char const *mesg, int errnum)
 {
   if (! suppress_errors)
     error (0, errnum, "%s", mesg);
-  errseen = 1;
+  errseen = true;
 }
 
 /* If there has already been a write error, don't bother closing
@@ -437,10 +437,10 @@ clean_up_stdout (void)
     close_stdout ();
 }
 
-/* Return 1 if a file is known to be binary for the purpose of 'grep'.
+/* Return true if a file is known to be binary for the purpose of 'grep'.
    BUF, of size BUFSIZE, is the initial buffer read from the file with
    descriptor FD and status ST.  */
-static int
+static bool
 file_is_binary (char const *buf, size_t bufsize, int fd, struct stat const *st)
 {
   #ifndef SEEK_HOLE
@@ -455,7 +455,7 @@ file_is_binary (char const *buf, size_t bufsize, int fd, struct stat const *st)
   /* If the initial buffer contains a null byte, guess that the file
      is binary.  */
   if (memchr (buf, '\0', bufsize))
-    return 1;
+    return true;
 
   /* If the file has holes, it must contain a null byte somewhere.  */
   if (SEEK_HOLE != SEEK_END && usable_st_size (st))
@@ -465,7 +465,7 @@ file_is_binary (char const *buf, size_t bufsize, int fd, struct stat const *st)
         {
           cur = lseek (fd, 0, SEEK_CUR);
           if (cur < 0)
-            return 0;
+            return false;
         }
 
       /* Look for a hole after the current location.  */
@@ -475,12 +475,12 @@ file_is_binary (char const *buf, size_t bufsize, int fd, struct stat const *st)
           if (lseek (fd, cur, SEEK_SET) < 0)
             suppressible_error (filename, errno);
           if (hole_start < st->st_size)
-            return 1;
+            return true;
         }
     }
 
   /* Guess that the file does not contain binary data.  */
-  return 0;
+  return false;
 }
 
 /* Convert STR to a nonnegative integer, storing the result in *OUT.
@@ -503,11 +503,11 @@ context_length_arg (char const *str, intmax_t *out)
     }
 }
 
-/* Return nonzero if the file with NAME should be skipped.
-   If COMMAND_LINE is nonzero, it is a command-line argument.
-   If IS_DIR is nonzero, it is a directory.  */
-static int
-skipped_file (char const *name, int command_line, int is_dir)
+/* Return true if the file with NAME should be skipped.
+   If COMMAND_LINE, it is a command-line argument.
+   If IS_DIR, it is a directory.  */
+static bool
+skipped_file (char const *name, bool command_line, bool is_dir)
 {
   return (is_dir
           ? (directories == SKIP_DIRECTORIES
@@ -541,9 +541,9 @@ static off_t after_last_match;      /* Pointer after last matching line that
    ? (val) \
    : (val) + ((alignment) - (size_t) (val) % (alignment)))
 
-/* Reset the buffer for a new file, returning zero if we should skip it.
+/* Reset the buffer for a new file, returning false if we should skip it.
    Initialize on the first time through. */
-static int
+static bool
 reset (int fd, struct stat const *st)
 {
   if (! pagesize)
@@ -569,22 +569,22 @@ reset (int fd, struct stat const *st)
           if (bufoffset < 0)
             {
               suppressible_error (_("lseek failed"), errno);
-              return 0;
+              return false;
             }
         }
     }
-  return 1;
+  return true;
 }
 
 /* Read new stuff into the buffer, saving the specified
    amount of old stuff.  When we're done, 'bufbeg' points
    to the beginning of the buffer contents, and 'buflim'
-   points just after the end.  Return zero if there's an error.  */
-static int
+   points just after the end.  Return false if there's an error.  */
+static bool
 fillbuf (size_t save, struct stat const *st)
 {
   size_t fillsize;
-  int cc = 1;
+  bool cc = true;
   char *readbuf;
   size_t readsize;
 
@@ -646,7 +646,10 @@ fillbuf (size_t save, struct stat const *st)
 
   fillsize = safe_read (bufdesc, readbuf, readsize);
   if (fillsize == SAFE_READ_ERROR)
-    fillsize = cc = 0;
+    {
+      fillsize = 0;
+      cc = false;
+    }
   bufoffset += fillsize;
   fillsize = undossify_input (readbuf, fillsize);
   buflim = readbuf + fillsize;
@@ -662,20 +665,19 @@ static enum
 } binary_files;                /* How to handle binary files.  */
 
 static int filename_mask;      /* If zero, output nulls after filenames.  */
-static int out_quiet;          /* Suppress all normal output. */
+static bool out_quiet;         /* Suppress all normal output. */
 static bool out_invert;                /* Print nonmatching stuff. */
 static int out_file;           /* Print filenames. */
-static int out_line;           /* Print line numbers. */
-static int out_byte;           /* Print byte offsets. */
+static bool out_line;          /* Print line numbers. */
+static bool out_byte;          /* Print byte offsets. */
 static intmax_t out_before;    /* Lines of leading context. */
 static intmax_t out_after;     /* Lines of trailing context. */
-static int count_matches;      /* Count matching lines.  */
+static bool count_matches;     /* Count matching lines.  */
 static int list_files;         /* List matching files.  */
-static int no_filenames;       /* Suppress file names.  */
+static bool no_filenames;      /* Suppress file names.  */
 static intmax_t max_count;     /* Stop after outputting this many
                                    lines from an input file.  */
-static int line_buffered;       /* If nonzero, use line buffering, i.e.
-                                   fflush everyline out.  */
+static bool line_buffered;     /* Use line buffering.  */
 static char *label = NULL;      /* Fake filename for stdin */
 
 
@@ -689,8 +691,8 @@ static uintmax_t totalnl;   /* Total newline count before lastnl. */
 static intmax_t outleft;       /* Maximum number of lines to be output.  */
 static intmax_t pending;       /* Pending lines of output.
                                    Always kept 0 if out_quiet is true.  */
-static int done_on_match;      /* Stop scanning file on first match.  */
-static int exit_on_match;      /* Exit on first match.  */
+static bool done_on_match;     /* Stop scanning file on first match.  */
+static bool exit_on_match;     /* Exit on first match.  */
 
 #include "dosbuf.c"
 
@@ -768,15 +770,15 @@ print_offset (uintmax_t pos, int min_width, const char *color)
 
 /* Print a whole line head (filename, line, byte).  */
 static void
-print_line_head (char const *beg, char const *lim, int sep)
+print_line_head (char const *beg, char const *lim, char sep)
 {
-  int pending_sep = 0;
+  bool pending_sep = false;
 
   if (out_file)
     {
       print_filename ();
       if (filename_mask)
-        pending_sep = 1;
+        pending_sep = true;
       else
         fputc (0, stdout);
     }
@@ -792,7 +794,7 @@ print_line_head (char const *beg, char const *lim, int sep)
       if (pending_sep)
         print_sep (sep);
       print_offset (totalnl, 4, line_num_color);
-      pending_sep = 1;
+      pending_sep = true;
     }
 
   if (out_byte)
@@ -802,7 +804,7 @@ print_line_head (char const *beg, char const *lim, int sep)
       if (pending_sep)
         print_sep (sep);
       print_offset (pos, 6, byte_num_color);
-      pending_sep = 1;
+      pending_sep = true;
     }
 
   if (pending_sep)
@@ -903,9 +905,9 @@ print_line_tail (const char *beg, const char *lim, const char *line_color)
 }
 
 static void
-prline (char const *beg, char const *lim, int sep)
+prline (char const *beg, char const *lim, char sep)
 {
-  int matching;
+  bool matching;
   const char *line_color;
   const char *match_color;
 
@@ -945,7 +947,7 @@ prline (char const *beg, char const *lim, int sep)
 
   if (ferror (stdout))
     {
-      write_error_seen = 1;
+      write_error_seen = true;
       error (EXIT_TROUBLE, 0, _("write error"));
     }
 
@@ -1098,12 +1100,14 @@ static intmax_t
 grep (int fd, struct stat const *st)
 {
   intmax_t nlines, i;
-  int not_text;
+  bool not_text;
   size_t residue, save;
   char oldc;
   char *beg;
   char *lim;
   char eol = eolbyte;
+  bool done_on_match_0 = done_on_match;
+  bool out_quiet_0 = out_quiet;
 
   if (! reset (fd, st))
     return 0;
@@ -1130,8 +1134,8 @@ grep (int fd, struct stat const *st)
               && file_is_binary (bufbeg, buflim - bufbeg, fd, st));
   if (not_text && binary_files == WITHOUT_MATCH_BINARY_FILES)
     return 0;
-  done_on_match += not_text;
-  out_quiet += not_text;
+  done_on_match |= not_text;
+  out_quiet |= not_text;
 
   for (;;)
     {
@@ -1208,17 +1212,18 @@ grep (int fd, struct stat const *st)
     }
 
  finish_grep:
-  done_on_match -= not_text;
-  out_quiet -= not_text;
+  done_on_match = done_on_match_0;
+  out_quiet = out_quiet_0;
   if ((not_text & ~out_quiet) && nlines != 0)
     printf (_("Binary file %s matches\n"), filename);
   return nlines;
 }
 
-static int
-grepdirent (FTS *fts, FTSENT *ent, int command_line)
+static bool
+grepdirent (FTS *fts, FTSENT *ent, bool command_line)
 {
-  int follow, dirdesc;
+  bool follow;
+  int dirdesc;
   struct stat *st = ent->fts_statp;
   command_line &= ent->fts_level == FTS_ROOTLEVEL;
 
@@ -1226,7 +1231,7 @@ grepdirent (FTS *fts, FTSENT *ent, int command_line)
     {
       if (directories == RECURSE_DIRECTORIES && command_line)
         out_file &= ~ (2 * !no_filenames);
-      return 1;
+      return true;
     }
 
   if (skipped_file (ent->fts_name, command_line,
@@ -1234,7 +1239,7 @@ grepdirent (FTS *fts, FTSENT *ent, int command_line)
                      || ent->fts_info == FTS_DNR)))
     {
       fts_set (fts, ent, FTS_SKIP);
-      return 1;
+      return true;
     }
 
   filename = ent->fts_path + filename_prefix_len;
@@ -1247,7 +1252,7 @@ grepdirent (FTS *fts, FTSENT *ent, int command_line)
       if (directories == RECURSE_DIRECTORIES)
         {
           out_file |= 2 * !no_filenames;
-          return 1;
+          return true;
         }
       fts_set (fts, ent, FTS_SKIP);
       break;
@@ -1256,13 +1261,13 @@ grepdirent (FTS *fts, FTSENT *ent, int command_line)
       if (!suppress_errors)
         error (0, 0, _("warning: %s: %s"), filename,
                _("recursive directory loop"));
-      return 1;
+      return true;
 
     case FTS_DNR:
     case FTS_ERR:
     case FTS_NS:
       suppressible_error (filename, ent->fts_errno);
-      return 1;
+      return true;
 
     case FTS_DEFAULT:
     case FTS_NSOK:
@@ -1279,12 +1284,12 @@ grepdirent (FTS *fts, FTSENT *ent, int command_line)
               if (fstatat (fts->fts_cwd_fd, ent->fts_accpath, &st1, flag) != 0)
                 {
                   suppressible_error (filename, errno);
-                  return 1;
+                  return true;
                 }
               st = &st1;
             }
           if (is_device_mode (st->st_mode))
-            return 1;
+            return true;
         }
       break;
 
@@ -1294,7 +1299,7 @@ grepdirent (FTS *fts, FTSENT *ent, int command_line)
 
     case FTS_SL:
     case FTS_W:
-      return 1;
+      return true;
 
     default:
       abort ();
@@ -1306,24 +1311,24 @@ grepdirent (FTS *fts, FTSENT *ent, int command_line)
   return grepfile (dirdesc, ent->fts_accpath, follow, command_line);
 }
 
-static int
-grepfile (int dirdesc, char const *name, int follow, int command_line)
+static bool
+grepfile (int dirdesc, char const *name, bool follow, bool command_line)
 {
   int desc = openat_safer (dirdesc, name, O_RDONLY | (follow ? 0 : O_NOFOLLOW));
   if (desc < 0)
     {
       if (follow || (errno != ELOOP && errno != EMLINK))
         suppressible_error (filename, errno);
-      return 1;
+      return true;
     }
   return grepdesc (desc, command_line);
 }
 
-static int
-grepdesc (int desc, int command_line)
+static bool
+grepdesc (int desc, bool command_line)
 {
   intmax_t count;
-  int status = 1;
+  bool status = true;
   struct stat st;
 
   /* Get the file status, possibly for the second time.  This catches
@@ -1339,7 +1344,7 @@ grepdesc (int desc, int command_line)
     }
 
   if (desc != STDIN_FILENO && command_line
-      && skipped_file (filename, 1, S_ISDIR (st.st_mode)))
+      && skipped_file (filename, true, S_ISDIR (st.st_mode) != 0))
     goto closeout;
 
   if (desc != STDIN_FILENO
@@ -1404,7 +1409,7 @@ grepdesc (int desc, int command_line)
     {
       if (! suppress_errors)
         error (0, 0, _("input file %s is also the output"), quote (filename));
-      errseen = 1;
+      errseen = true;
       goto closeout;
     }
 
@@ -1456,18 +1461,18 @@ grepdesc (int desc, int command_line)
   return status;
 }
 
-static int
+static bool
 grep_command_line_arg (char const *arg)
 {
   if (STREQ (arg, "-"))
     {
       filename = label ? label : _("(standard input)");
-      return grepdesc (STDIN_FILENO, 1);
+      return grepdesc (STDIN_FILENO, true);
     }
   else
     {
       filename = arg;
-      return grepfile (AT_FDCWD, arg, 1, 1);
+      return grepfile (AT_FDCWD, arg, true, true);
     }
 }
 
@@ -1721,14 +1726,15 @@ static int
 get_nondigit_option (int argc, char *const *argv, intmax_t *default_context)
 {
   static int prev_digit_optind = -1;
-  int this_digit_optind, was_digit;
+  int this_digit_optind;
+  bool was_digit;
   char buf[INT_BUFSIZE_BOUND (intmax_t) + 4];
   char *p = buf;
   int opt;
 
-  was_digit = 0;
+  was_digit = false;
   this_digit_optind = optind;
-  while (1)
+  while (true)
     {
       opt = getopt_long (argc, (char **) argv, short_options,
                          long_options, NULL);
@@ -1758,7 +1764,7 @@ get_nondigit_option (int argc, char *const *argv, intmax_t *default_context)
         }
       *p++ = opt;
 
-      was_digit = 1;
+      was_digit = true;
       prev_digit_optind = this_digit_optind;
       this_digit_optind = optind;
     }
@@ -1890,9 +1896,9 @@ main (int argc, char **argv)
 {
   char *keys;
   size_t keycc, oldcc, keyalloc;
-  int with_filenames;
+  bool with_filenames, ok;
   size_t cc;
-  int opt, status, prepended;
+  int opt, prepended;
   int prev_optind, last_recursive;
   int fread_errno;
   intmax_t default_context;
@@ -1904,7 +1910,7 @@ main (int argc, char **argv)
 
   keys = NULL;
   keycc = 0;
-  with_filenames = 0;
+  with_filenames = false;
   eolbyte = '\n';
   filename_mask = ~0;
 
@@ -1915,7 +1921,7 @@ main (int argc, char **argv)
   /* Default before/after context: changed by -C/-NUM options */
   default_context = -1;
   /* Changed by -o option */
-  only_matching = 0;
+  only_matching = false;
 
   /* Internationalization. */
 #if defined HAVE_SETLOCALE
@@ -1987,8 +1993,8 @@ main (int argc, char **argv)
         break;
 
       case 'H':
-        with_filenames = 1;
-        no_filenames = 0;
+        with_filenames = true;
+        no_filenames = false;
         break;
 
       case 'I':
@@ -1996,7 +2002,7 @@ main (int argc, char **argv)
         break;
 
       case 'T':
-        align_tabs = 1;
+        align_tabs = true;
         break;
 
       case 'U':
@@ -2008,7 +2014,7 @@ main (int argc, char **argv)
         break;
 
       case 'V':
-        show_version = 1;
+        show_version = true;
         break;
 
       case 'a':
@@ -2016,11 +2022,11 @@ main (int argc, char **argv)
         break;
 
       case 'b':
-        out_byte = 1;
+        out_byte = true;
         break;
 
       case 'c':
-        count_matches = 1;
+        count_matches = true;
         break;
 
       case 'd':
@@ -2063,13 +2069,13 @@ main (int argc, char **argv)
         break;
 
       case 'h':
-        with_filenames = 0;
-        no_filenames = 1;
+        with_filenames = false;
+        no_filenames = true;
         break;
 
       case 'i':
       case 'y':                        /* For old-timers . . . */
-        match_icase = 1;
+        match_icase = true;
         break;
 
       case 'L':
@@ -2095,15 +2101,15 @@ main (int argc, char **argv)
         break;
 
       case 'n':
-        out_line = 1;
+        out_line = true;
         break;
 
       case 'o':
-        only_matching = 1;
+        only_matching = true;
         break;
 
       case 'q':
-        exit_on_match = 1;
+        exit_on_match = true;
         exit_failure = 0;
         break;
 
@@ -2116,7 +2122,7 @@ main (int argc, char **argv)
         break;
 
       case 's':
-        suppress_errors = 1;
+        suppress_errors = true;
         break;
 
       case 'v':
@@ -2124,11 +2130,11 @@ main (int argc, char **argv)
         break;
 
       case 'w':
-        match_words = 1;
+        match_words = true;
         break;
 
       case 'x':
-        match_lines = 1;
+        match_lines = true;
         break;
 
       case 'Z':
@@ -2199,7 +2205,7 @@ main (int argc, char **argv)
         break;
 
       case LINE_BUFFERED_OPTION:
-        line_buffered = 1;
+        line_buffered = true;
         break;
 
       case LABEL_OPTION:
@@ -2226,8 +2232,8 @@ main (int argc, char **argv)
     list_files = 0;
   if (exit_on_match | list_files)
     {
-      count_matches = 0;
-      done_on_match = 1;
+      count_matches = false;
+      done_on_match = true;
     }
   out_quiet = count_matches | done_on_match;
 
@@ -2267,7 +2273,7 @@ main (int argc, char **argv)
         {
           /* No keys were specified (e.g. -f /dev/null).  Match nothing.  */
           out_invert ^= true;
-          match_lines = match_words = 0;
+          match_lines = match_words = false;
         }
       else
         /* Strip trailing newline. */
@@ -2323,21 +2329,21 @@ main (int argc, char **argv)
 
   if (optind < argc)
     {
-      status = 1;
+      ok = true;
       do
-        status &= grep_command_line_arg (argv[optind]);
+        ok &= grep_command_line_arg (argv[optind]);
       while (++optind < argc);
     }
   else if (directories == RECURSE_DIRECTORIES && prepended < last_recursive)
     {
       /* Grep through ".", omitting leading "./" from diagnostics.  */
       filename_prefix_len = 2;
-      status = grep_command_line_arg (".");
+      ok = grep_command_line_arg (".");
     }
   else
-    status = grep_command_line_arg ("-");
+    ok = grep_command_line_arg ("-");
 
   /* We register via atexit() to test stdout.  */
-  exit (errseen ? EXIT_TROUBLE : status);
+  exit (errseen ? EXIT_TROUBLE : ok);
 }
 /* vim:set shiftwidth=2: */
diff --git a/src/grep.h b/src/grep.h
index 4935872..5496eb2 100644
--- a/src/grep.h
+++ b/src/grep.h
@@ -20,11 +20,13 @@
 #ifndef GREP_GREP_H
 #define GREP_GREP_H 1
 
+#include <stdbool.h>
+
 /* The following flags are exported from grep for the matchers
    to look at. */
-extern int match_icase;                /* -i */
-extern int match_words;                /* -w */
-extern int match_lines;                /* -x */
+extern bool match_icase;       /* -i */
+extern bool match_words;       /* -w */
+extern bool match_lines;       /* -x */
 extern unsigned char eolbyte;  /* -z */
 
 #endif
-- 
1.9.3

From f784a73a01b823109d660aa8d256535623e98971 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sun, 14 Sep 2014 13:49:18 -0700
Subject: [PROPOSED PATCH 4/6] grep: treat a file as binary if its prefix
 contains encoding errors

* NEWS:
* doc/grep.texi (File and Directory Selection):
Document this.
* src/grep.c (buffer_encoding, buffer_textbin): New functions.
(file_textbin): Rename from file_is_binary.  Now returns 3-way value.
All callers changed.
(file_textbin, grep): Check the input more carefully for text vs
binary data.
(contains_encoding_error): Remove; use replaced by buffer_encoding.
* tests/backref-multibyte-slow:
* tests/high-bit-range:
* tests/invalid-multibyte-infloop:
Use -a, since the input is now considered to be binary.
* tests/invalid-multibyte-infloop: Add a check for new behavior.
---
 NEWS                            |   4 ++
 doc/grep.texi                   |   3 +-
 src/grep.c                      | 126 +++++++++++++++++++++++++++-------------
 tests/backref-multibyte-slow    |   2 +-
 tests/high-bit-range            |   2 +-
 tests/invalid-multibyte-infloop |  14 ++++-
 6 files changed, 106 insertions(+), 45 deletions(-)

diff --git a/NEWS b/NEWS
index 36bb48f..9377d7d 100644
--- a/NEWS
+++ b/NEWS
@@ -6,6 +6,10 @@ GNU grep NEWS                                    -*- outline -*-
 
   Performance has improved for very long strings in patterns.
 
+  If a file contains data improperly encoded for the current locale,
+  and this is discovered before any of the file's contents are output,
+  grep now treats the file as binary.
+
   grep -P no longer reports an error and exits when given invalid UTF-8 data.
   Instead, it considers the data to be non-matching.
 
diff --git a/doc/grep.texi b/doc/grep.texi
index c8e4acd..14bd69e 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -592,7 +592,8 @@ this is equivalent to the @samp{--binary-files=text} option.
 @item --binary-files=@var{type}
 @opindex --binary-files
 @cindex binary files
-If a file's allocation metadata or its first few bytes
+If a file's allocation metadata,
+or its data read before a line is selected for output,
 indicate that the file contains binary data,
 assume that the file is of type @var{type}.
 By default, @var{type} is @samp{binary},
diff --git a/src/grep.c b/src/grep.c
index 1e0cc6d..ccba1b6 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -437,50 +437,74 @@ clean_up_stdout (void)
     close_stdout ();
 }
 
-/* Return true if a file is known to be binary for the purpose of 'grep'.
+/* Return 1 if BUF (of size SIZE) contains text, -1 if it contains
+   binary data, and 0 if the answer depends on what comes immediately
+   after BUF.  */
+static int
+buffer_textbin (char const *buf, size_t size)
+{
+  mbstate_t mbs = { 0 };
+  size_t charlen;
+  char badbyte = eolbyte ? '\0' : '\200';
+  char const *p;
+
+  for (p = buf; p < buf + size; p += charlen)
+    {
+      if (*p == badbyte)
+        return -1;
+      charlen = mbrlen (p, buf + size - p, &mbs);
+      if ((size_t) -2 <= charlen)
+        return charlen == (size_t) -2 ? 0 : -1;
+      charlen += !charlen;
+    }
+
+  return 1;
+}
+
+/* Return 1 if a file is known to be text for the purpose of 'grep'.
+   Return -1 if it is known to be binary, 0 if unknown.
    BUF, of size BUFSIZE, is the initial buffer read from the file with
    descriptor FD and status ST.  */
-static bool
-file_is_binary (char const *buf, size_t bufsize, int fd, struct stat const *st)
+static int
+file_textbin (char const *buf, size_t bufsize, int fd, struct stat const *st)
 {
   #ifndef SEEK_HOLE
   enum { SEEK_HOLE = SEEK_END };
   #endif
 
-  /* If -z, test only whether the initial buffer contains '\200';
-     knowing about holes won't help.  */
-  if (! eolbyte)
-    return memchr (buf, '\200', bufsize) != 0;
+  int textbin = buffer_textbin (buf, bufsize);
+  if (textbin < 0)
+    return textbin;
 
-  /* If the initial buffer contains a null byte, guess that the file
-     is binary.  */
-  if (memchr (buf, '\0', bufsize))
-    return true;
-
-  /* If the file has holes, it must contain a null byte somewhere.  */
-  if (SEEK_HOLE != SEEK_END && usable_st_size (st))
+  if (usable_st_size (st))
     {
-      off_t cur = bufsize;
-      if (O_BINARY || fd == STDIN_FILENO)
-        {
-          cur = lseek (fd, 0, SEEK_CUR);
-          if (cur < 0)
-            return false;
-        }
+      if (st->st_size <= bufsize)
+        return 2 * textbin - 1;
 
-      /* Look for a hole after the current location.  */
-      off_t hole_start = lseek (fd, cur, SEEK_HOLE);
-      if (0 <= hole_start)
+      /* If the file has holes, it must contain a null byte somewhere.  */
+      if (SEEK_HOLE != SEEK_END && eolbyte)
         {
-          if (lseek (fd, cur, SEEK_SET) < 0)
-            suppressible_error (filename, errno);
-          if (hole_start < st->st_size)
-            return true;
+          off_t cur = bufsize;
+          if (O_BINARY || fd == STDIN_FILENO)
+            {
+              cur = lseek (fd, 0, SEEK_CUR);
+              if (cur < 0)
+                return 0;
+            }
+
+          /* Look for a hole after the current location.  */
+          off_t hole_start = lseek (fd, cur, SEEK_HOLE);
+          if (0 <= hole_start)
+            {
+              if (lseek (fd, cur, SEEK_SET) < 0)
+                suppressible_error (filename, errno);
+              if (hole_start < st->st_size)
+                return -1;
+            }
         }
     }
 
-  /* Guess that the file does not contain binary data.  */
-  return false;
+  return 0;
 }
 
 /* Convert STR to a nonnegative integer, storing the result in *OUT.
@@ -1100,7 +1124,7 @@ static intmax_t
 grep (int fd, struct stat const *st)
 {
   intmax_t nlines, i;
-  bool not_text;
+  int textbin;
   size_t residue, save;
   char oldc;
   char *beg;
@@ -1129,13 +1153,18 @@ grep (int fd, struct stat const *st)
       return 0;
     }
 
-  not_text = (((binary_files == BINARY_BINARY_FILES && !out_quiet)
-               || binary_files == WITHOUT_MATCH_BINARY_FILES)
-              && file_is_binary (bufbeg, buflim - bufbeg, fd, st));
-  if (not_text && binary_files == WITHOUT_MATCH_BINARY_FILES)
-    return 0;
-  done_on_match |= not_text;
-  out_quiet |= not_text;
+  if (binary_files == TEXT_BINARY_FILES)
+    textbin = 1;
+  else
+    {
+      textbin = file_textbin (bufbeg, buflim - bufbeg, fd, st);
+      if (textbin < 0)
+        {
+          if (binary_files == WITHOUT_MATCH_BINARY_FILES)
+            return 0;
+          done_on_match = out_quiet = true;
+        }
+    }
 
   for (;;)
     {
@@ -1187,8 +1216,13 @@ grep (int fd, struct stat const *st)
         }
 
       /* Detect whether leading context is adjacent to previous output.  */
-      if (beg != lastout)
-        lastout = 0;
+      if (lastout)
+        {
+          if (!textbin)
+            textbin = 1;
+          if (beg != lastout)
+            lastout = 0;
+        }
 
       /* Handle some details and read more data to scan.  */
       save = residue + lim - beg;
@@ -1201,6 +1235,16 @@ grep (int fd, struct stat const *st)
           suppressible_error (filename, errno);
           goto finish_grep;
         }
+
+      /* If the file's textbin has not been determined yet, assume
+         it's binary if the next input buffer suggests so.  */
+      if (! textbin && buffer_textbin (bufbeg, buflim - bufbeg) < 0)
+        {
+          textbin = -1;
+          if (binary_files == WITHOUT_MATCH_BINARY_FILES)
+            return 0;
+          done_on_match = out_quiet = true;
+        }
     }
   if (residue)
     {
@@ -1214,7 +1258,7 @@ grep (int fd, struct stat const *st)
  finish_grep:
   done_on_match = done_on_match_0;
   out_quiet = out_quiet_0;
-  if ((not_text & ~out_quiet) && nlines != 0)
+  if (textbin < 0 && !out_quiet && nlines != 0)
     printf (_("Binary file %s matches\n"), filename);
   return nlines;
 }
diff --git a/tests/backref-multibyte-slow b/tests/backref-multibyte-slow
index ffebb6b..d447a4a 100755
--- a/tests/backref-multibyte-slow
+++ b/tests/backref-multibyte-slow
@@ -21,7 +21,7 @@ max_seconds=$(LC_ALL=C perl -le 'use Time::HiRes qw(time); my $s = time();
 
 for LOC in en_US.UTF-8; do
   out=out-$LOC
-  LC_ALL=$LOC timeout ${max_seconds}s grep -E '^([a-z]).\1$' in > $out 2>&1
+  LC_ALL=$LOC timeout ${max_seconds}s grep -aE '^([a-z]).\1$' in > $out 2>&1
   test $? = 0 || fail=1
   compare $out in || fail=1
 done
diff --git a/tests/high-bit-range b/tests/high-bit-range
index 74b6e65..76c3310 100755
--- a/tests/high-bit-range
+++ b/tests/high-bit-range
@@ -21,7 +21,7 @@
 fail=0
 
 printf '\201\n' > in || framework_failure_
-grep "$(printf '[\201]')" in > out || fail=1
+grep -a "$(printf '[\201]')" in > out || fail=1
 
 compare out in || fail=1
 
diff --git a/tests/invalid-multibyte-infloop b/tests/invalid-multibyte-infloop
index b28bc53..d7c6165 100755
--- a/tests/invalid-multibyte-infloop
+++ b/tests/invalid-multibyte-infloop
@@ -14,7 +14,7 @@ encode AA > input
 fail=0
 
 # Before 2.15, this would infloop.
-LC_ALL=en_US.UTF-8 timeout 3 grep -F $(encode A) input > out
+LC_ALL=en_US.UTF-8 timeout 3 grep -aF $(encode A) input > out
 status=$?
 if test $status -eq 0; then
   compare input out
@@ -24,4 +24,16 @@ else
   test $status -eq 2
 fi || fail=1
 
+echo 'Binary file input matches' >binary-file-matches
+
+LC_ALL=en_US.UTF-8 timeout 3 grep -F $(encode A) input > out
+status=$?
+if test $status -eq 0; then
+  compare binary-file-matches out
+elif test $status -eq 1; then
+  compare_dev_null_ /dev/null out
+else
+  test $status -eq 2
+fi || fail=1
+
 Exit $fail
-- 
1.9.3

From cc87d585025a2ff310b6c55097480d6e953557bd Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sun, 14 Sep 2014 14:52:56 -0700
Subject: [PROPOSED PATCH 5/6] grep: improve performance for older glibc

glibc has a bug where mbrlen and mbrtowc mishandle length-0 inputs.
Working around it in gnulib slows grep down, so disable the tests for it
and make sure grep works even if the bug is present.
* bootstrap.conf (avoided_gnulib_modules): Add mbrtowc-tests.
* configure.ac (gl_cv_func_mbrtowc_empty_input): Assume yes.
* src/searchutils.c (mb_next_wc): Don't invoke mbrtowc on empty input.
---
 bootstrap.conf    | 1 +
 configure.ac      | 5 +++++
 src/searchutils.c | 3 ++-
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/bootstrap.conf b/bootstrap.conf
index d8171f5..50c0aab 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -17,6 +17,7 @@
 
 avoided_gnulib_modules='
   --avoid=lock-tests
+  --avoid=mbrtowc-tests
 '
 
 # gnulib modules used by this package.
diff --git a/configure.ac b/configure.ac
index 3315855..4d069b8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -83,6 +83,11 @@ AC_PROG_CC
 gl_EARLY
 AC_PROG_RANLIB
 
+# grep never invokes mbrtowc or mbrlen on empty input,
+# so don't worry about this common bug,
+# as working around it would merely slow grep down.
+gl_cv_func_mbrtowc_empty_input='assume yes'
+
 dnl Checks for typedefs, structures, and compiler characteristics.
 AC_TYPE_SIZE_T
 AC_C_CONST
diff --git a/src/searchutils.c b/src/searchutils.c
index 5eb9a12..18dd584 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -285,5 +285,6 @@ mb_next_wc (char const *cur, char const *end)
 {
   wchar_t wc;
   mbstate_t mbs = { 0 };
-  return mbrtowc (&wc, cur, end - cur, &mbs) < (size_t) -2 ? wc : WEOF;
+  return (end - cur != 0 && mbrtowc (&wc, cur, end - cur, &mbs) < (size_t) -2
+          ? wc : WEOF);
 }
-- 
1.9.3

From 04c6552a279c7a8d565aae9a7ffea0b751689052 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sun, 14 Sep 2014 17:37:58 -0700
Subject: [PROPOSED PATCH 6/6] grep: use mbclen cache more effectively

* src/grep.c (buffer_textbin, contains_encoding_error):
Use mb_clen for speed.
(buffer_textbin): Bypass mb_clen in unibyte locales.
(main): Always initialize the cache, since it's sometimes used in
unibyte locales now.  Initialize it before contains_encoding_error
might be called.
* src/search.h (SEARCH_INLINE): New macro.
(mbclen_cache): Now extern decl.
(mb_clen): New inline function.
* src/searchutils.c (SEARCH_INLINE, SYSTEM_INLINE): Define.
(mbclen_cache): Now extern.
(build_mbclen_cache): Put 1 into the cache when mbrlen returns 0.
(mb_goback): Use mb_clen for speed, and rely on it returning nonzero.
* src/system.h (SYSTEM_INLINE): New macro.
(to_uchar): Use it.
---
 src/grep.c        | 38 +++++++++++++++++++++-----------------
 src/search.h      | 19 +++++++++++++++++++
 src/searchutils.c | 26 ++++++++++++++------------
 src/system.h      |  9 ++++++++-
 4 files changed, 62 insertions(+), 30 deletions(-)

diff --git a/src/grep.c b/src/grep.c
index ccba1b6..72a811e 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -443,22 +443,27 @@ clean_up_stdout (void)
 static int
 buffer_textbin (char const *buf, size_t size)
 {
-  mbstate_t mbs = { 0 };
-  size_t charlen;
   char badbyte = eolbyte ? '\0' : '\200';
-  char const *p;
 
-  for (p = buf; p < buf + size; p += charlen)
+  if (MB_CUR_MAX <= 1)
+    return memchr (buf, badbyte, size) ? -1 : 1;
+  else
     {
-      if (*p == badbyte)
-        return -1;
-      charlen = mbrlen (p, buf + size - p, &mbs);
-      if ((size_t) -2 <= charlen)
-        return charlen == (size_t) -2 ? 0 : -1;
-      charlen += !charlen;
-    }
+      mbstate_t mbs = { 0 };
+      size_t clen;
+      char const *p;
 
-  return 1;
+      for (p = buf; p < buf + size; p += clen)
+        {
+          if (*p == badbyte)
+            return -1;
+          clen = mb_clen (p, buf + size - p, &mbs);
+          if ((size_t) -2 <= clen)
+            return clen == (size_t) -2 ? 0 : -1;
+        }
+
+      return 1;
+    }
 }
 
 /* Return 1 if a file is known to be text for the purpose of 'grep'.
@@ -1887,9 +1892,9 @@ contains_encoding_error (char const *pat, size_t patlen)
   mbstate_t mbs = { 0 };
   size_t i, charlen;
 
-  for (i = 0; i < patlen; i += charlen + (charlen == 0))
+  for (i = 0; i < patlen; i += charlen)
     {
-      charlen = mbrlen (pat + i, patlen - i, &mbs);
+      charlen = mb_clen (pat + i, patlen - i, &mbs);
       if ((size_t) -2 <= charlen)
         return true;
     }
@@ -2332,6 +2337,8 @@ main (int argc, char **argv)
   else
     usage (EXIT_TROUBLE);
 
+  build_mbclen_cache ();
+
   /* If fgrep in a multibyte locale, then use grep if either
      (1) case is ignored (where grep is typically faster), or
      (2) the pattern has an encoding error (where fgrep might not work).  */
@@ -2349,9 +2356,6 @@ main (int argc, char **argv)
       execute = EGexecute;
     }
 
-  if (MB_CUR_MAX > 1)
-    build_mbclen_cache ();
-
   compile (keys, keycc);
   free (keys);
 
diff --git a/src/search.h b/src/search.h
index 14877bc..3f10a47 100644
--- a/src/search.h
+++ b/src/search.h
@@ -34,6 +34,11 @@
 #include "kwset.h"
 #include "xalloc.h"
 
+_GL_INLINE_HEADER_BEGIN
+#ifndef SEARCH_INLINE
+# define SEARCH_INLINE _GL_INLINE
+#endif
+
 /* This must be a signed type.  Each value is the difference in the size
    of a character (in bytes) induced by converting to lower case.
    The vast majority of values are 0, but a few are 1 or -1, so
@@ -45,6 +50,7 @@ extern void kwsinit (kwset_t *);
 
 extern char *mbtoupper (char const *, size_t *, mb_len_map_t **);
 extern void build_mbclen_cache (void);
+extern size_t mbclen_cache[];
 extern ptrdiff_t mb_goback (char const **, char const *, char const *);
 extern wint_t mb_prev_wc (char const *, char const *, char const *);
 extern wint_t mb_next_wc (char const *, char const *);
@@ -61,4 +67,17 @@ extern size_t Fexecute (char const *, size_t, size_t *, char const *);
 extern void Pcompile (char const *, size_t);
 extern size_t Pexecute (char const *, size_t, size_t *, char const *);
 
+/* Return the number of bytes in the character at the start of S, which
+   is of size N.  N must be positive.  MBS is the conversion state.
+   This acts like mbrlen, except it returns 1 when mbrlen would return 0,
+   and it is typically faster because of the cache.  */
+SEARCH_INLINE size_t
+mb_clen (char const *s, size_t n, mbstate_t *mbs)
+{
+  size_t len = mbclen_cache[to_uchar (*s)];
+  return len == (size_t) -2 ? mbrlen (s, n, mbs) : len;
+}
+
+_GL_INLINE_HEADER_END
+
 #endif /* GREP_SEARCH_H */
diff --git a/src/searchutils.c b/src/searchutils.c
index 18dd584..9edc785 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -17,12 +17,16 @@
    02110-1301, USA.  */
 
 #include <config.h>
-#include <assert.h>
+
+#define SEARCH_INLINE _GL_EXTERN_INLINE
+#define SYSTEM_INLINE _GL_EXTERN_INLINE
 #include "search.h"
 
+#include <assert.h>
+
 #define NCHAR (UCHAR_MAX + 1)
 
-static size_t mbclen_cache[NCHAR];
+size_t mbclen_cache[NCHAR];
 
 void
 kwsinit (kwset_t *kwset)
@@ -218,7 +222,8 @@ build_mbclen_cache (void)
       char c = i;
       unsigned char uc = i;
       mbstate_t mbs = { 0 };
-      mbclen_cache[uc] = mbrlen (&c, 1, &mbs);
+      size_t len = mbrlen (&c, 1, &mbs);
+      mbclen_cache[uc] = len ? len : 1;
     }
 }
 
@@ -244,20 +249,17 @@ mb_goback (char const **mb_start, char const *cur, char const *end)
 
   while (p < cur)
     {
-      size_t mbclen = mbclen_cache[to_uchar (*p)];
-
-      if (mbclen == (size_t) -2)
-        mbclen = mbrlen (p, end - p, &cur_state);
+      size_t clen = mb_clen (p, end - p, &cur_state);
 
-      if (! (0 < mbclen && mbclen < (size_t) -2))
+      if ((size_t) -2 <= clen)
         {
-          /* An invalid sequence, or a truncated multibyte character, or
-             a null wide character.  Treat it as a single byte character.  */
-          mbclen = 1;
+          /* An invalid sequence, or a truncated multibyte character.
+             Treat it as a single byte character.  */
+          clen = 1;
           memset (&cur_state, 0, sizeof cur_state);
         }
       p0 = p;
-      p += mbclen;
+      p += clen;
     }
 
   *mb_start = p;
diff --git a/src/system.h b/src/system.h
index 7da1d8d..bac2623 100644
--- a/src/system.h
+++ b/src/system.h
@@ -49,15 +49,22 @@ enum { EXIT_TROUBLE = 2 };
 
 #include "unlocked-io.h"
 
+_GL_INLINE_HEADER_BEGIN
+#ifndef SYSTEM_INLINE
+# define SYSTEM_INLINE _GL_INLINE
+#endif
+
 #define STREQ(a, b) (strcmp (a, b) == 0)
 
 /* Convert a possibly-signed character to an unsigned character.  This is
    a bit safer than casting to unsigned char, since it catches some type
    errors that the cast doesn't.  */
-static inline unsigned char
+SYSTEM_INLINE unsigned char
 to_uchar (char ch)
 {
   return ch;
 }
 
+_GL_INLINE_HEADER_END
+
 #endif
-- 
1.9.3

Reply via email to