bug#18454: Improve performance when -P (PCRE) is used in UTF-8 locales

Paul Eggert Thu, 25 Sep 2014 17:25:27 -0700

Thanks for looking into that. The attached patches solve those performance problems for me.

From 4ef67a272af85b46f4769d1c593178a63f6205da Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Thu, 25 Sep 2014 17:04:49 -0700
Subject: [PATCH 1/2] grep: scan for valid multibyte strings more quickly


Scan valid multibyte strings more quickly in the common case of
encodings that are upward compatible with ASCII, such as UTF-8.
You'd think there'd be a fast standard way to do this nowadays,
but nooooo....
Problem reported by Jim Meyering in: http://bugs.gnu.org/18454#56
* src/grep.c (HIBYTE): New constant.
(easy_encoding): New static var.
(init_easy_encoding, skip_easy_bytes): New functions.
(buffer_textbin): Skip easy bytes quickly.
Don't bother with mb_clen here, since skip_easy_bytes typically
captures the easy cases; just use mbrlen directly.
(buffer_textbin, file_textbin): First arg is no longer a const
pointer, since the byte past the end is now an overwritten sentinel.
(main): Call init_easy_encoding.
---
 src/grep.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 53 insertions(+), 4 deletions(-)

diff --git a/src/grep.c b/src/grep.c
index 35d3358..948e427 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -454,9 +454,56 @@ textbin_is_binary (enum textbin textbin)
   return textbin < TEXTBIN_UNKNOWN;
 }
 
+/* The high-order bit of a byte.  */
+enum { HIBYTE = 0x80 };
+
+/* True if every byte with HIBYTE off is a single-byte character.
+   UTF-8 has this property.  */
+static bool easy_encoding;
+
+static void
+init_easy_encoding (void)
+{
+  easy_encoding = true;
+  for (int i = 0; i < HIBYTE; i++)
+    easy_encoding &= mbclen_cache[i] == 1;
+}
+
+/* Skip the easy bytes in a buffer that is guaranteed to have a sentinel
+   that is not easy, and return a pointer to the first non-easy byte.
+   In easy encodings, the easy bytes all have HIBYTE off.
+   In other encodings, no byte is easy.  */
+static char const * _GL_ATTRIBUTE_PURE
+skip_easy_bytes (char const *buf)
+{
+  if (!easy_encoding)
+    return buf;
+
+  /* An unsigned type suitable for fast matching.  */
+  typedef uintmax_t uword;
+
+  /* 0x8080..., extended to be wide enough for uword.  */
+  uword hibyte_mask = (uword) -1 / UCHAR_MAX * HIBYTE;
+
+  /* Search a byte at a time until the pointer is aligned, then a
+     uword at a time until a match is found, then a byte at a time to
+     identify the exact byte.  The uword search may go slightly past
+     the buffer end, but that's benign.  */
+  char const *p;
+  uword const *s;
+  for (p = buf; (uintptr_t) p % sizeof (uword) != 0; p++)
+    if (*p & HIBYTE)
+      return p;
+  for (s = (uword const *) p; ! (*s & hibyte_mask); s++)
+    continue;
+  for (p = (char const *) s; ! (*p & HIBYTE); p++)
+    continue;
+  return p;
+}
+
 /* Return the text type of data in BUF, of size SIZE.  */
 static enum textbin
-buffer_textbin (char const *buf, size_t size)
+buffer_textbin (char *buf, size_t size)
 {
   if (eolbyte && memchr (buf, '\0', size))
     return TEXTBIN_BINARY;
@@ -467,9 +514,10 @@ buffer_textbin (char const *buf, size_t size)
       size_t clen;
       char const *p;
 
-      for (p = buf; p < buf + size; p += clen)
+      buf[size] = -1;
+      for (p = buf; (p = skip_easy_bytes (p)) < buf + size; p += clen)
         {
-          clen = mb_clen (p, buf + size - p, &mbs);
+          clen = mbrlen (p, buf + size - p, &mbs);
           if ((size_t) -2 <= clen)
             return clen == (size_t) -2 ? TEXTBIN_UNKNOWN : TEXTBIN_BINARY;
         }
@@ -481,7 +529,7 @@ buffer_textbin (char const *buf, size_t size)
 /* Return the text type of a file.  BUF, of size BUFSIZE, is the initial
    buffer read from the file with descriptor FD and status ST.  */
 static enum textbin
-file_textbin (char const *buf, size_t bufsize, int fd, struct stat const *st)
+file_textbin (char *buf, size_t bufsize, int fd, struct stat const *st)
 {
   enum textbin textbin = buffer_textbin (buf, bufsize);
   if (textbin_is_binary (textbin))
@@ -2417,6 +2465,7 @@ main (int argc, char **argv)
     usage (EXIT_TROUBLE);
 
   build_mbclen_cache ();
+  init_easy_encoding ();
 
   /* If fgrep in a multibyte locale, then use grep if either
      (1) case is ignored (where grep is typically faster), or
-- 
1.9.3

From 466ca44b0b50907e47ef6f1b4e1283e32d667f37 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Thu, 25 Sep 2014 17:14:56 -0700
Subject: [PATCH 2/2] grep: don't check extensively for invalid prefix bytes
 unless -P

Problem reported by Jim Meyering in: http://bugs.gnu.org/18454#56
* src/grep.c (grep): After the first buffer is checked, leave the
file-type checker in TEXTBIN_UNKNOWN state only when -P is used.
Only the -P matcher has performance problems with checking binary
data that make it worthwhile to check every prefix input byte so
the -P matcher's TEXTBIN_UNKNOWN optimizations can come into play.
Other matchers can simply check the data directly, and using
TEXTBIN_UNKNOWN with them slows 'grep' down for no benefit.
---
 src/grep.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/grep.c b/src/grep.c
index 948e427..3a8d9f5 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -1288,6 +1288,8 @@ grep (int fd, struct stat const *st)
           nul_zapper = eol;
           skip_nuls = skip_empty_lines;
         }
+      else if (execute != Pexecute)
+        textbin = TEXTBIN_TEXT;
     }
 
   for (;;)
-- 
1.9.3

bug#18454: Improve performance when -P (PCRE) is used in UTF-8 locales

Reply via email to