Thanks for looking into that. The attached patches solve those
performance problems for me.
From 4ef67a272af85b46f4769d1c593178a63f6205da Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Thu, 25 Sep 2014 17:04:49 -0700
Subject: [PATCH 1/2] grep: scan for valid multibyte strings more quickly
Scan valid multibyte strings more quickly in the common case of
encodings that are upward compatible with ASCII, such as UTF-8.
You'd think there'd be a fast standard way to do this nowadays,
but nooooo....
Problem reported by Jim Meyering in: http://bugs.gnu.org/18454#56
* src/grep.c (HIBYTE): New constant.
(easy_encoding): New static var.
(init_easy_encoding, skip_easy_bytes): New functions.
(buffer_textbin): Skip easy bytes quickly.
Don't bother with mb_clen here, since skip_easy_bytes typically
captures the easy cases; just use mbrlen directly.
(buffer_textbin, file_textbin): First arg is no longer a pointer to
const, since the byte past the end is now an overwritten sentinel.
(main): Call init_easy_encoding.
---
src/grep.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 53 insertions(+), 4 deletions(-)
diff --git a/src/grep.c b/src/grep.c
index 35d3358..948e427 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -454,9 +454,56 @@ textbin_is_binary (enum textbin textbin)
return textbin < TEXTBIN_UNKNOWN;
}
+/* The high-order bit of a byte. */
+enum { HIBYTE = 0x80 };
+
+/* True if every byte with HIBYTE off is a single-byte character.
+ UTF-8 has this property. */
+static bool easy_encoding;
+
+static void
+init_easy_encoding (void)
+{
+ easy_encoding = true;
+ for (int i = 0; i < HIBYTE; i++)
+ easy_encoding &= mbclen_cache[i] == 1;
+}
+
+/* Skip the easy bytes in a buffer that is guaranteed to have a sentinel
+ that is not easy, and return a pointer to the first non-easy byte.
+ In easy encodings, the easy bytes all have HIBYTE off.
+ In other encodings, no byte is easy. */
+static char const * _GL_ATTRIBUTE_PURE
+skip_easy_bytes (char const *buf)
+{
+ if (!easy_encoding)
+ return buf;
+
+ /* An unsigned type suitable for fast matching. */
+ typedef uintmax_t uword;
+
+ /* 0x8080..., extended to be wide enough for uword. */
+ uword hibyte_mask = (uword) -1 / UCHAR_MAX * HIBYTE;
+
+ /* Search a byte at a time until the pointer is aligned, then a
+ uword at a time until a match is found, then a byte at a time to
+ identify the exact byte. The uword search may go slightly past
+ the buffer end, but that's benign. */
+ char const *p;
+ uword const *s;
+ for (p = buf; (uintptr_t) p % sizeof (uword) != 0; p++)
+ if (*p & HIBYTE)
+ return p;
+ for (s = (uword const *) p; ! (*s & hibyte_mask); s++)
+ continue;
+ for (p = (char const *) s; ! (*p & HIBYTE); p++)
+ continue;
+ return p;
+}
+
/* Return the text type of data in BUF, of size SIZE. */
static enum textbin
-buffer_textbin (char const *buf, size_t size)
+buffer_textbin (char *buf, size_t size)
{
if (eolbyte && memchr (buf, '\0', size))
return TEXTBIN_BINARY;
@@ -467,9 +514,10 @@ buffer_textbin (char const *buf, size_t size)
size_t clen;
char const *p;
- for (p = buf; p < buf + size; p += clen)
+ buf[size] = -1;
+ for (p = buf; (p = skip_easy_bytes (p)) < buf + size; p += clen)
{
- clen = mb_clen (p, buf + size - p, &mbs);
+ clen = mbrlen (p, buf + size - p, &mbs);
if ((size_t) -2 <= clen)
return clen == (size_t) -2 ? TEXTBIN_UNKNOWN : TEXTBIN_BINARY;
}
@@ -481,7 +529,7 @@ buffer_textbin (char const *buf, size_t size)
/* Return the text type of a file. BUF, of size BUFSIZE, is the initial
buffer read from the file with descriptor FD and status ST. */
static enum textbin
-file_textbin (char const *buf, size_t bufsize, int fd, struct stat const *st)
+file_textbin (char *buf, size_t bufsize, int fd, struct stat const *st)
{
enum textbin textbin = buffer_textbin (buf, bufsize);
if (textbin_is_binary (textbin))
@@ -2417,6 +2465,7 @@ main (int argc, char **argv)
usage (EXIT_TROUBLE);
build_mbclen_cache ();
+ init_easy_encoding ();
/* If fgrep in a multibyte locale, then use grep if either
(1) case is ignored (where grep is typically faster), or
--
1.9.3
From 466ca44b0b50907e47ef6f1b4e1283e32d667f37 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Thu, 25 Sep 2014 17:14:56 -0700
Subject: [PATCH 2/2] grep: don't check extensively for invalid prefix bytes
unless -P
Problem reported by Jim Meyering in: http://bugs.gnu.org/18454#56
* src/grep.c (grep): After the first buffer is checked, leave the
file-type checker in TEXTBIN_UNKNOWN state only when -P is used.
Only the -P matcher has performance problems with checking binary
data that make it worthwhile to check every prefix input byte so
the -P matcher's TEXTBIN_UNKNOWN optimizations can come into play.
Other matchers can simply check the data directly, and using
TEXTBIN_UNKNOWN with them slows 'grep' down for no benefit.
---
src/grep.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/grep.c b/src/grep.c
index 948e427..3a8d9f5 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -1288,6 +1288,8 @@ grep (int fd, struct stat const *st)
nul_zapper = eol;
skip_nuls = skip_empty_lines;
}
+ else if (execute != Pexecute)
+ textbin = TEXTBIN_TEXT;
}
for (;;)
--
1.9.3