Thank you for the updated patch. I am attaching it (with a detailed commit/ChangeLog I have added, and a NEWS entry), along with four additional commits. One fixes an actual bug. I expect to merge those commits into yours, discarding their individual commit logs. I leave them separate here solely to make the changes easier to review.
By the way, I measured the speed-up before and after your change on an idle AMD FX-4100 (best of 10), running these commands: yes $(printf '%078dm' 0)|head -1000000 > in for i in $(seq 10); do env LC_ALL=ja_JP.eucJP time src/grep -v m in; done Before: 2.46 After: 0.36 Let me know if you see anything that can be improved, Jim
From 9aff0f41414933a945b13328111bdb151ac9a9ee Mon Sep 17 00:00:00 2001 From: Jim Meyering <meyer...@fb.com> Date: Thu, 30 Jan 2014 12:56:04 -0800 Subject: [PATCH 1/5] speed up mb-boundary-detection after each preliminary match After each kwsexec or dfaexec match, we must determine whether the tentative match falls in the middle of a multi-byte character. That is what our is_mb_middle function does, but it was expensive, even when most input consisted of single-byte characters. The main cost was for each call to mbrlen. This change constructs and uses a cache of the lengths returned by mbrlen for unibyte values. The largest speed-up (3x to 7x, CPU-dependent) is when most lines contain a match, yet few are printed, e.g., when using grep -v common-pattern ... to filter out all but a few lines. * src/search.h (build_mbclen_cache): Declare it. * src/main.c: Include "search.h". [MBS_SUPPORT] (main): Call build_mbclen_cache in a multibyte locale. * src/searchutils.c [HAVE_LANGINFO_CODESET]: Include <langinfo.h>. (mbclen_cache): New global. (build_mbclen_cache): New function. (is_mb_middle) [HAVE_LANGINFO_CODESET]: Use it. * NEWS (Improvements): Mention it. --- NEWS | 3 +++ src/main.c | 6 ++++++ src/search.h | 1 + src/searchutils.c | 48 ++++++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 54 insertions(+), 4 deletions(-) diff --git a/NEWS b/NEWS index a662960..2ff7272 100644 --- a/NEWS +++ b/NEWS @@ -7,6 +7,9 @@ GNU grep NEWS -*- outline -*- grep -i in a multibyte locale is now typically 10 times faster for patterns that do not contain \ or [. + grep (without -i) in a multibyte locale is now up to 7 times faster + when processing many matched lines. + Range expressions in unibyte locales now ordinarily use the rational range interpretation, in which [a-z] matches only lower-case ASCII letters regardless of locale, and similarly for other ranges. (This diff --git a/src/main.c b/src/main.c index 42f9ff3..54d9dfc 100644 --- a/src/main.c +++ b/src/main.c @@ -46,6 +46,7 @@ #include "propername.h" #include "quote.h" #include "safe-read.h" +#include "search.h" #include "version-etc.h" #include "xalloc.h" #include "xstrtol.h" @@ -2364,6 +2365,11 @@ main (int argc, char **argv) } } +#if MBS_SUPPORT + if (MB_CUR_MAX > 1) + build_mbclen_cache (); +#endif + compile (keys, keycc); free (keys); diff --git a/src/search.h b/src/search.h index 61dcf95..12d0822 100644 --- a/src/search.h +++ b/src/search.h @@ -46,6 +46,7 @@ typedef signed char mb_len_map_t; extern void kwsinit (kwset_t *); extern char *mbtolower (const char *, size_t *, mb_len_map_t **); +extern void build_mbclen_cache (void); extern bool is_mb_middle (const char **, const char *, const char *, size_t); /* dfasearch.c */ diff --git a/src/searchutils.c b/src/searchutils.c index 778f4ad..1c35451 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -19,9 +19,16 @@ #include <config.h> #include <assert.h> #include "search.h" +#if HAVE_LANGINFO_CODESET +# include <langinfo.h> +#endif + +#define STREQ(a, b) (strcmp (a, b) == 0) #define NCHAR (UCHAR_MAX + 1) +static size_t mbclen_cache[NCHAR]; + void kwsinit (kwset_t *kwset) { @@ -207,6 +214,20 @@ mbtolower (const char *beg, size_t *n, mb_len_map_t **len_map_p) return out; } +/* Initialize a cache of mbrlen values for each of its 1-byte inputs. */ +void +build_mbclen_cache (void) +{ + int i; + + for (i = CHAR_MIN; i <= CHAR_MAX; ++i) + { + char c = i; + unsigned char uc = i; + mbstate_t mbs = { 0 }; + mbclen_cache[uc] = mbrlen (&c, 1, &mbs); + } +} bool is_mb_middle (const char **good, const char *buf, const char *end, @@ -215,16 +236,35 @@ is_mb_middle (const char **good, const char *buf, const char *end, const char *p = *good; const char *prev = p; mbstate_t cur_state; +#if HAVE_LANGINFO_CODESET + static int is_utf8 = -1; + + if (is_utf8 == -1) + is_utf8 = (STREQ (nl_langinfo (CODESET), "UTF-8") == 0); + + if (is_utf8 && buf - p > MB_CUR_MAX) + { + for (p = buf; buf - p > MB_CUR_MAX; p--) + if (mbclen_cache[(unsigned char) *p] != (size_t) -1) + break; + + if (buf - p == MB_CUR_MAX) + p = buf; + } +#endif + + memset(&cur_state, 0, sizeof cur_state); - /* TODO: can be optimized for UTF-8. */ - memset(&cur_state, 0, sizeof(mbstate_t)); while (p < buf) { - size_t mbclen = mbrlen(p, end - p, &cur_state); + size_t mbclen = mbclen_cache[(unsigned char) *p]; + + if (mbclen == (size_t) -2) + mbclen = mbrlen (p, end - p, &cur_state); /* Store the beginning of the previous complete multibyte character. */ if (mbclen != (size_t) -2) - prev = p; + prev = p; if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) { -- 1.8.5.3.321.g14598b9 From a571dc8a18d43d54cdf4e43b64664519947e91e1 Mon Sep 17 00:00:00 2001 From: Jim Meyering <meyer...@fb.com> Date: Thu, 30 Jan 2014 12:59:00 -0800 Subject: [PATCH 2/5] minor fix-up use to_uchar in place of explicit cast, twice remove a space on accidentally over-indented line --- src/searchutils.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/searchutils.c b/src/searchutils.c index 1c35451..8062eb4 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -245,7 +245,7 @@ is_mb_middle (const char **good, const char *buf, const char *end, if (is_utf8 && buf - p > MB_CUR_MAX) { for (p = buf; buf - p > MB_CUR_MAX; p--) - if (mbclen_cache[(unsigned char) *p] != (size_t) -1) + if (mbclen_cache[to_uchar (*p)] != (size_t) -1) break; if (buf - p == MB_CUR_MAX) @@ -257,14 +257,14 @@ is_mb_middle (const char **good, const char *buf, const char *end, while (p < buf) { - size_t mbclen = mbclen_cache[(unsigned char) *p]; + size_t mbclen = mbclen_cache[to_uchar (*p)]; if (mbclen == (size_t) -2) mbclen = mbrlen (p, end - p, &cur_state); /* Store the beginning of the previous complete multibyte character. */ if (mbclen != (size_t) -2) - prev = p; + prev = p; if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) { -- 1.8.5.3.321.g14598b9 From ee443ffcc5e1721cd54942798a5f57ec5883f101 Mon Sep 17 00:00:00 2001 From: Jim Meyering <meyer...@fb.com> Date: Thu, 30 Jan 2014 19:34:40 -0800 Subject: [PATCH 3/5] STREQ: do not define. already included from system.h via search.h --- src/searchutils.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/searchutils.c b/src/searchutils.c index 8062eb4..584c16f 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -23,8 +23,6 @@ # include <langinfo.h> #endif -#define STREQ(a, b) (strcmp (a, b) == 0) - #define NCHAR (UCHAR_MAX + 1) static size_t mbclen_cache[NCHAR]; -- 1.8.5.3.321.g14598b9 From 109053413e4cb77b5341cdc82485fd89acb061c0 Mon Sep 17 00:00:00 2001 From: Jim Meyering <meyer...@fb.com> Date: Thu, 30 Jan 2014 19:43:19 -0800 Subject: [PATCH 4/5] insert space before funtion-call-open-parenthesis --- src/searchutils.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/searchutils.c b/src/searchutils.c index 584c16f..7cbcd94 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -251,7 +251,7 @@ is_mb_middle (const char **good, const char *buf, const char *end, } #endif - memset(&cur_state, 0, sizeof cur_state); + memset (&cur_state, 0, sizeof cur_state); while (p < buf) { @@ -269,7 +269,7 @@ is_mb_middle (const char **good, const char *buf, const char *end, /* An invalid sequence, or a truncated multibyte character. We treat it as a single byte character. */ mbclen = 1; - memset(&cur_state, 0, sizeof cur_state); + memset (&cur_state, 0, sizeof cur_state); } p += mbclen; } -- 1.8.5.3.321.g14598b9 From c8466cd70cf02c9facfcbeab040340f27e174c4d Mon Sep 17 00:00:00 2001 From: Jim Meyering <meyer...@fb.com> Date: Sat, 1 Feb 2014 08:28:04 -0800 Subject: [PATCH 5/5] fix strcmp/STREQ conversion typo --- src/searchutils.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/searchutils.c b/src/searchutils.c index 7cbcd94..3478417 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -238,7 +238,7 @@ is_mb_middle (const char **good, const char *buf, const char *end, static int is_utf8 = -1; if (is_utf8 == -1) - is_utf8 = (STREQ (nl_langinfo (CODESET), "UTF-8") == 0); + is_utf8 = STREQ (nl_langinfo (CODESET), "UTF-8"); if (is_utf8 && buf - p > MB_CUR_MAX) { -- 1.8.5.3.321.g14598b9