bug#16544: Optimazation for is_mb_middle

Jim Meyering Sat, 01 Feb 2014 17:19:43 -0800

Thank you for the updated patch.
I am attaching it (with a detailed commit/ChangeLog I have added,
and a NEWS entry), along with four additional commits.
One fixes an actual bug.  I expect to merge those commits into yours,
discarding their individual commit logs.  I leave them separate here
solely to make the changes easier to review.


By the way, I measured the speed-up before and after your change
on an idle AMD FX-4100 (best of 10), running these commands:

  yes $(printf '%078dm' 0)|head -1000000 > in
  for i in $(seq 10); do env LC_ALL=ja_JP.eucJP time src/grep -v m in; done

Before: 2.46
After: 0.36

Let me know if you see anything that can be improved,

Jim

From 9aff0f41414933a945b13328111bdb151ac9a9ee Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyer...@fb.com>
Date: Thu, 30 Jan 2014 12:56:04 -0800
Subject: [PATCH 1/5] speed up mb-boundary-detection after each preliminary
 match

After each kwsexec or dfaexec match, we must determine whether
the tentative match falls in the middle of a multi-byte character.
That is what our is_mb_middle function does, but it was expensive,
even when most input consisted of single-byte characters.  The main
cost was for each call to mbrlen.  This change constructs and uses
a cache of the lengths returned by mbrlen for unibyte values.
The largest speed-up (3x to 7x, CPU-dependent) is when most
lines contain a match, yet few are printed, e.g., when using
grep -v common-pattern ... to filter out all but a few lines.

* src/search.h (build_mbclen_cache): Declare it.
* src/main.c: Include "search.h".
[MBS_SUPPORT] (main): Call build_mbclen_cache in a multibyte locale.
* src/searchutils.c [HAVE_LANGINFO_CODESET]: Include <langinfo.h>.
(mbclen_cache): New global.
(build_mbclen_cache): New function.
(is_mb_middle) [HAVE_LANGINFO_CODESET]: Use it.
* NEWS (Improvements): Mention it.
---
 NEWS              |  3 +++
 src/main.c        |  6 ++++++
 src/search.h      |  1 +
 src/searchutils.c | 48 ++++++++++++++++++++++++++++++++++++++++++++----
 4 files changed, 54 insertions(+), 4 deletions(-)

diff --git a/NEWS b/NEWS
index a662960..2ff7272 100644
--- a/NEWS
+++ b/NEWS
@@ -7,6 +7,9 @@ GNU grep NEWS                                    -*- outline -*-
   grep -i in a multibyte locale is now typically 10 times faster
   for patterns that do not contain \ or [.

+  grep (without -i) in a multibyte locale is now up to 7 times faster
+  when processing many matched lines.
+
   Range expressions in unibyte locales now ordinarily use the rational
   range interpretation, in which [a-z] matches only lower-case ASCII
   letters regardless of locale, and similarly for other ranges.  (This
diff --git a/src/main.c b/src/main.c
index 42f9ff3..54d9dfc 100644
--- a/src/main.c
+++ b/src/main.c
@@ -46,6 +46,7 @@
 #include "propername.h"
 #include "quote.h"
 #include "safe-read.h"
+#include "search.h"
 #include "version-etc.h"
 #include "xalloc.h"
 #include "xstrtol.h"
@@ -2364,6 +2365,11 @@ main (int argc, char **argv)
         }
     }

+#if MBS_SUPPORT
+  if (MB_CUR_MAX > 1)
+    build_mbclen_cache ();
+#endif
+
   compile (keys, keycc);
   free (keys);

diff --git a/src/search.h b/src/search.h
index 61dcf95..12d0822 100644
--- a/src/search.h
+++ b/src/search.h
@@ -46,6 +46,7 @@ typedef signed char mb_len_map_t;
 extern void kwsinit (kwset_t *);

 extern char *mbtolower (const char *, size_t *, mb_len_map_t **);
+extern void build_mbclen_cache (void);
 extern bool is_mb_middle (const char **, const char *, const char *, size_t);

 /* dfasearch.c */
diff --git a/src/searchutils.c b/src/searchutils.c
index 778f4ad..1c35451 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -19,9 +19,16 @@
 #include <config.h>
 #include <assert.h>
 #include "search.h"
+#if HAVE_LANGINFO_CODESET
+# include <langinfo.h>
+#endif
+
+#define STREQ(a, b) (strcmp (a, b) == 0)

 #define NCHAR (UCHAR_MAX + 1)

+static size_t mbclen_cache[NCHAR];
+
 void
 kwsinit (kwset_t *kwset)
 {
@@ -207,6 +214,20 @@ mbtolower (const char *beg, size_t *n, mb_len_map_t 
**len_map_p)
   return out;
 }

+/* Initialize a cache of mbrlen values for each of its 1-byte inputs.  */
+void
+build_mbclen_cache (void)
+{
+  int i;
+
+  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
+    {
+      char c = i;
+      unsigned char uc = i;
+      mbstate_t mbs = { 0 };
+      mbclen_cache[uc] = mbrlen (&c, 1, &mbs);
+    }
+}

 bool
 is_mb_middle (const char **good, const char *buf, const char *end,
@@ -215,16 +236,35 @@ is_mb_middle (const char **good, const char *buf, const 
char *end,
   const char *p = *good;
   const char *prev = p;
   mbstate_t cur_state;
+#if HAVE_LANGINFO_CODESET
+  static int is_utf8 = -1;
+
+  if (is_utf8 == -1)
+    is_utf8 = (STREQ (nl_langinfo (CODESET), "UTF-8") == 0);
+
+  if (is_utf8 && buf - p > MB_CUR_MAX)
+    {
+      for (p = buf; buf - p > MB_CUR_MAX; p--)
+        if (mbclen_cache[(unsigned char) *p] != (size_t) -1)
+          break;
+
+      if (buf - p == MB_CUR_MAX)
+        p = buf;
+    }
+#endif
+
+  memset(&cur_state, 0, sizeof cur_state);

-  /* TODO: can be optimized for UTF-8.  */
-  memset(&cur_state, 0, sizeof(mbstate_t));
   while (p < buf)
     {
-      size_t mbclen = mbrlen(p, end - p, &cur_state);
+      size_t mbclen = mbclen_cache[(unsigned char) *p];
+
+      if (mbclen == (size_t) -2)
+        mbclen = mbrlen (p, end - p, &cur_state);

       /* Store the beginning of the previous complete multibyte character.  */
       if (mbclen != (size_t) -2)
-        prev = p;
+         prev = p;

       if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
         {
-- 
1.8.5.3.321.g14598b9


From a571dc8a18d43d54cdf4e43b64664519947e91e1 Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyer...@fb.com>
Date: Thu, 30 Jan 2014 12:59:00 -0800
Subject: [PATCH 2/5] minor fix-up

use to_uchar in place of explicit cast, twice
remove a space on accidentally over-indented line
---
 src/searchutils.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/searchutils.c b/src/searchutils.c
index 1c35451..8062eb4 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -245,7 +245,7 @@ is_mb_middle (const char **good, const char *buf, const 
char *end,
   if (is_utf8 && buf - p > MB_CUR_MAX)
     {
       for (p = buf; buf - p > MB_CUR_MAX; p--)
-        if (mbclen_cache[(unsigned char) *p] != (size_t) -1)
+        if (mbclen_cache[to_uchar (*p)] != (size_t) -1)
           break;

       if (buf - p == MB_CUR_MAX)
@@ -257,14 +257,14 @@ is_mb_middle (const char **good, const char *buf, const 
char *end,

   while (p < buf)
     {
-      size_t mbclen = mbclen_cache[(unsigned char) *p];
+      size_t mbclen = mbclen_cache[to_uchar (*p)];

       if (mbclen == (size_t) -2)
         mbclen = mbrlen (p, end - p, &cur_state);

       /* Store the beginning of the previous complete multibyte character.  */
       if (mbclen != (size_t) -2)
-         prev = p;
+        prev = p;

       if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
         {
-- 
1.8.5.3.321.g14598b9


From ee443ffcc5e1721cd54942798a5f57ec5883f101 Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyer...@fb.com>
Date: Thu, 30 Jan 2014 19:34:40 -0800
Subject: [PATCH 3/5] STREQ: do not define. already included from system.h via
 search.h

---
 src/searchutils.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/searchutils.c b/src/searchutils.c
index 8062eb4..584c16f 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -23,8 +23,6 @@
 # include <langinfo.h>
 #endif

-#define STREQ(a, b) (strcmp (a, b) == 0)
-
 #define NCHAR (UCHAR_MAX + 1)

 static size_t mbclen_cache[NCHAR];
-- 
1.8.5.3.321.g14598b9


From 109053413e4cb77b5341cdc82485fd89acb061c0 Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyer...@fb.com>
Date: Thu, 30 Jan 2014 19:43:19 -0800
Subject: [PATCH 4/5] insert space before funtion-call-open-parenthesis

---
 src/searchutils.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/searchutils.c b/src/searchutils.c
index 584c16f..7cbcd94 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -251,7 +251,7 @@ is_mb_middle (const char **good, const char *buf, const 
char *end,
     }
 #endif

-  memset(&cur_state, 0, sizeof cur_state);
+  memset (&cur_state, 0, sizeof cur_state);

   while (p < buf)
     {
@@ -269,7 +269,7 @@ is_mb_middle (const char **good, const char *buf, const 
char *end,
           /* An invalid sequence, or a truncated multibyte character.
              We treat it as a single byte character.  */
           mbclen = 1;
-          memset(&cur_state, 0, sizeof cur_state);
+          memset (&cur_state, 0, sizeof cur_state);
         }
       p += mbclen;
     }
-- 
1.8.5.3.321.g14598b9


From c8466cd70cf02c9facfcbeab040340f27e174c4d Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyer...@fb.com>
Date: Sat, 1 Feb 2014 08:28:04 -0800
Subject: [PATCH 5/5] fix strcmp/STREQ conversion typo

---
 src/searchutils.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/searchutils.c b/src/searchutils.c
index 7cbcd94..3478417 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -238,7 +238,7 @@ is_mb_middle (const char **good, const char *buf, const 
char *end,
   static int is_utf8 = -1;

   if (is_utf8 == -1)
-    is_utf8 = (STREQ (nl_langinfo (CODESET), "UTF-8") == 0);
+    is_utf8 = STREQ (nl_langinfo (CODESET), "UTF-8");

   if (is_utf8 && buf - p > MB_CUR_MAX)
     {
-- 
1.8.5.3.321.g14598b9

bug#16544: Optimazation for is_mb_middle

Reply via email to