bug#18777: [PATCH] dfa: improvement for checking of multibyte character boundary

Norihiro Tanaka Tue, 16 Dec 2014 04:43:36 -0800

On Mon, 15 Dec 2014 09:43:54 -0800
Paul Eggert <egg...@cs.ucla.edu> wrote:


> Can't we improve this when using_utf8 () is true?  In that case, every
> ASCII character is always single byte.  Also, the bytes 0xc0, 0xc1,
> and 0xf5 through 0xff can be added to the table: they are not
> single-byte characters but they are always encoding errors so they will
> be a character boundary as far as skip_remains_mb is concerned.  This
> suggests that the table 'always_single_byte' should be renamed to
> something like 'always_character_boundary'.
> 
> >     wint_t wc = WEOF;
> > +  if (always_single_byte[*p])
> > +    return p;

Thanks for the review and suggestion.  If using_utf8 () is true, we can
set always_character_boundary to true except 0x80-0xbf.

> This won't assign anything to *WCP, contrary to the documented API for
> for skip_remains_mb.  This is OK (as callers don't care) but the API
> documentation should be changed to reflect the actual behavior.

Oh!  if WCP is needed, we must be go through step by step, as a wide
character before P is set to *WCP.  I fixed it and updated the API
documentation.

From 82d4b2e55203de0fbabc547de9b95b2a6e5018d0 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Mon, 15 Dec 2014 23:40:17 +0900
Subject: [PATCH] dfa: improvement for checking of multibyte character boundary

When found single bytes that cannot occur inside a multibyte character
we can skip check for multibyte character boundary before the character.

The improvement speeds up about 40% for input string which doesn't match
even the first part of a pattern.

* src/dfa.c (always_character_boundary): Add a new variable.  It caches
whether each byte can occur inside a multibyte character or not.
(dfaalwayscb): Add a new function.
(dfacomp): Use it.
(skip_remains_mb): If an input character is single bytes that cannot
occur inside a multibyte character, skip check for multibyte character
boundary until there.
---
 src/dfa.c | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/src/dfa.c b/src/dfa.c
index 806cb04..b65b7f2 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -451,6 +451,29 @@ struct dfa
 static void dfamust (struct dfa *dfa);
 static void regexp (void);
 
+/* True if each byte can not occur inside a multibyte character  */
+static bool always_character_boundary[NOTCHAR];
+
+static void
+dfaalwayscb (void)
+{
+  int i;
+  if (using_utf8 ())
+    {
+      for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
+        {
+          unsigned char uc = i;
+          always_character_boundary[uc] = !((uc & 0xc0) ^ 0x80);
+        }
+    }
+  else
+    {
+      unsigned char const ucs[] = { '\0', '\n', '\r', '.', '/' };
+      for (i = 0; i < sizeof ucs / sizeof ucs[0]; ++i)
+        always_character_boundary[ucs[i]] = true;
+    }
+}
+
 static void
 dfambcache (struct dfa *d)
 {
@@ -3273,12 +3296,18 @@ transit_state (struct dfa *d, state_num s, unsigned 
char const **pp,
    Given DFA state d, use mbs_to_wchar to advance MBP until it reaches or
    exceeds P.  If WCP is non-NULL, set *WCP to the final wide character
    processed, or if no wide character is processed, set it to WEOF.
-   Both P and MBP must be no larger than END.  */
+   Both P and MBP must be no larger than END.
+
+   If next character is character boundary, it always reaches P.  So if
+   WCP is NULL, can omit check step by step and immediately return P.  */
 static unsigned char const *
 skip_remains_mb (struct dfa *d, unsigned char const *p,
                  unsigned char const *mbp, char const *end, wint_t *wcp)
 {
   wint_t wc = WEOF;
+  if (wcp == NULL && always_character_boundary[*p])
+    return p;
+
   while (mbp < p)
     mbp += mbs_to_wchar (&wc, (char const *) mbp,
                          end - (char const *) mbp, d);
@@ -3713,6 +3742,7 @@ void
 dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
 {
   dfainit (d);
+  dfaalwayscb ();
   dfambcache (d);
   dfaparse (s, len, d);
   dfamust (d);
-- 
2.2.0

bug#18777: [PATCH] dfa: improvement for checking of multibyte character boundary

Reply via email to