Norihiro Tanaka wrote:
I thought of various things for the name of the function, but I could
not think of a good name.

Yes, that's a tough one. I eventually came up with maybe_disable_superset_dfa. Perhaps we can think of an ever better one.

I installed your patches into Gnulib, along with the attached relatively-minor fixups, and then propagated the result into grep by updating the Gnulib version that the grep master points to.

As you may notice, nowadays I prefer using signed types like ptrdiff_t to unsigned ones like size_t, as the signed types have a significant advantage in overflow checking with gcc -fsanitize=undefined. Perhaps at some point we should change the other instances of size_t in dfa.c, but one step at a time.

Thanks again for the performance improvement.
>From 5eae16cfb8d40ca75691605e8f2f4a40eab70c7a Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Tue, 18 Sep 2018 15:25:51 -0700
Subject: [PATCH 1/4] dfa: reorder enum for efficiency

* lib/dfa.c (END): Now -1 again.  Reorder other elements
of the enumeration to make it easier for GCC to generate
efficient code by using fewer comparisons to check for
ranges of values.
(atom): Take advantage of the reordering.
---
 ChangeLog |   9 ++++
 lib/dfa.c | 130 +++++++++++++++++++++++++++++-------------------------
 2 files changed, 78 insertions(+), 61 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 59581e3c5..64926503a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2018-09-18  Paul Eggert  <egg...@cs.ucla.edu>
+
+	dfa: reorder enum for efficiency
+	* lib/dfa.c (END): Now -1 again.  Reorder other elements
+	of the enumeration to make it easier for GCC to generate
+	efficient code by using fewer comparisons to check for
+	ranges of values.
+	(atom): Take advantage of the reordering.
+
 2018-09-18  Norihiro Tanaka  <nori...@kcn.ne.jp>
 
 	dfa: optimize alternation in NFA
diff --git a/lib/dfa.c b/lib/dfa.c
index 3991112ef..849645154 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -223,50 +223,24 @@ typedef ptrdiff_t state_num;
 /* Predefined token values.  */
 enum
 {
-  END = -2,                     /* END is a terminal symbol that matches the
+  END = -1,                     /* END is a terminal symbol that matches the
                                    end of input; any value of END or less in
                                    the parse tree is such a symbol.  Accepting
                                    states of the DFA are those that would have
-                                   a transition on END.  */
-
-  BEG = -1,                     /* BEG is a beginning symbol that matches the
-                                   biginning of input.  */
+                                   a transition on END.  This is -1, not some
+                                   more-negative value, to tweak the speed of
+                                   comparisons to END.  */
 
   /* Ordinary character values are terminal symbols that match themselves.  */
 
+  /* CSET must come last in the following list of special tokens.  Otherwise,
+     the list order matters only for performance.  Related special tokens
+     should have nearby values so that code like (t == ANYCHAR || t == MBCSET
+     || CSET <= t) can be done with a single machine-level comparison.  */
+
   EMPTY = NOTCHAR,              /* EMPTY is a terminal symbol that matches
                                    the empty string.  */
 
-  BACKREF,                      /* BACKREF is generated by \<digit>
-                                   or by any other construct that
-                                   is not completely handled.  If the scanner
-                                   detects a transition on backref, it returns
-                                   a kind of "semi-success" indicating that
-                                   the match will have to be verified with
-                                   a backtracking matcher.  */
-
-  BEGLINE,                      /* BEGLINE is a terminal symbol that matches
-                                   the empty string at the beginning of a
-                                   line.  */
-
-  ENDLINE,                      /* ENDLINE is a terminal symbol that matches
-                                   the empty string at the end of a line.  */
-
-  BEGWORD,                      /* BEGWORD is a terminal symbol that matches
-                                   the empty string at the beginning of a
-                                   word.  */
-
-  ENDWORD,                      /* ENDWORD is a terminal symbol that matches
-                                   the empty string at the end of a word.  */
-
-  LIMWORD,                      /* LIMWORD is a terminal symbol that matches
-                                   the empty string at the beginning or the
-                                   end of a word.  */
-
-  NOTLIMWORD,                   /* NOTLIMWORD is a terminal symbol that
-                                   matches the empty string not at
-                                   the beginning or end of a word.  */
-
   QMARK,                        /* QMARK is an operator of one argument that
                                    matches zero or one occurrences of its
                                    argument.  */
@@ -296,16 +270,49 @@ enum
 
   RPAREN,                       /* RPAREN never appears in the parse tree.  */
 
+  WCHAR,                        /* Only returned by lex.  wctok contains
+                                   the wide character representation.  */
+
   ANYCHAR,                      /* ANYCHAR is a terminal symbol that matches
                                    a valid multibyte (or single byte) character.
                                    It is used only if MB_CUR_MAX > 1.  */
 
+  BEG,                          /* BEG is an initial symbol that matches the
+                                   beginning of input.  */
+
+  BEGLINE,                      /* BEGLINE is a terminal symbol that matches
+                                   the empty string at the beginning of a
+                                   line.  */
+
+  ENDLINE,                      /* ENDLINE is a terminal symbol that matches
+                                   the empty string at the end of a line.  */
+
+  BEGWORD,                      /* BEGWORD is a terminal symbol that matches
+                                   the empty string at the beginning of a
+                                   word.  */
+
+  ENDWORD,                      /* ENDWORD is a terminal symbol that matches
+                                   the empty string at the end of a word.  */
+
+  LIMWORD,                      /* LIMWORD is a terminal symbol that matches
+                                   the empty string at the beginning or the
+                                   end of a word.  */
+
+  NOTLIMWORD,                   /* NOTLIMWORD is a terminal symbol that
+                                   matches the empty string not at
+                                   the beginning or end of a word.  */
+
+  BACKREF,                      /* BACKREF is generated by \<digit>
+                                   or by any other construct that
+                                   is not completely handled.  If the scanner
+                                   detects a transition on backref, it returns
+                                   a kind of "semi-success" indicating that
+                                   the match will have to be verified with
+                                   a backtracking matcher.  */
+
   MBCSET,                       /* MBCSET is similar to CSET, but for
                                    multibyte characters.  */
 
-  WCHAR,                        /* Only returned by lex.  wctok contains
-                                   the wide character representation.  */
-
   CSET                          /* CSET and (and any value greater) is a
                                    terminal symbol that matches any of a
                                    class of characters.  */
@@ -1803,7 +1810,30 @@ add_utf8_anychar (struct dfa *dfa)
 static void
 atom (struct dfa *dfa)
 {
-  if (dfa->parse.tok == WCHAR)
+  if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
+      || dfa->parse.tok >= CSET
+      || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
+      || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
+      || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
+      || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
+      || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
+    {
+      if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
+        {
+          /* For UTF-8 expand the period to a series of CSETs that define a
+             valid UTF-8 character.  This avoids using the slow multibyte
+             path.  I'm pretty sure it would be both profitable and correct to
+             do it for any encoding; however, the optimization must be done
+             manually as it is done above in add_utf8_anychar.  So, let's
+             start with UTF-8: it is the most used, and the structure of the
+             encoding makes the correctness more obvious.  */
+          add_utf8_anychar (dfa);
+        }
+      else
+        addtok (dfa, dfa->parse.tok);
+      dfa->parse.tok = lex (dfa);
+    }
+  else if (dfa->parse.tok == WCHAR)
     {
       if (dfa->lex.wctok == WEOF)
         addtok (dfa, BACKREF);
@@ -1826,28 +1856,6 @@ atom (struct dfa *dfa)
 
       dfa->parse.tok = lex (dfa);
     }
-  else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
-    {
-      /* For UTF-8 expand the period to a series of CSETs that define a valid
-         UTF-8 character.  This avoids using the slow multibyte path.  I'm
-         pretty sure it would be both profitable and correct to do it for
-         any encoding; however, the optimization must be done manually as
-         it is done above in add_utf8_anychar.  So, let's start with
-         UTF-8: it is the most used, and the structure of the encoding
-         makes the correctness more obvious.  */
-      add_utf8_anychar (dfa);
-      dfa->parse.tok = lex (dfa);
-    }
-  else if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
-           || dfa->parse.tok >= CSET || dfa->parse.tok == BACKREF
-           || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
-           || dfa->parse.tok == BEGWORD || dfa->parse.tok == ANYCHAR
-           || dfa->parse.tok == MBCSET || dfa->parse.tok == ENDWORD
-           || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD)
-    {
-      addtok (dfa, dfa->parse.tok);
-      dfa->parse.tok = lex (dfa);
-    }
   else if (dfa->parse.tok == LPAREN)
     {
       dfa->parse.tok = lex (dfa);
-- 
2.17.1

>From cbf5523bd54a92c1d8ffeae4b629d2b82651f651 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Tue, 18 Sep 2018 18:24:27 -0700
Subject: [PATCH 2/4] dfa: prune states as we go

* lib/dfa.c (prune): Remove.
(merge_nfa_state): Prune as we go instead of at the end.
Prefer ptrdiff_t for indexes, as this helps the compiler a bit.
Simplify constraint checking a bit.
---
 ChangeLog |  5 +++++
 lib/dfa.c | 49 ++++++++++++++++---------------------------------
 2 files changed, 21 insertions(+), 33 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 64926503a..bad8eda6d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,11 @@
 2018-09-18  Paul Eggert  <egg...@cs.ucla.edu>
 
+	dfa: prune states as we go
+	* lib/dfa.c (prune): Remove.
 	dfa: reorder enum for efficiency
+	(merge_nfa_state): Prune as we go instead of at the end.
+	Prefer ptrdiff_t for indexes, as this helps the compiler a bit.
+
 	* lib/dfa.c (END): Now -1 again.  Reorder other elements
 	of the enumeration to make it easier for GCC to generate
 	efficient code by using fewer comparisons to check for
diff --git a/lib/dfa.c b/lib/dfa.c
index 849645154..73bd6ca09 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -2129,22 +2129,6 @@ delete (size_t del, position_set *s)
   return 0;
 }
 
-static void
-prune (position_set *s)
-{
-  size_t d = 0;
-
-  for (size_t i = 0; i < s->nelem; i++)
-    {
-      if (s->elems[i].constraint == 0)
-        continue;
-
-      s->elems[d++] = s->elems[i];
-    }
-
-  s->nelem = d;
-}
-
 /* Replace a position with the followed set.  */
 static void
 replace (position_set *dst, size_t del, position_set *add,
@@ -2362,17 +2346,20 @@ static size_t
 merge_nfa_state (struct dfa *d, size_t *queue, size_t nqueue, size_t tindex,
                  int *flags, position_set *merged)
 {
-  size_t sindex;
-  size_t dindex;
+  position_set *follows = d->follows;
+  ptrdiff_t nelem = 0;
 
-  for (size_t i = 0; i < d->follows[tindex].nelem; i++)
+  for (ptrdiff_t i = 0; i < follows[tindex].nelem; i++)
     {
-      dindex = d->follows[tindex].elems[i].index;
+      size_t dindex = follows[tindex].elems[i].index;
 
       /* Skip the node as pruned in future.  */
-      if (d->follows[tindex].elems[i].constraint == 0)
+      unsigned int iconstraint = follows[tindex].elems[i].constraint;
+      if (iconstraint == 0)
         continue;
 
+      follows[tindex].elems[nelem++] = follows[tindex].elems[i];
+
       if (tindex < dindex && !(flags[dindex] & OPT_QUEUED))
         {
           queue[nqueue++] = dindex;
@@ -2382,11 +2369,11 @@ merge_nfa_state (struct dfa *d, size_t *queue, size_t nqueue, size_t tindex,
       if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
         continue;
 
-      for (size_t j = i + 1; j < d->follows[tindex].nelem; j++)
+      for (size_t j = i + 1; j < follows[tindex].nelem; j++)
         {
-          sindex = d->follows[tindex].elems[j].index;
+          size_t sindex = follows[tindex].elems[j].index;
 
-          if (d->follows[tindex].elems[j].constraint == 0)
+          if (follows[tindex].elems[j].constraint != iconstraint)
             continue;
 
           if (flags[sindex] & (OPT_LPAREN | OPT_RPAREN))
@@ -2395,25 +2382,21 @@ merge_nfa_state (struct dfa *d, size_t *queue, size_t nqueue, size_t tindex,
           if (d->tokens[sindex] != d->tokens[dindex])
             continue;
 
-          if (d->follows[tindex].elems[i].constraint !=
-              d->follows[tindex].elems[j].constraint)
-            continue;
-
           if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
             continue;
 
           if (flags[sindex] & OPT_REPEAT)
-            delete (sindex, &d->follows[sindex]);
+            delete (sindex, &follows[sindex]);
 
-          merge (&d->follows[dindex], &d->follows[sindex], merged);
-          copy (merged, &d->follows[dindex]);
+          merge (&follows[dindex], &follows[sindex], merged);
+          copy (merged, &follows[dindex]);
 
           /* Mark it as pruned in future.  */
-          d->follows[tindex].elems[j].constraint = 0;
+          follows[tindex].elems[j].constraint = 0;
         }
     }
 
-  prune (&d->follows[tindex]);
+  follows[tindex].nelem = nelem;
 
   return nqueue;
 }
-- 
2.17.1

>From 28d7f171a7ac284c2377793559d55e887610fcc8 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Tue, 18 Sep 2018 19:05:26 -0700
Subject: [PATCH 3/4] dfa: tweak allocation performance
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* lib/dfa.c (merge_nfa_state, dfaoptimize):
Prefer ptrdiff_t for indexes some more.
Use char for flags, as it’s wide enough.
Allocate queue and flags together, with one malloc call.
No need to use xnmalloc since the multiplication and
addition cannot overflow (it’s already been checked by
earlier allocation).  Prefer memset to open-coding.
---
 ChangeLog |  9 +++++++++
 lib/dfa.c | 34 ++++++++++++++++------------------
 2 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index bad8eda6d..9c4b12c60 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,14 @@
 2018-09-18  Paul Eggert  <egg...@cs.ucla.edu>
 
+	dfa: tweak allocation performance
+	* lib/dfa.c (merge_nfa_state, dfaoptimize):
+	Prefer ptrdiff_t for indexes some more.
+	Use char for flags, as it’s wide enough.
+	Allocate queue and flags together, with one malloc call.
+	No need to use xnmalloc since the multiplication and
+	addition cannot overflow (it’s already been checked by
+	earlier allocation).  Prefer memset to open-coding.
+
 	dfa: prune states as we go
 	* lib/dfa.c (prune): Remove.
 	dfa: reorder enum for efficiency
diff --git a/lib/dfa.c b/lib/dfa.c
index 73bd6ca09..59e15195a 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -2342,9 +2342,9 @@ enum
   OPT_QUEUED = (1 << 4)
 };
 
-static size_t
-merge_nfa_state (struct dfa *d, size_t *queue, size_t nqueue, size_t tindex,
-                 int *flags, position_set *merged)
+static ptrdiff_t
+merge_nfa_state (struct dfa *d, size_t *queue, ptrdiff_t nqueue, size_t tindex,
+                 char *flags, position_set *merged)
 {
   position_set *follows = d->follows;
   ptrdiff_t nelem = 0;
@@ -2369,7 +2369,7 @@ merge_nfa_state (struct dfa *d, size_t *queue, size_t nqueue, size_t tindex,
       if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
         continue;
 
-      for (size_t j = i + 1; j < follows[tindex].nelem; j++)
+      for (ptrdiff_t j = i + 1; j < follows[tindex].nelem; j++)
         {
           size_t sindex = follows[tindex].elems[j].index;
 
@@ -2404,28 +2404,27 @@ merge_nfa_state (struct dfa *d, size_t *queue, size_t nqueue, size_t tindex,
 static void
 dfaoptimize (struct dfa *d)
 {
-  int *flags;
+  char *flags;
   size_t *queue;
-  size_t nqueue;
+  ptrdiff_t nqueue;
   position_set merged0;
   position_set *merged;
+  ptrdiff_t extra_space = d->tindex + sizeof *queue - 1;
+  extra_space -= extra_space % sizeof *queue;
 
-  flags = xnmalloc (d->tindex, sizeof *flags);
-  queue = xnmalloc (d->nleaves, sizeof *queue);
-
-  for (size_t i = 0; i < d->tindex; ++i)
-    flags[i] = 0;
+  queue = xmalloc (d->nleaves * sizeof *queue + extra_space);
+  flags = (char *) (queue + d->nleaves);
+  memset (flags, 0, d->tindex * sizeof *flags);
 
-  for (size_t i = 0; i < d->tindex; ++i)
+  for (size_t i = 0; i < d->tindex; i++)
     {
-      for (size_t j = 0; j < d->follows[i].nelem; j++)
+      for (ptrdiff_t j = 0; j < d->follows[i].nelem; j++)
         {
           if (d->follows[i].elems[j].index == i)
             flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
           else if (d->follows[i].elems[j].index < i)
             flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
-          else if (flags[d->follows[i].elems[j].index] &=
-                   OPT_WALKED)
+          else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
             flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
           else
             flags[d->follows[i].elems[j].index] |= OPT_WALKED;
@@ -2438,12 +2437,11 @@ dfaoptimize (struct dfa *d)
   nqueue = 0;
   queue[nqueue++] = 0;
 
-  for (size_t i = 0; i < nqueue; i++)
+  for (ptrdiff_t i = 0; i < nqueue; i++)
     nqueue = merge_nfa_state (d, queue, nqueue, queue[i], flags, merged);
 
   free (merged->elems);
   free (queue);
-  free (flags);
 }
 
 /* Perform bottom-up analysis on the parse tree, computing various functions.
@@ -3921,7 +3919,7 @@ dfamust (struct dfa const *d)
   bool need_endline = false;
   bool case_fold_unibyte = d->syntax.case_fold && MB_CUR_MAX == 1;
 
-  for (size_t ri = 1; ri < d->tindex - 1; ++ri)
+  for (size_t ri = 1; ri + 1 < d->tindex; ri++)
     {
       token t = d->tokens[ri];
       switch (t)
-- 
2.17.1

>From bc7c0f010b3c6c4214e026bf678a62ef92f61a4f Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Tue, 18 Sep 2018 19:12:06 -0700
Subject: [PATCH 4/4] dfa: use more-informative function name

* lib/dfa.c (maybe_disable_superset_dfa):
Rename from dfautf8noss.  Use change.
---
 ChangeLog | 4 ++++
 lib/dfa.c | 6 ++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 9c4b12c60..fa7a4557d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
 2018-09-18  Paul Eggert  <egg...@cs.ucla.edu>
 
+	dfa: use more-informative function name
+	* lib/dfa.c (maybe_disable_superset_dfa):
+	Rename from dfautf8noss.  Use change.
+
 	dfa: tweak allocation performance
 	* lib/dfa.c (merge_nfa_state, dfaoptimize):
 	Prefer ptrdiff_t for indexes some more.
diff --git a/lib/dfa.c b/lib/dfa.c
index 59e15195a..760e060c3 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -3483,8 +3483,10 @@ dfa_supported (struct dfa const *d)
   return true;
 }
 
+/* Disable use of the superset DFA is it is not likely to help
+   performance.  */
 static void
-dfautf8noss (struct dfa *d)
+maybe_disable_superset_dfa (struct dfa *d)
 {
   if (!d->localeinfo.using_utf8)
     return;
@@ -3612,7 +3614,7 @@ dfacomp (char const *s, size_t len, struct dfa *d, bool searchflag)
 
   if (dfa_supported (d))
     {
-      dfautf8noss (d);
+      maybe_disable_superset_dfa (d);
       dfaanalyze (d, searchflag);
     }
   else
-- 
2.17.1

Reply via email to