On Fri, Jun 6, 2014 at 6:36 AM, Norihiro Tanaka <nori...@kcn.ne.jp> wrote:
> If we don't use KWset, struct dfamust doesn't have to build.  This patch
> make a change that it's built on demand.

Thank you for that patch.
I've adjusted the commit log as well as comments in dfasearch.c that
have been inaccurate for some time. Here is the revised patch I will push
later today or tomorrow:
From f41b1bcd57eee550af49bdbe3a7a243a218faf44 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Fri, 6 Jun 2014 19:08:08 +0900
Subject: [PATCH] dfa: build struct dfamust on demand

If we won't use KWset, do not build a "struct dfamust".
Now it is built only when needed.
* src/dfa.c (struct dfa) [musts]: Remove member.
(dfacomp): Don't build dfamust here.
(dfamustfree): New function to free a struct dfamust.
(dfamust): Make it a global function, and make it return a pointer
to a malloc'd struct dfamust.
(dfamusts): Remove it.
* src/dfa.h (struct dfamust) [next]: Remove member.
In the implementation preceding this patch, there was
never more than one of these in a given "struct dfa".
(dfamustfree, dfamust): Add prototypes.
(dfamusts): Remove prototype.
* src/dfasearch.c (kwsmusts): Adapt to use the new interface.
Update the comments to reflect reality.
This addresses http://bugs.gnu.org/17715
---
 src/dfa.c       | 60 ++++++++++++++++++++-----------------------------------
 src/dfa.h       |  8 +++++---
 src/dfasearch.c | 62 +++++++++++++++++++++++++++------------------------------
 3 files changed, 56 insertions(+), 74 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index c7b659e..9f024bb 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -429,9 +429,6 @@ struct dfa
                                    as a sentinel at the end of the buffer.  */
   state_num initstate_letter;   /* Initial state for letter context.  */
   state_num initstate_others;   /* Initial state for other contexts.  */
-  struct dfamust *musts;        /* List of strings, at least one of which
-                                   is known to appear in any r.e. matching
-                                   the dfa.  */
   position_set mb_follows;	/* Follow set added by ANYCHAR and/or MBCSET
                                    on demand.  */
   int *mb_match_lens;           /* Array of length reduced by ANYCHAR and/or
@@ -448,7 +445,6 @@ struct dfa
 #define ACCEPTS_IN_CONTEXT(prev, curr, state, dfa) \
   SUCCEEDS_IN_CONTEXT ((dfa).states[state].constraint, prev, curr)

-static void dfamust (struct dfa *dfa);
 static void regexp (void);

 static void
@@ -3647,7 +3643,6 @@ dfassbuild (struct dfa *d)
   sup->fails = NULL;
   sup->success = NULL;
   sup->newlines = NULL;
-  sup->musts = NULL;

   sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
   if (d->cindex)
@@ -3714,7 +3709,6 @@ dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
   dfainit (d);
   dfambcache (d);
   dfaparse (s, len, d);
-  dfamust (d);
   dfassbuild (d);
   dfaoptimize (d);
   dfaanalyze (d, searchflag);
@@ -3730,7 +3724,6 @@ void
 dfafree (struct dfa *d)
 {
   size_t i;
-  struct dfamust *dm, *ndm;

   free (d->charclasses);
   free (d->tokens);
@@ -3766,13 +3759,6 @@ dfafree (struct dfa *d)
       free (d->success);
     }

-  for (dm = d->musts; dm; dm = ndm)
-    {
-      ndm = dm->next;
-      free (dm->must);
-      free (dm);
-    }
-
   if (d->superset)
     dfafree (d->superset);
 }
@@ -3793,9 +3779,7 @@ dfafree (struct dfa *d)
         sequences that must constitute the match ("is")

    When we get to the root of the tree, we use one of the longest of its
-   calculated "in" sequences as our answer.  The sequence we find is returned in
-   d->must (where "d" is the single argument passed to "dfamust");
-   the length of the sequence is returned in d->mustn.
+   calculated "in" sequences as our answer.

    The sequences calculated for the various types of node (in pseudo ANSI c)
    are shown below.  "p" is the operand of unary operators (and the left-hand
@@ -4019,8 +4003,8 @@ freemust (must *mp)
   free (mp);
 }

-static void
-dfamust (struct dfa *d)
+struct dfamust *
+dfamust (struct dfa const *d)
 {
   must *mp = NULL;
   char const *result = "";
@@ -4029,7 +4013,6 @@ dfamust (struct dfa *d)
   bool exact = false;
   bool begline = false;
   bool endline = false;
-  struct dfamust *dm;

   for (ri = 0; ri < d->tindex; ++ri)
     {
@@ -4190,30 +4173,28 @@ dfamust (struct dfa *d)
               t = j;
               while (++j < NOTCHAR)
                 if (tstbit (j, *ccl)
-                    && ! (case_fold && !d->multibyte
+                    && ! (case_fold && MB_CUR_MAX == 1
                           && toupper (j) == toupper (t)))
                   break;
               if (j < NOTCHAR)
                 break;
             }
           mp->is[0] = mp->left[0] = mp->right[0]
-            = case_fold && !d->multibyte ? toupper (t) : t;
+            = case_fold && MB_CUR_MAX == 1 ? toupper (t) : t;
           mp->is[1] = mp->left[1] = mp->right[1] = '\0';
           mp->in = enlist (mp->in, mp->is, 1);
           break;
         }
     }
 done:
-  if (*result)
-    {
-      dm = xmalloc (sizeof *dm);
-      dm->exact = exact;
-      dm->begline = begline;
-      dm->endline = endline;
-      dm->must = xstrdup (result);
-      dm->next = d->musts;
-      d->musts = dm;
-    }
+  if (!*result)
+    return NULL;
+
+  struct dfamust *dm = xmalloc (sizeof *dm);
+  dm->exact = exact;
+  dm->begline = begline;
+  dm->endline = endline;
+  dm->must = xstrdup (result);

   while (mp)
     {
@@ -4221,18 +4202,21 @@ done:
       freemust (mp);
       mp = prev;
     }
+
+  return dm;
 }

-struct dfa *
-dfaalloc (void)
+void
+dfamustfree (struct dfamust *dm)
 {
-  return xmalloc (sizeof (struct dfa));
+  free (dm->must);
+  free (dm);
 }

-struct dfamust *_GL_ATTRIBUTE_PURE
-dfamusts (struct dfa const *d)
+struct dfa *
+dfaalloc (void)
 {
-  return d->musts;
+  return xmalloc (sizeof (struct dfa));
 }

 /* vim:set shiftwidth=2: */
diff --git a/src/dfa.h b/src/dfa.h
index 9f95415..e4beb7e 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -30,7 +30,6 @@ struct dfamust
   bool begline;
   bool endline;
   char *must;
-  struct dfamust *next;
 };

 /* The dfa structure. It is completely opaque. */
@@ -43,8 +42,11 @@ struct dfa;
    calling dfafree() on it. */
 extern struct dfa *dfaalloc (void);

-/* Return the dfamusts associated with a dfa. */
-extern struct dfamust *dfamusts (struct dfa const *);
+/* Build and return the struct dfamust from the given struct dfa. */
+extern struct dfamust *dfamust (struct dfa const *);
+
+/* Free the storage held by the components of a struct dfamust. */
+extern void dfamustfree (struct dfamust *);

 /* dfasyntax() takes three arguments; the first sets the syntax bits described
    earlier in this file, the second sets the case-folding flag, and the
diff --git a/src/dfasearch.c b/src/dfasearch.c
index c8fc91b..8b8af06 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -86,41 +86,37 @@ dfawarn (char const *mesg)
 static void
 kwsmusts (void)
 {
-  struct dfamust const *dm = dfamusts (dfa);
-  if (dm)
+  struct dfamust *dm = dfamust (dfa);
+  if (!dm)
+    return;
+  kwsinit (&kwset);
+  if (dm->exact)
     {
-      kwsinit (&kwset);
-      /* First, we compile in the substrings known to be exact
-         matches.  The kwset matcher will return the index
-         of the matching string that it chooses. */
-      for (; dm; dm = dm->next)
-        {
-          if (!dm->exact)
-            continue;
-          ++kwset_exact_matches;
-          size_t old_len = strlen (dm->must);
-          size_t new_len = old_len + dm->begline + dm->endline;
-          char *must = xmalloc (new_len);
-          char *mp = must;
-          *mp = eolbyte;
-          mp += dm->begline;
-          begline |= dm->begline;
-          memcpy (mp, dm->must, old_len);
-          if (dm->endline)
-            mp[old_len] = eolbyte;
-          kwsincr (kwset, must, new_len);
-          free (must);
-        }
-      /* Now, we compile the substrings that will require
-         the use of the regexp matcher.  */
-      for (dm = dfamusts (dfa); dm; dm = dm->next)
-        {
-          if (dm->exact)
-            continue;
-          kwsincr (kwset, dm->must, strlen (dm->must));
-        }
-      kwsprep (kwset);
+      /* Prepare a substring whose presence implies a match.
+         The kwset matcher will return the index of the matching
+         string that it chooses. */
+      ++kwset_exact_matches;
+      size_t old_len = strlen (dm->must);
+      size_t new_len = old_len + dm->begline + dm->endline;
+      char *must = xmalloc (new_len);
+      char *mp = must;
+      *mp = eolbyte;
+      mp += dm->begline;
+      begline |= dm->begline;
+      memcpy (mp, dm->must, old_len);
+      if (dm->endline)
+        mp[old_len] = eolbyte;
+      kwsincr (kwset, must, new_len);
+      free (must);
+    }
+  else
+    {
+      /* Otherwise, filtering with this substring should help reduce the
+         search space, but we'll still have to use the regexp matcher.  */
+      kwsincr (kwset, dm->must, strlen (dm->must));
     }
+  kwsprep (kwset);
+  dfamustfree (dm);
 }

 void
-- 
2.3.7

Reply via email to