Re: Some regular-expression performance hacking

Tom Lane Wed, 24 Feb 2021 15:20:54 -0800

I wrote:
> On my machine, the combination of these two ideas reduces the
> runtime of the example above from ~150 seconds to ~53 seconds,
> or nearly 3x better.  I see something like a 2% improvement on
> Joel's test corpus, which might just be noise.  So this isn't
> any sort of universal panacea, but it sure helps when LACON
> evaluation is the bottleneck.


After another round of testing, I really can't see any improvement
at all from that patch on anything except the original Tcl test
case.  Indeed, a lot of cases seem very slightly worse, perhaps
because compact() now has to make two passes over all the arcs.
So that's leaving me a bit dissatisfied with it; I'm going to
stick it on the back burner for now, in hopes of a better idea.

However, in a different line of thought, I realized that the
memory allocation logic could use some polishing.  It gives out
ten arcs per NFA state initially, and then adds ten more at a time.
However, that's not very bright when you look at the actual usage
patterns, because most states have only one or two out-arcs,
but some have lots and lots.  I instrumented things to gather
stats about arcs-per-state on your larger corpus, and I got this,
where the seond column is the total fraction of states having
the given number of arcs or fewer:

 arcs |      cum_fraction      
------+------------------------
    0 | 0.03152871318455725868
    1 | 0.55852399556959499493
    2 | 0.79408539124378449284
    3 | 0.86926656199366447221
    4 | 0.91726891675794579062
    5 | 0.92596934405572457792
    6 | 0.93491612836055807037
    7 | 0.94075102352639209644
    8 | 0.94486598829672779379
    9 | 0.94882085883928361399
   10 | 0.95137992908336444821
   11 | 0.95241399914559696173
   12 | 0.95436547669138874594
   13 | 0.95534682472329051385
   14 | 0.95653340893356523452
   15 | 0.95780804864876924571
   16 | 0.95902387577636979702
   17 | 0.95981494467267418552
   18 | 0.96048662216159976997
   19 | 0.96130294229052153065
   20 | 0.96196856160309755204
...
 3238 | 0.99999985870142624926
 3242 | 0.99999987047630739515
 4095 | 0.99999987342002768163
 4535 | 0.99999987930746825457
 4642 | 0.99999988225118854105
 4706 | 0.99999989402606968694
 5890 | 0.99999989696978997342
 6386 | 0.99999990874467111931
 7098 | 0.99999991168839140579
 7751 | 0.99999994701303484347
 7755 | 0.99999998233767828116
 7875 | 0.99999998822511885410
 8049 | 1.00000000000000000000

So it seemed clear to me that we should only give out a couple of arcs
per state initially, but then let it ramp up faster than 10 arcs per
additional malloc.  After a bit of fooling I have the attached.
This does nothing for the very largest examples in the corpus (the
ones that cause "regex too complex") --- those were well over the
REG_MAX_COMPILE_SPACE limit before and they still are.  But all the
rest get nicely smaller.  The average pg_regcomp memory consumption
drops from ~89K to ~48K.

                        regards, tom lane

diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c
index 69929eddfc..411c86f4ff 100644
--- a/src/backend/regex/regc_nfa.c
+++ b/src/backend/regex/regc_nfa.c
@@ -142,9 +142,12 @@ newstate(struct nfa *nfa)
 	{
 		s = nfa->free;
 		nfa->free = s->next;
+		/* the state's free arcs stay as they are */
 	}
 	else
 	{
+		int			i;
+
 		if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
 		{
 			NERR(REG_ETOOBIG);
@@ -157,9 +160,16 @@ newstate(struct nfa *nfa)
 			return NULL;
 		}
 		nfa->v->spaceused += sizeof(struct state);
-		s->oas.next = NULL;
-		s->free = NULL;
-		s->noas = 0;
+		s->lastab = NULL;
+		/* initialize free arcs; this should match allocarc() */
+		for (i = 0; i < NINITARCS - 1; i++)
+		{
+			s->inita[i].type = 0;
+			s->inita[i].freechain = &s->inita[i + 1];
+		}
+		s->inita[i].type = 0;
+		s->inita[i].freechain = NULL;
+		s->free = &s->inita[0];
 	}
 
 	assert(nfa->nstates >= 0);
@@ -255,11 +265,11 @@ destroystate(struct nfa *nfa,
 	struct arcbatch *abnext;
 
 	assert(s->no == FREESTATE);
-	for (ab = s->oas.next; ab != NULL; ab = abnext)
+	for (ab = s->lastab; ab != NULL; ab = abnext)
 	{
 		abnext = ab->next;
+		nfa->v->spaceused -= ARCBATCHSIZE(ab->narcs);
 		FREE(ab);
-		nfa->v->spaceused -= sizeof(struct arcbatch);
 	}
 	s->ins = NULL;
 	s->outs = NULL;
@@ -377,41 +387,39 @@ allocarc(struct nfa *nfa,
 {
 	struct arc *a;
 
-	/* shortcut */
-	if (s->free == NULL && s->noas < ABSIZE)
-	{
-		a = &s->oas.a[s->noas];
-		s->noas++;
-		return a;
-	}
-
 	/* if none at hand, get more */
 	if (s->free == NULL)
 	{
 		struct arcbatch *newAb;
-		int			i;
+		size_t		narcs;
+		size_t		i;
 
 		if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
 		{
 			NERR(REG_ETOOBIG);
 			return NULL;
 		}
-		newAb = (struct arcbatch *) MALLOC(sizeof(struct arcbatch));
+		narcs = (s->lastab != NULL) ? s->lastab->narcs * 2 : FIRSTABSIZE;
+		if (narcs > MAXABSIZE)
+			narcs = MAXABSIZE;
+		newAb = (struct arcbatch *) MALLOC(ARCBATCHSIZE(narcs));
 		if (newAb == NULL)
 		{
 			NERR(REG_ESPACE);
 			return NULL;
 		}
-		nfa->v->spaceused += sizeof(struct arcbatch);
-		newAb->next = s->oas.next;
-		s->oas.next = newAb;
+		nfa->v->spaceused += ARCBATCHSIZE(narcs);
+		newAb->narcs = narcs;
+		newAb->next = s->lastab;
+		s->lastab = newAb;
 
-		for (i = 0; i < ABSIZE; i++)
+		for (i = 0; i < narcs - 1; i++)
 		{
 			newAb->a[i].type = 0;
 			newAb->a[i].freechain = &newAb->a[i + 1];
 		}
-		newAb->a[ABSIZE - 1].freechain = NULL;
+		newAb->a[i].type = 0;
+		newAb->a[i].freechain = NULL;
 		s->free = &newAb->a[0];
 	}
 	assert(s->free != NULL);
@@ -3413,7 +3421,7 @@ dumparc(struct arc *a,
 		FILE *f)
 {
 	struct arc *aa;
-	struct arcbatch *ab;
+	bool		found = false;
 
 	fprintf(f, "\t");
 	switch (a->type)
@@ -3451,15 +3459,29 @@ dumparc(struct arc *a,
 	}
 	if (a->from != s)
 		fprintf(f, "?%d?", a->from->no);
-	for (ab = &a->from->oas; ab != NULL; ab = ab->next)
+
+	/*
+	 * Check that arc belongs to its from-state's arc space.  This is
+	 * approximate, since we don't verify the arc isn't misaligned within the
+	 * space; but it's enough to catch the likely problem of an arc being
+	 * misassigned to a different parent state.
+	 */
+	if (a >= a->from->inita && a < a->from->inita + NINITARCS)
+		found = true;			/* member of initial arcset */
+	else
 	{
-		for (aa = &ab->a[0]; aa < &ab->a[ABSIZE]; aa++)
-			if (aa == a)
+		struct arcbatch *ab;
+
+		for (ab = a->from->lastab; ab != NULL; ab = ab->next)
+		{
+			if (a >= ab->a && a < ab->a + ab->narcs)
+			{
+				found = true;
 				break;			/* NOTE BREAK OUT */
-		if (aa < &ab->a[ABSIZE])	/* propagate break */
-			break;				/* NOTE BREAK OUT */
+			}
+		}
 	}
-	if (ab == NULL)
+	if (!found)
 		fprintf(f, "?!?");		/* not in allocated space */
 	fprintf(f, "->");
 	if (a->to == NULL)
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h
index 306525eb5f..65e6ace9d1 100644
--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@@ -294,12 +294,23 @@ struct arc
 	struct arc *colorchainRev;	/* back-link in color's arc chain */
 };
 
+/*
+ * We include NINITARCS arcs directly in the parent state, to save a separate
+ * malloc for states with few out-arcs.  If more arcs are needed, the first
+ * arcbatch has size FIRSTABSIZE; each succeeding batch doubles in size,
+ * up to MAXABSIZE.
+ */
+#define  NINITARCS		2		/* 2 is enough for about 80% of states */
+#define  FIRSTABSIZE	8		/* 2+8 is enough for 95% of states */
+#define  MAXABSIZE		64		/* don't waste too much space per arcbatch */
+
 struct arcbatch
 {								/* for bulk allocation of arcs */
-	struct arcbatch *next;
-#define  ABSIZE  10
-	struct arc	a[ABSIZE];
+	struct arcbatch *next;		/* chain link */
+	size_t		narcs;			/* number of arcs allocated in this arcbatch */
+	struct arc	a[FLEXIBLE_ARRAY_MEMBER];
 };
+#define  ARCBATCHSIZE(n)  ((n) * sizeof(struct arc) + offsetof(struct arcbatch, a))
 
 struct state
 {
@@ -312,10 +323,10 @@ struct state
 	struct arc *outs;			/* chain of outarcs */
 	struct arc *free;			/* chain of free arcs */
 	struct state *tmp;			/* temporary for traversal algorithms */
-	struct state *next;			/* chain for traversing all */
-	struct state *prev;			/* back chain */
-	struct arcbatch oas;		/* first arcbatch, avoid malloc in easy case */
-	int			noas;			/* number of arcs used in first arcbatch */
+	struct state *next;			/* chain for traversing all live states */
+	struct state *prev;			/* back-link in chain of all live states */
+	struct arcbatch *lastab;	/* chain of associated arcbatches */
+	struct arc	inita[NINITARCS];	/* a few arcs to avoid extra malloc */
 };
 
 struct nfa
@@ -413,7 +424,7 @@ struct cnfa
  */
 #ifndef REG_MAX_COMPILE_SPACE
 #define REG_MAX_COMPILE_SPACE  \
-	(100000 * sizeof(struct state) + 100000 * sizeof(struct arcbatch))
+	(100000 * sizeof(struct state) + 2000000 * sizeof(struct arc))
 #endif
 
 /*

Re: Some regular-expression performance hacking

Reply via email to