I wrote: > On my machine, the combination of these two ideas reduces the > runtime of the example above from ~150 seconds to ~53 seconds, > or nearly 3x better. I see something like a 2% improvement on > Joel's test corpus, which might just be noise. So this isn't > any sort of universal panacea, but it sure helps when LACON > evaluation is the bottleneck.
After another round of testing, I really can't see any improvement at all from that patch on anything except the original Tcl test case. Indeed, a lot of cases seem very slightly worse, perhaps because compact() now has to make two passes over all the arcs. So that's leaving me a bit dissatisfied with it; I'm going to stick it on the back burner for now, in hopes of a better idea. However, in a different line of thought, I realized that the memory allocation logic could use some polishing. It gives out ten arcs per NFA state initially, and then adds ten more at a time. However, that's not very bright when you look at the actual usage patterns, because most states have only one or two out-arcs, but some have lots and lots. I instrumented things to gather stats about arcs-per-state on your larger corpus, and I got this, where the seond column is the total fraction of states having the given number of arcs or fewer: arcs | cum_fraction ------+------------------------ 0 | 0.03152871318455725868 1 | 0.55852399556959499493 2 | 0.79408539124378449284 3 | 0.86926656199366447221 4 | 0.91726891675794579062 5 | 0.92596934405572457792 6 | 0.93491612836055807037 7 | 0.94075102352639209644 8 | 0.94486598829672779379 9 | 0.94882085883928361399 10 | 0.95137992908336444821 11 | 0.95241399914559696173 12 | 0.95436547669138874594 13 | 0.95534682472329051385 14 | 0.95653340893356523452 15 | 0.95780804864876924571 16 | 0.95902387577636979702 17 | 0.95981494467267418552 18 | 0.96048662216159976997 19 | 0.96130294229052153065 20 | 0.96196856160309755204 ... 3238 | 0.99999985870142624926 3242 | 0.99999987047630739515 4095 | 0.99999987342002768163 4535 | 0.99999987930746825457 4642 | 0.99999988225118854105 4706 | 0.99999989402606968694 5890 | 0.99999989696978997342 6386 | 0.99999990874467111931 7098 | 0.99999991168839140579 7751 | 0.99999994701303484347 7755 | 0.99999998233767828116 7875 | 0.99999998822511885410 8049 | 1.00000000000000000000 So it seemed clear to me that we should only give out a couple of arcs per state initially, but then let it ramp up faster than 10 arcs per additional malloc. After a bit of fooling I have the attached. This does nothing for the very largest examples in the corpus (the ones that cause "regex too complex") --- those were well over the REG_MAX_COMPILE_SPACE limit before and they still are. But all the rest get nicely smaller. The average pg_regcomp memory consumption drops from ~89K to ~48K. regards, tom lane
diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c index 69929eddfc..411c86f4ff 100644 --- a/src/backend/regex/regc_nfa.c +++ b/src/backend/regex/regc_nfa.c @@ -142,9 +142,12 @@ newstate(struct nfa *nfa) { s = nfa->free; nfa->free = s->next; + /* the state's free arcs stay as they are */ } else { + int i; + if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE) { NERR(REG_ETOOBIG); @@ -157,9 +160,16 @@ newstate(struct nfa *nfa) return NULL; } nfa->v->spaceused += sizeof(struct state); - s->oas.next = NULL; - s->free = NULL; - s->noas = 0; + s->lastab = NULL; + /* initialize free arcs; this should match allocarc() */ + for (i = 0; i < NINITARCS - 1; i++) + { + s->inita[i].type = 0; + s->inita[i].freechain = &s->inita[i + 1]; + } + s->inita[i].type = 0; + s->inita[i].freechain = NULL; + s->free = &s->inita[0]; } assert(nfa->nstates >= 0); @@ -255,11 +265,11 @@ destroystate(struct nfa *nfa, struct arcbatch *abnext; assert(s->no == FREESTATE); - for (ab = s->oas.next; ab != NULL; ab = abnext) + for (ab = s->lastab; ab != NULL; ab = abnext) { abnext = ab->next; + nfa->v->spaceused -= ARCBATCHSIZE(ab->narcs); FREE(ab); - nfa->v->spaceused -= sizeof(struct arcbatch); } s->ins = NULL; s->outs = NULL; @@ -377,41 +387,39 @@ allocarc(struct nfa *nfa, { struct arc *a; - /* shortcut */ - if (s->free == NULL && s->noas < ABSIZE) - { - a = &s->oas.a[s->noas]; - s->noas++; - return a; - } - /* if none at hand, get more */ if (s->free == NULL) { struct arcbatch *newAb; - int i; + size_t narcs; + size_t i; if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE) { NERR(REG_ETOOBIG); return NULL; } - newAb = (struct arcbatch *) MALLOC(sizeof(struct arcbatch)); + narcs = (s->lastab != NULL) ? s->lastab->narcs * 2 : FIRSTABSIZE; + if (narcs > MAXABSIZE) + narcs = MAXABSIZE; + newAb = (struct arcbatch *) MALLOC(ARCBATCHSIZE(narcs)); if (newAb == NULL) { NERR(REG_ESPACE); return NULL; } - nfa->v->spaceused += sizeof(struct arcbatch); - newAb->next = s->oas.next; - s->oas.next = newAb; + nfa->v->spaceused += ARCBATCHSIZE(narcs); + newAb->narcs = narcs; + newAb->next = s->lastab; + s->lastab = newAb; - for (i = 0; i < ABSIZE; i++) + for (i = 0; i < narcs - 1; i++) { newAb->a[i].type = 0; newAb->a[i].freechain = &newAb->a[i + 1]; } - newAb->a[ABSIZE - 1].freechain = NULL; + newAb->a[i].type = 0; + newAb->a[i].freechain = NULL; s->free = &newAb->a[0]; } assert(s->free != NULL); @@ -3413,7 +3421,7 @@ dumparc(struct arc *a, FILE *f) { struct arc *aa; - struct arcbatch *ab; + bool found = false; fprintf(f, "\t"); switch (a->type) @@ -3451,15 +3459,29 @@ dumparc(struct arc *a, } if (a->from != s) fprintf(f, "?%d?", a->from->no); - for (ab = &a->from->oas; ab != NULL; ab = ab->next) + + /* + * Check that arc belongs to its from-state's arc space. This is + * approximate, since we don't verify the arc isn't misaligned within the + * space; but it's enough to catch the likely problem of an arc being + * misassigned to a different parent state. + */ + if (a >= a->from->inita && a < a->from->inita + NINITARCS) + found = true; /* member of initial arcset */ + else { - for (aa = &ab->a[0]; aa < &ab->a[ABSIZE]; aa++) - if (aa == a) + struct arcbatch *ab; + + for (ab = a->from->lastab; ab != NULL; ab = ab->next) + { + if (a >= ab->a && a < ab->a + ab->narcs) + { + found = true; break; /* NOTE BREAK OUT */ - if (aa < &ab->a[ABSIZE]) /* propagate break */ - break; /* NOTE BREAK OUT */ + } + } } - if (ab == NULL) + if (!found) fprintf(f, "?!?"); /* not in allocated space */ fprintf(f, "->"); if (a->to == NULL) diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index 306525eb5f..65e6ace9d1 100644 --- a/src/include/regex/regguts.h +++ b/src/include/regex/regguts.h @@ -294,12 +294,23 @@ struct arc struct arc *colorchainRev; /* back-link in color's arc chain */ }; +/* + * We include NINITARCS arcs directly in the parent state, to save a separate + * malloc for states with few out-arcs. If more arcs are needed, the first + * arcbatch has size FIRSTABSIZE; each succeeding batch doubles in size, + * up to MAXABSIZE. + */ +#define NINITARCS 2 /* 2 is enough for about 80% of states */ +#define FIRSTABSIZE 8 /* 2+8 is enough for 95% of states */ +#define MAXABSIZE 64 /* don't waste too much space per arcbatch */ + struct arcbatch { /* for bulk allocation of arcs */ - struct arcbatch *next; -#define ABSIZE 10 - struct arc a[ABSIZE]; + struct arcbatch *next; /* chain link */ + size_t narcs; /* number of arcs allocated in this arcbatch */ + struct arc a[FLEXIBLE_ARRAY_MEMBER]; }; +#define ARCBATCHSIZE(n) ((n) * sizeof(struct arc) + offsetof(struct arcbatch, a)) struct state { @@ -312,10 +323,10 @@ struct state struct arc *outs; /* chain of outarcs */ struct arc *free; /* chain of free arcs */ struct state *tmp; /* temporary for traversal algorithms */ - struct state *next; /* chain for traversing all */ - struct state *prev; /* back chain */ - struct arcbatch oas; /* first arcbatch, avoid malloc in easy case */ - int noas; /* number of arcs used in first arcbatch */ + struct state *next; /* chain for traversing all live states */ + struct state *prev; /* back-link in chain of all live states */ + struct arcbatch *lastab; /* chain of associated arcbatches */ + struct arc inita[NINITARCS]; /* a few arcs to avoid extra malloc */ }; struct nfa @@ -413,7 +424,7 @@ struct cnfa */ #ifndef REG_MAX_COMPILE_SPACE #define REG_MAX_COMPILE_SPACE \ - (100000 * sizeof(struct state) + 100000 * sizeof(struct arcbatch)) + (100000 * sizeof(struct state) + 2000000 * sizeof(struct arc)) #endif /*