Re: Another regexp performance improvement: skip useless paren-captures

Tom Lane Mon, 09 Aug 2021 16:32:05 -0700

Mark Dilger <mark.dil...@enterprisedb.com> writes:
> +select regexp_split_to_array('', '(?:((?:q+))){0}(\1){0,0}?*[^]');
> +server closed the connection unexpectedly


Here's a quick draft patch for this.  Basically it moves the
responsibility for clearing v->subs[] pointers into the freesubre()
recursion, so that it will happen for contained capturing parens
not only the top level.

There is a potentially interesting definitional question:
what exactly ought this regexp do?

                ((.)){0}\2

Because the capturing paren sets are zero-quantified, they will
never be matched to any characters, so the backref can never
have any defined referent.  I suspect that study of the POSIX
spec would lead to the conclusion that this is a legal regexp
but it will never match anything.  Implementing that would be
tedious though, and what's more it seems very unlikely that
the user wanted any such behavior.  So I think throwing an
error is an appropriate response.  The existing code will
throw such an error for

                ((.)){0}\1

so I guess Spencer did think about this to some extent -- he
just forgot about the possibility of nested parens.

This patch should work OK in HEAD and v14, but it will need
a bit of fooling-about for older branches I think, given that
they fill v->subs[] a little differently.

                        regards, tom lane

diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index ae3a7b6a38..ae1ff670f9 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -550,8 +550,6 @@ freev(struct vars *v,
 {
 	if (v->re != NULL)
 		rfree(v->re);
-	if (v->subs != v->sub10)
-		FREE(v->subs);
 	if (v->nfa != NULL)
 		freenfa(v->nfa);
 	if (v->tree != NULL)
@@ -562,6 +560,8 @@ freev(struct vars *v,
 		freecvec(v->cv);
 	if (v->cv2 != NULL)
 		freecvec(v->cv2);
+	if (v->subs != v->sub10)
+		FREE(v->subs);
 	if (v->lacons != NULL)
 		freelacons(v->lacons, v->nlacons);
 	ERR(err);					/* nop if err==0 */
@@ -1091,8 +1091,8 @@ parseqatom(struct vars *v,
 	{
 		if (atom != NULL)
 			freesubre(v, atom);
-		if (atomtype == '(')
-			v->subs[subno] = NULL;
+		/* freesubre() will clean up any v->subs[] pointers into the atom */
+		assert(atomtype != '(' || v->subs[subno] == NULL);
 		delsub(v->nfa, lp, rp);
 		EMPTYARC(lp, rp);
 		return top;
@@ -2127,9 +2127,24 @@ freesrnode(struct vars *v,		/* might be NULL */
 	if (sr == NULL)
 		return;
 
+	/*
+	 * If the node is referenced in v->subs[], clear that to avoid leaving a
+	 * dangling pointer.  This should only ever matter when parseqatom() is
+	 * throwing away a {0,0}-quantified atom that contains capturing parens.
+	 * Doing this will cause later backrefs to such parens to fail, which is
+	 * maybe not strictly POSIX but seems more helpful than allowing a backref
+	 * that can never have a defined referent.
+	 */
+	if (sr->capno > 0 && v != NULL)
+	{
+		assert(v->subs[sr->capno] == sr);
+		v->subs[sr->capno] = NULL;
+	}
+
 	if (!NULLCNFA(sr->cnfa))
 		freecnfa(&sr->cnfa);
 	sr->flags = 0;				/* in particular, not INUSE */
+	sr->capno = 0;
 	sr->child = sr->sibling = NULL;
 	sr->begin = sr->end = NULL;
 
diff --git a/src/test/modules/test_regex/expected/test_regex.out b/src/test/modules/test_regex/expected/test_regex.out
index 5a6cdf47c2..850e0c8ef5 100644
--- a/src/test/modules/test_regex/expected/test_regex.out
+++ b/src/test/modules/test_regex/expected/test_regex.out
@@ -3506,6 +3506,10 @@ select * from test_regex('((.))(\2)', 'xyy', 'oRP');
  {yy,NULL,NULL,NULL}
 (2 rows)
 
+-- # throwing an error for this is arguably not per POSIX
+-- expectError	21.39 -		{((.)){0}(\2){0}}	ESUBREG
+select * from test_regex('((.)){0}(\2){0}', '', '-');
+ERROR:  invalid regular expression: invalid backreference number
 -- doing 22 "multicharacter collating elements"
 -- # again ugh
 -- MCCEs are not implemented in Postgres, so we skip all these tests
diff --git a/src/test/modules/test_regex/sql/test_regex.sql b/src/test/modules/test_regex/sql/test_regex.sql
index 3419564203..378f22b67b 100644
--- a/src/test/modules/test_regex/sql/test_regex.sql
+++ b/src/test/modules/test_regex/sql/test_regex.sql
@@ -1020,6 +1020,9 @@ select * from test_regex('((.))(\2){0}', 'xy', 'RPQ');
 select * from test_regex('((.))(\2)', 'xyy', 'RP');
 -- expectMatch	21.38 oRP	((.))(\2)	xyy	yy	{}	{}	{}
 select * from test_regex('((.))(\2)', 'xyy', 'oRP');
+-- # throwing an error for this is arguably not per POSIX
+-- expectError	21.39 -		{((.)){0}(\2){0}}	ESUBREG
+select * from test_regex('((.)){0}(\2){0}', '', '-');
 
 -- doing 22 "multicharacter collating elements"
 -- # again ugh

Re: Another regexp performance improvement: skip useless paren-captures

Reply via email to