Fix quadratic performance of regexp match/split functions

Andrew Gierth Sun, 12 Aug 2018 20:33:01 -0700

While poking at that xml issue I tried to compare the memory usage of
the xml queries with vaguely comparable regexp_split_to_table queries,
and found that I could not do so; the regexp queries simply did not
complete in any sensible timeframe.


Investigation showed that in UTF8 (or other multibyte encodings), the
performance of regexp_matches and regexp_split_to_* is subject to
quadratic slowdowns thanks to the use of character offsets and calls to
text_substr, which must scan the string from the start each time to
count characters.

This patch changes that by keeping the wide-char copy of the string
available rather than freeing it, and converting the required substrings
back to the db encoding via a pre-allocated conversion buffer.

This gives noticable speedups on even very small strings (all timings
below are with -O2 --disable-cassert):

select count(*)
  from (select 'aaaaa,aaaaa,aaaaa'::text as content
          from generate_series(1,1000000) i offset 0) s,
       regexp_split_to_table(content, ',') r;

-- ~10% faster with patch: 2.8 s -> 2.5 s

but on strings of even modest size (~1kb) the improvement is vast:

select count(*)
  from (select repeat(repeat('a',10) || ',', 100+(i*0))::text
                 as content -- 1100 bytes, 101 matches
          from generate_series(1,100000) i offset 0) s,
       regexp_split_to_table(content, ',') r;

-- over 8 times faster: 51.8 sec -> 6.3 sec

and it only gets bigger:

select count(*)
  from regexp_split_to_table(repeat('aaa,',10000), ',') r;  -- 40KB

-- 270 times faster: 1628ms -> 6ms

This patch passes regression but I haven't yet tested it on complex
multibyte data (or non-UTF8 encodings but that shouldn't matter).

Comments?

-- 
Andrew (irc:RhodiumToad)

diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index 5025a449fb..8839fb4ce2 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -61,6 +61,9 @@ typedef struct regexp_matches_ctx
 	/* workspace for build_regexp_match_result() */
 	Datum	   *elems;			/* has npatterns elements */
 	bool	   *nulls;			/* has npatterns elements */
+	pg_wchar   *wide_str;		/* wide-char version of original string */
+	char	   *conv_buf;		/* conversion buffer */
+	size_t		conv_bufsiz;	/* size thereof */
 } regexp_matches_ctx;
 
 /*
@@ -111,7 +114,8 @@ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
 					 pg_re_flags *flags,
 					 Oid collation,
 					 bool use_subpatterns,
-					 bool ignore_degenerate);
+					 bool ignore_degenerate,
+					 bool fetching_unmatched);
 static void cleanup_regexp_matches(regexp_matches_ctx *matchctx);
 static ArrayType *build_regexp_match_result(regexp_matches_ctx *matchctx);
 static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
@@ -863,7 +867,7 @@ regexp_match(PG_FUNCTION_ARGS)
 				 errhint("Use the regexp_matches function instead.")));
 
 	matchctx = setup_regexp_matches(orig_str, pattern, &re_flags,
-									PG_GET_COLLATION(), true, false);
+									PG_GET_COLLATION(), true, false, false);
 
 	if (matchctx->nmatches == 0)
 		PG_RETURN_NULL();
@@ -911,7 +915,7 @@ regexp_matches(PG_FUNCTION_ARGS)
 		matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
 										&re_flags,
 										PG_GET_COLLATION(),
-										true, false);
+										true, false, false);
 
 		/* Pre-create workspace that build_regexp_match_result needs */
 		matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
@@ -954,7 +958,7 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)
  * all the matching in one swoop.  The returned regexp_matches_ctx contains
  * the locations of all the substrings matching the pattern.
  *
- * The two bool parameters have only two patterns (one for matching, one for
+ * The three bool parameters have only two patterns (one for matching, one for
  * splitting) but it seems clearer to distinguish the functionality this way
  * than to key it all off one "is_split" flag.
  */
@@ -962,9 +966,11 @@ static regexp_matches_ctx *
 setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
 					 Oid collation,
 					 bool use_subpatterns,
-					 bool ignore_degenerate)
+					 bool ignore_degenerate,
+					 bool fetching_unmatched)
 {
 	regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
+	int			eml = pg_database_encoding_max_length();
 	int			orig_len;
 	pg_wchar   *wide_str;
 	int			wide_len;
@@ -975,6 +981,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
 	int			array_idx;
 	int			prev_match_end;
 	int			start_search;
+	int			maxlen = 0;		/* largest fetch length in characters */
 
 	/* save original string --- we'll extract result substrings from it */
 	matchctx->orig_str = orig_str;
@@ -1024,7 +1031,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
 			 pmatch[0].rm_eo > prev_match_end))
 		{
 			/* enlarge output space if needed */
-			while (array_idx + matchctx->npatterns * 2 > array_len)
+			while (array_idx + matchctx->npatterns * 2 + 1 > array_len)
 			{
 				array_len *= 2;
 				matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
@@ -1038,16 +1045,29 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
 
 				for (i = 1; i <= matchctx->npatterns; i++)
 				{
-					matchctx->match_locs[array_idx++] = pmatch[i].rm_so;
-					matchctx->match_locs[array_idx++] = pmatch[i].rm_eo;
+					int		so = pmatch[i].rm_so;
+					int		eo = pmatch[i].rm_eo;
+					matchctx->match_locs[array_idx++] = so;
+					matchctx->match_locs[array_idx++] = eo;
+					if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
+						maxlen = (eo - so);
 				}
 			}
 			else
 			{
-				matchctx->match_locs[array_idx++] = pmatch[0].rm_so;
-				matchctx->match_locs[array_idx++] = pmatch[0].rm_eo;
+				int		so = pmatch[0].rm_so;
+				int		eo = pmatch[0].rm_eo;
+				matchctx->match_locs[array_idx++] = so;
+				matchctx->match_locs[array_idx++] = eo;
+				if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
+					maxlen = (eo - so);
 			}
 			matchctx->nmatches++;
+
+			if (fetching_unmatched &&
+				pmatch[0].rm_so >= 0 &&
+				(pmatch[0].rm_so - prev_match_end) > maxlen)
+				maxlen = (pmatch[0].rm_so - prev_match_end);
 		}
 		prev_match_end = pmatch[0].rm_eo;
 
@@ -1068,8 +1088,45 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
 			break;
 	}
 
+	if (fetching_unmatched &&
+		(wide_len - prev_match_end) > maxlen)
+		maxlen = (wide_len - prev_match_end);
+
+	/*
+	 * Keep a note of the end position of the string for the benefit of
+	 * splitting code.
+	 */
+	matchctx->match_locs[array_idx] = wide_len;
+
+	if (eml > 1)
+	{
+		int64		maxsiz = (int64) maxlen * eml;
+		size_t		conv_bufsiz;
+
+		/*
+		 * worst case: assume we need the maximum size (maxlen*eml), but take
+		 * advantage of the fact that the original string length in bytes is an
+		 * upper bound on the byte length of any fetched substring (and we know
+		 * that len+1 is safe to allocate because the varlena header is longer
+		 * than 1 byte).
+		 */
+		if (maxsiz > orig_len)
+			conv_bufsiz = (size_t) orig_len + 1;
+		else
+			conv_bufsiz = (size_t) maxsiz + 1;	/* safe since maxsiz < 2^30 */
+		matchctx->conv_buf = palloc(conv_bufsiz);
+		matchctx->conv_bufsiz = conv_bufsiz;
+		matchctx->wide_str = wide_str;
+	}
+	else
+	{
+		pfree(wide_str);
+		matchctx->wide_str = NULL;
+		matchctx->conv_buf = NULL;
+		matchctx->conv_bufsiz = 0;
+	}
+
 	/* Clean up temp storage */
-	pfree(wide_str);
 	pfree(pmatch);
 
 	return matchctx;
@@ -1082,6 +1139,10 @@ static void
 cleanup_regexp_matches(regexp_matches_ctx *matchctx)
 {
 	pfree(matchctx->orig_str);
+	if (matchctx->wide_str)
+		pfree(matchctx->wide_str);
+	if (matchctx->conv_buf)
+		pfree(matchctx->conv_buf);
 	pfree(matchctx->match_locs);
 	if (matchctx->elems)
 		pfree(matchctx->elems);
@@ -1096,6 +1157,7 @@ cleanup_regexp_matches(regexp_matches_ctx *matchctx)
 static ArrayType *
 build_regexp_match_result(regexp_matches_ctx *matchctx)
 {
+	int			eml = pg_database_encoding_max_length();
 	Datum	   *elems = matchctx->elems;
 	bool	   *nulls = matchctx->nulls;
 	int			dims[1];
@@ -1115,6 +1177,17 @@ build_regexp_match_result(regexp_matches_ctx *matchctx)
 			elems[i] = (Datum) 0;
 			nulls[i] = true;
 		}
+		else if (eml > 1)
+		{
+			size_t 	bufsiz PG_USED_FOR_ASSERTS_ONLY = matchctx->conv_bufsiz;
+			char   *buf = matchctx->conv_buf;
+			int		len = pg_wchar2mb_with_len(matchctx->wide_str + so,
+											   buf,
+											   eo - so);
+			Assert(len < bufsiz);
+			elems[i] = PointerGetDatum(cstring_to_text_with_len(buf, len));
+			nulls[i] = false;
+		}
 		else
 		{
 			elems[i] = DirectFunctionCall3(text_substr,
@@ -1168,7 +1241,7 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
 		splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
 										&re_flags,
 										PG_GET_COLLATION(),
-										false, true);
+										false, true, true);
 
 		MemoryContextSwitchTo(oldcontext);
 		funcctx->user_fctx = (void *) splitctx;
@@ -1224,7 +1297,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
 									PG_GETARG_TEXT_PP(1),
 									&re_flags,
 									PG_GET_COLLATION(),
-									false, true);
+									false, true, true);
 
 	while (splitctx->next_match <= splitctx->nmatches)
 	{
@@ -1261,6 +1334,7 @@ regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
 static Datum
 build_regexp_split_result(regexp_matches_ctx *splitctx)
 {
+	int			eml = pg_database_encoding_max_length();
 	int			startpos;
 	int			endpos;
 
@@ -1271,7 +1345,22 @@ build_regexp_split_result(regexp_matches_ctx *splitctx)
 	if (startpos < 0)
 		elog(ERROR, "invalid match ending position");
 
-	if (splitctx->next_match < splitctx->nmatches)
+	if (eml > 1)
+	{
+		size_t	bufsiz PG_USED_FOR_ASSERTS_ONLY = splitctx->conv_bufsiz;
+		char   *buf = splitctx->conv_buf;
+		int		len;
+
+		endpos = splitctx->match_locs[splitctx->next_match * 2];
+		if (endpos < startpos)
+			elog(ERROR, "invalid match starting position");
+		len = pg_wchar2mb_with_len(splitctx->wide_str + startpos,
+								   buf,
+								   endpos-startpos);
+		Assert(len < bufsiz);
+		return PointerGetDatum(cstring_to_text_with_len(buf, len));
+	}
+	else if (splitctx->next_match < splitctx->nmatches)
 	{
 		endpos = splitctx->match_locs[splitctx->next_match * 2];
 		if (endpos < startpos)

Fix quadratic performance of regexp match/split functions

Reply via email to