While poking at that xml issue I tried to compare the memory usage of the xml queries with vaguely comparable regexp_split_to_table queries, and found that I could not do so; the regexp queries simply did not complete in any sensible timeframe.
Investigation showed that in UTF8 (or other multibyte encodings), the performance of regexp_matches and regexp_split_to_* is subject to quadratic slowdowns thanks to the use of character offsets and calls to text_substr, which must scan the string from the start each time to count characters. This patch changes that by keeping the wide-char copy of the string available rather than freeing it, and converting the required substrings back to the db encoding via a pre-allocated conversion buffer. This gives noticable speedups on even very small strings (all timings below are with -O2 --disable-cassert): select count(*) from (select 'aaaaa,aaaaa,aaaaa'::text as content from generate_series(1,1000000) i offset 0) s, regexp_split_to_table(content, ',') r; -- ~10% faster with patch: 2.8 s -> 2.5 s but on strings of even modest size (~1kb) the improvement is vast: select count(*) from (select repeat(repeat('a',10) || ',', 100+(i*0))::text as content -- 1100 bytes, 101 matches from generate_series(1,100000) i offset 0) s, regexp_split_to_table(content, ',') r; -- over 8 times faster: 51.8 sec -> 6.3 sec and it only gets bigger: select count(*) from regexp_split_to_table(repeat('aaa,',10000), ',') r; -- 40KB -- 270 times faster: 1628ms -> 6ms This patch passes regression but I haven't yet tested it on complex multibyte data (or non-UTF8 encodings but that shouldn't matter). Comments? -- Andrew (irc:RhodiumToad)
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index 5025a449fb..8839fb4ce2 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -61,6 +61,9 @@ typedef struct regexp_matches_ctx /* workspace for build_regexp_match_result() */ Datum *elems; /* has npatterns elements */ bool *nulls; /* has npatterns elements */ + pg_wchar *wide_str; /* wide-char version of original string */ + char *conv_buf; /* conversion buffer */ + size_t conv_bufsiz; /* size thereof */ } regexp_matches_ctx; /* @@ -111,7 +114,8 @@ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *flags, Oid collation, bool use_subpatterns, - bool ignore_degenerate); + bool ignore_degenerate, + bool fetching_unmatched); static void cleanup_regexp_matches(regexp_matches_ctx *matchctx); static ArrayType *build_regexp_match_result(regexp_matches_ctx *matchctx); static Datum build_regexp_split_result(regexp_matches_ctx *splitctx); @@ -863,7 +867,7 @@ regexp_match(PG_FUNCTION_ARGS) errhint("Use the regexp_matches function instead."))); matchctx = setup_regexp_matches(orig_str, pattern, &re_flags, - PG_GET_COLLATION(), true, false); + PG_GET_COLLATION(), true, false, false); if (matchctx->nmatches == 0) PG_RETURN_NULL(); @@ -911,7 +915,7 @@ regexp_matches(PG_FUNCTION_ARGS) matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern, &re_flags, PG_GET_COLLATION(), - true, false); + true, false, false); /* Pre-create workspace that build_regexp_match_result needs */ matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns); @@ -954,7 +958,7 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS) * all the matching in one swoop. The returned regexp_matches_ctx contains * the locations of all the substrings matching the pattern. * - * The two bool parameters have only two patterns (one for matching, one for + * The three bool parameters have only two patterns (one for matching, one for * splitting) but it seems clearer to distinguish the functionality this way * than to key it all off one "is_split" flag. */ @@ -962,9 +966,11 @@ static regexp_matches_ctx * setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, Oid collation, bool use_subpatterns, - bool ignore_degenerate) + bool ignore_degenerate, + bool fetching_unmatched) { regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx)); + int eml = pg_database_encoding_max_length(); int orig_len; pg_wchar *wide_str; int wide_len; @@ -975,6 +981,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, int array_idx; int prev_match_end; int start_search; + int maxlen = 0; /* largest fetch length in characters */ /* save original string --- we'll extract result substrings from it */ matchctx->orig_str = orig_str; @@ -1024,7 +1031,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, pmatch[0].rm_eo > prev_match_end)) { /* enlarge output space if needed */ - while (array_idx + matchctx->npatterns * 2 > array_len) + while (array_idx + matchctx->npatterns * 2 + 1 > array_len) { array_len *= 2; matchctx->match_locs = (int *) repalloc(matchctx->match_locs, @@ -1038,16 +1045,29 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, for (i = 1; i <= matchctx->npatterns; i++) { - matchctx->match_locs[array_idx++] = pmatch[i].rm_so; - matchctx->match_locs[array_idx++] = pmatch[i].rm_eo; + int so = pmatch[i].rm_so; + int eo = pmatch[i].rm_eo; + matchctx->match_locs[array_idx++] = so; + matchctx->match_locs[array_idx++] = eo; + if (so >= 0 && eo >= 0 && (eo - so) > maxlen) + maxlen = (eo - so); } } else { - matchctx->match_locs[array_idx++] = pmatch[0].rm_so; - matchctx->match_locs[array_idx++] = pmatch[0].rm_eo; + int so = pmatch[0].rm_so; + int eo = pmatch[0].rm_eo; + matchctx->match_locs[array_idx++] = so; + matchctx->match_locs[array_idx++] = eo; + if (so >= 0 && eo >= 0 && (eo - so) > maxlen) + maxlen = (eo - so); } matchctx->nmatches++; + + if (fetching_unmatched && + pmatch[0].rm_so >= 0 && + (pmatch[0].rm_so - prev_match_end) > maxlen) + maxlen = (pmatch[0].rm_so - prev_match_end); } prev_match_end = pmatch[0].rm_eo; @@ -1068,8 +1088,45 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, break; } + if (fetching_unmatched && + (wide_len - prev_match_end) > maxlen) + maxlen = (wide_len - prev_match_end); + + /* + * Keep a note of the end position of the string for the benefit of + * splitting code. + */ + matchctx->match_locs[array_idx] = wide_len; + + if (eml > 1) + { + int64 maxsiz = (int64) maxlen * eml; + size_t conv_bufsiz; + + /* + * worst case: assume we need the maximum size (maxlen*eml), but take + * advantage of the fact that the original string length in bytes is an + * upper bound on the byte length of any fetched substring (and we know + * that len+1 is safe to allocate because the varlena header is longer + * than 1 byte). + */ + if (maxsiz > orig_len) + conv_bufsiz = (size_t) orig_len + 1; + else + conv_bufsiz = (size_t) maxsiz + 1; /* safe since maxsiz < 2^30 */ + matchctx->conv_buf = palloc(conv_bufsiz); + matchctx->conv_bufsiz = conv_bufsiz; + matchctx->wide_str = wide_str; + } + else + { + pfree(wide_str); + matchctx->wide_str = NULL; + matchctx->conv_buf = NULL; + matchctx->conv_bufsiz = 0; + } + /* Clean up temp storage */ - pfree(wide_str); pfree(pmatch); return matchctx; @@ -1082,6 +1139,10 @@ static void cleanup_regexp_matches(regexp_matches_ctx *matchctx) { pfree(matchctx->orig_str); + if (matchctx->wide_str) + pfree(matchctx->wide_str); + if (matchctx->conv_buf) + pfree(matchctx->conv_buf); pfree(matchctx->match_locs); if (matchctx->elems) pfree(matchctx->elems); @@ -1096,6 +1157,7 @@ cleanup_regexp_matches(regexp_matches_ctx *matchctx) static ArrayType * build_regexp_match_result(regexp_matches_ctx *matchctx) { + int eml = pg_database_encoding_max_length(); Datum *elems = matchctx->elems; bool *nulls = matchctx->nulls; int dims[1]; @@ -1115,6 +1177,17 @@ build_regexp_match_result(regexp_matches_ctx *matchctx) elems[i] = (Datum) 0; nulls[i] = true; } + else if (eml > 1) + { + size_t bufsiz PG_USED_FOR_ASSERTS_ONLY = matchctx->conv_bufsiz; + char *buf = matchctx->conv_buf; + int len = pg_wchar2mb_with_len(matchctx->wide_str + so, + buf, + eo - so); + Assert(len < bufsiz); + elems[i] = PointerGetDatum(cstring_to_text_with_len(buf, len)); + nulls[i] = false; + } else { elems[i] = DirectFunctionCall3(text_substr, @@ -1168,7 +1241,7 @@ regexp_split_to_table(PG_FUNCTION_ARGS) splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern, &re_flags, PG_GET_COLLATION(), - false, true); + false, true, true); MemoryContextSwitchTo(oldcontext); funcctx->user_fctx = (void *) splitctx; @@ -1224,7 +1297,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS) PG_GETARG_TEXT_PP(1), &re_flags, PG_GET_COLLATION(), - false, true); + false, true, true); while (splitctx->next_match <= splitctx->nmatches) { @@ -1261,6 +1334,7 @@ regexp_split_to_array_no_flags(PG_FUNCTION_ARGS) static Datum build_regexp_split_result(regexp_matches_ctx *splitctx) { + int eml = pg_database_encoding_max_length(); int startpos; int endpos; @@ -1271,7 +1345,22 @@ build_regexp_split_result(regexp_matches_ctx *splitctx) if (startpos < 0) elog(ERROR, "invalid match ending position"); - if (splitctx->next_match < splitctx->nmatches) + if (eml > 1) + { + size_t bufsiz PG_USED_FOR_ASSERTS_ONLY = splitctx->conv_bufsiz; + char *buf = splitctx->conv_buf; + int len; + + endpos = splitctx->match_locs[splitctx->next_match * 2]; + if (endpos < startpos) + elog(ERROR, "invalid match starting position"); + len = pg_wchar2mb_with_len(splitctx->wide_str + startpos, + buf, + endpos-startpos); + Assert(len < bufsiz); + return PointerGetDatum(cstring_to_text_with_len(buf, len)); + } + else if (splitctx->next_match < splitctx->nmatches) { endpos = splitctx->match_locs[splitctx->next_match * 2]; if (endpos < startpos)