On 26.08.24 08:09, Peter Eisentraut wrote:
> This patch allows using text position search functions with
> nondeterministic collations.  These functions are
>
> - position, strpos
> - replace
> - split_part
> - string_to_array
> - string_to_table
>
> which all use common internal infrastructure.
>
> Some exploratory testing could be useful here.  The present test
> coverage was already quite helpful during development, but there is
> always the possibility that something was overlooked.
Here is a freshly rebased patch version, no functionality changes.
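If you want to try the behavior quickly without reading the whole diff, something like the following should work with any nondeterministic collation.  (The CREATE COLLATION locale string below is only an illustration, not taken from the patch; the regression tests create an equivalent "case_insensitive" collation.)  The string_to_array result is from the updated expected output in the patch; the strpos line is just what I would expect it to return, namely 4:

CREATE COLLATION case_insensitive
    (provider = icu, locale = 'und-u-ks-level2', deterministic = false);

-- previously failed with "nondeterministic collations are not supported
-- for substring searches"; with the patch this returns {NULL,DEF,GHI}
SELECT string_to_array('ABC,DEF,GHI' COLLATE case_insensitive, ',', 'abc');

-- 'def' should now match 'DEF' starting at character position 4
SELECT strpos('ABCDEF' COLLATE case_insensitive, 'def');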
From 272befa6ab78d94ac35d24203da415545b3ad2d9 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <pe...@eisentraut.org>
Date: Mon, 2 Dec 2024 10:06:28 +0100
Subject: [PATCH v2] Support POSITION with nondeterministic collations

This allows using text position search functions with nondeterministic
collations.  These functions are

- position, strpos
- replace
- split_part
- string_to_array
- string_to_table

which all use common internal infrastructure.

There was previously no internal implementation of this, so it was met
with a not-supported error.  This adds the internal implementation and
removes the error.

Unlike with deterministic collations, the search cannot use any
byte-by-byte optimized techniques but has to go substring by substring.
We also need to consider that the found match could have a different
length than the needle and that there could be substrings of different
length matching at a position.  In most cases, we need to find the
longest such substring (greedy semantics).

Discussion: https://www.postgresql.org/message-id/flat/582b2613-0900-48ca-8b0d-340c06f4d...@eisentraut.org
---
 src/backend/utils/adt/varlena.c               | 105 ++++++++++--
 .../regress/expected/collate.icu.utf8.out     | 154 +++++++++++++++---
 src/test/regress/sql/collate.icu.utf8.sql     |  36 +++-
 3 files changed, 247 insertions(+), 48 deletions(-)

diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 533bebc1c7b..3fda7a8aeea 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -54,7 +54,9 @@ typedef struct varlena VarString;
  */
 typedef struct
 {
+	pg_locale_t locale;			/* collation used for substring matching */
 	bool		is_multibyte_char_in_char;	/* need to check char boundaries? */
+	bool		greedy;			/* find longest possible substring? */
 
 	char	   *str1;			/* haystack string */
 	char	   *str2;			/* needle string */
@@ -65,7 +67,13 @@ typedef struct
 	int			skiptablemask;	/* mask for ANDing with skiptable subscripts */
 	int			skiptable[256]; /* skip distance for given mismatched char */
 
+	/*
+	 * Note that with nondeterministic collations, the length of the last
+	 * match is not necessarily equal to the length of the "needle" passed in.
+	 */
 	char	   *last_match;		/* pointer to last match in 'str1' */
+	int			last_match_len; /* length of last match */
+	int			last_match_len_tmp; /* same but for internal use */
 
 	/*
 	 * Sometimes we need to convert the byte position of a match to a
@@ -1178,15 +1186,21 @@ text_position(text *t1, text *t2, Oid collid)
 	TextPositionState state;
 	int			result;
 
+	check_collation_set(collid);
+
 	/* Empty needle always matches at position 1 */
 	if (VARSIZE_ANY_EXHDR(t2) < 1)
 		return 1;
 
 	/* Otherwise, can't match if haystack is shorter than needle */
-	if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
+	if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2) &&
+		pg_newlocale_from_collation(collid)->deterministic)
 		return 0;
 
 	text_position_setup(t1, t2, collid, &state);
+	/* don't need greedy mode here */
+	state.greedy = false;
+
 	if (!text_position_next(&state))
 		result = 0;
 	else
@@ -1217,18 +1231,17 @@ text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
 {
 	int			len1 = VARSIZE_ANY_EXHDR(t1);
 	int			len2 = VARSIZE_ANY_EXHDR(t2);
-	pg_locale_t mylocale;
 
 	check_collation_set(collid);
-	mylocale = pg_newlocale_from_collation(collid);
+	state->locale = pg_newlocale_from_collation(collid);
 
-	if (!mylocale->deterministic)
-		ereport(ERROR,
-				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-				 errmsg("nondeterministic collations are not supported for substring searches")));
+	/*
+	 * Most callers need greedy mode, but some might want to unset this to
+	 * optimize.
+	 */
+	state->greedy = true;
 
-	Assert(len1 > 0);
 	Assert(len2 > 0);
 
 	/*
@@ -1264,8 +1277,11 @@ text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
 	 * point in wasting cycles initializing the table.  We also choose not to
 	 * use B-M-H for needles of length 1, since the skip table can't possibly
 	 * save anything in that case.
+	 *
+	 * (With nondeterministic collations, the search was already
+	 * multibyte-aware, so we don't need this.)
 	 */
-	if (len1 >= len2 && len2 > 1)
+	if (len1 >= len2 && len2 > 1 && state->locale->deterministic)
 	{
 		int			searchlength = len1 - len2;
 		int			skiptablemask;
@@ -1343,7 +1359,7 @@ text_position_next(TextPositionState *state)
 
 	/* Start from the point right after the previous match. */
 	if (state->last_match)
-		start_ptr = state->last_match + needle_len;
+		start_ptr = state->last_match + state->last_match_len;
 	else
 		start_ptr = state->str1;
 
@@ -1359,7 +1375,7 @@ text_position_next(TextPositionState *state)
 	 * multi-byte character, we need to verify that the match was at a
 	 * character boundary, not in the middle of a multi-byte character.
 	 */
-	if (state->is_multibyte_char_in_char)
+	if (state->is_multibyte_char_in_char && state->locale->deterministic)
 	{
 		/* Walk one character at a time, until we reach the match. */
 
@@ -1387,6 +1403,7 @@ text_position_next(TextPositionState *state)
 	}
 
 	state->last_match = matchptr;
+	state->last_match_len = state->last_match_len_tmp;
 	return true;
 }
 
@@ -1408,7 +1425,63 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
 
 	Assert(start_ptr >= haystack && start_ptr <= haystack_end);
 
-	if (needle_len == 1)
+	state->last_match_len_tmp = needle_len;
+
+	if (!state->locale->deterministic)
+	{
+		/*
+		 * With a nondeterministic collation, we have to use an unoptimized
+		 * route.  We walk through the haystack and see if at each position
+		 * there is a substring of the remaining string that is equal to the
+		 * needle under the given collation.
+		 *
+		 * Note, the found substring could have a different length than the
+		 * needle, including being empty.  Callers that want to skip over the
+		 * found string need to read the length of the found substring from
+		 * last_match_len rather than just using the length of their needle.
+		 *
+		 * Most callers will require "greedy" semantics, meaning that we need
+		 * to find the longest such substring, not the shortest.  For callers
+		 * that don't need greedy semantics, we can finish on the first
+		 * match.
+		 */
+		const char *result_hptr = NULL;
+
+		hptr = start_ptr;
+		while (hptr < haystack_end)
+		{
+			/*
+			 * First check the common case that there is a match in the
+			 * haystack of exactly the length of the needle.
+			 */
+			if (!state->greedy &&
+				haystack_end - hptr >= needle_len &&
+				pg_strncoll(hptr, needle_len, needle, needle_len, state->locale) == 0)
+				return (char *) hptr;
+
+			/*
+			 * Else check if any of the possible substrings starting at hptr
+			 * are equal to the needle.
+			 */
+			for (const char *test_end = hptr; test_end < haystack_end; test_end += pg_mblen(test_end))
+			{
+				if (pg_strncoll(hptr, (test_end - hptr), needle, needle_len, state->locale) == 0)
+				{
+					state->last_match_len_tmp = (test_end - hptr);
+					result_hptr = hptr;
+					if (!state->greedy)
+						break;
+				}
+			}
+			if (result_hptr)
+				break;
+
+			hptr += pg_mblen(hptr);
+		}
+
+		return (char *) result_hptr;
+	}
+	else if (needle_len == 1)
 	{
 		/* No point in using B-M-H for a one-character needle */
 		char		nchar = *needle;
@@ -4055,7 +4128,7 @@ replace_text(PG_FUNCTION_ARGS)
 
 		appendStringInfoText(&str, to_sub_text);
 
-		start_ptr = curr_ptr + from_sub_text_len;
+		start_ptr = curr_ptr + state.last_match_len;
 
 		found = text_position_next(&state);
 		if (found)
@@ -4445,7 +4518,7 @@ split_part(PG_FUNCTION_ARGS)
 		/* special case of last field does not require an extra pass */
 		if (fldnum == -1)
 		{
-			start_ptr = text_position_get_match_ptr(&state) + fldsep_len;
+			start_ptr = text_position_get_match_ptr(&state) + state.last_match_len;
 			end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
 			text_position_cleanup(&state);
 			PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
@@ -4475,7 +4548,7 @@ split_part(PG_FUNCTION_ARGS)
 	while (found && --fldnum > 0)
 	{
 		/* identify bounds of next field */
-		start_ptr = end_ptr + fldsep_len;
+		start_ptr = end_ptr + state.last_match_len;
 		found = text_position_next(&state);
 		if (found)
 			end_ptr = text_position_get_match_ptr(&state);
@@ -4691,7 +4764,7 @@ split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
 		if (!found)
 			break;
 
-		start_ptr = end_ptr + fldsep_len;
+		start_ptr = end_ptr + state.last_match_len;
 	}
 
 	text_position_cleanup(&state);
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
index 9f5e57428e8..ebd2cc58fee 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -1300,39 +1300,109 @@ SELECT 'abc' LIKE 'a\bc' COLLATE ctest_nondet;
 
 CREATE TABLE test6 (a int, b text);
 -- same string in different normal forms
-INSERT INTO test6 VALUES (1, U&'\00E4bc');
-INSERT INTO test6 VALUES (2, U&'\0061\0308bc');
+INSERT INTO test6 VALUES (1, U&'zy\00E4bc');
+INSERT INTO test6 VALUES (2, U&'zy\0061\0308bc');
 SELECT * FROM test6;
- a |  b  
----+-----
- 1 | äbc
- 2 | äbc
+ a |   b   
+---+-------
+ 1 | zyäbc
+ 2 | zyäbc
 (2 rows)
 
-SELECT * FROM test6 WHERE b = 'äbc' COLLATE ctest_det;
- a |  b  
----+-----
- 1 | äbc
+SELECT * FROM test6 WHERE b = 'zyäbc' COLLATE ctest_det;
+ a |   b   
+---+-------
+ 1 | zyäbc
 (1 row)
 
-SELECT * FROM test6 WHERE b = 'äbc' COLLATE ctest_nondet;
- a |  b  
----+-----
- 1 | äbc
- 2 | äbc
+SELECT * FROM test6 WHERE b = 'zyäbc' COLLATE ctest_nondet;
+ a |   b   
+---+-------
+ 1 | zyäbc
+ 2 | zyäbc
 (2 rows)
 
-SELECT * FROM test6 WHERE b LIKE 'äbc' COLLATE ctest_det;
- a |  b  
----+-----
- 1 | äbc
+SELECT strpos(b COLLATE ctest_det, 'bc') FROM test6;
+ strpos 
+--------
+      4
+      5
+(2 rows)
+
+SELECT strpos(b COLLATE ctest_nondet, 'bc') FROM test6;
+ strpos 
+--------
+      4
+      5
+(2 rows)
+
+SELECT replace(b COLLATE ctest_det, U&'\00E4b', 'X') FROM test6;
+ replace 
+---------
+ zyXc
+ zyäbc
+(2 rows)
+
+SELECT replace(b COLLATE ctest_nondet, U&'\00E4b', 'X') FROM test6;
+ replace 
+---------
+ zyXc
+ zyXc
+(2 rows)
+
+SELECT a, split_part(b COLLATE ctest_det, U&'\00E4b', 2) FROM test6;
+ a | split_part 
+---+------------
+ 1 | c
+ 2 | 
+(2 rows)
+
+SELECT a, split_part(b COLLATE ctest_nondet, U&'\00E4b', 2) FROM test6;
+ a | split_part 
+---+------------
+ 1 | c
+ 2 | c
+(2 rows)
+
+SELECT a, split_part(b COLLATE ctest_det, U&'\00E4b', -1) FROM test6;
+ a | split_part 
+---+------------
+ 1 | c
+ 2 | zyäbc
+(2 rows)
+
+SELECT a, split_part(b COLLATE ctest_nondet, U&'\00E4b', -1) FROM test6;
+ a | split_part 
+---+------------
+ 1 | c
+ 2 | c
+(2 rows)
+
+SELECT a, string_to_array(b COLLATE ctest_det, U&'\00E4b') FROM test6;
+ a | string_to_array 
+---+-----------------
+ 1 | {zy,c}
+ 2 | {zyäbc}
+(2 rows)
+
+SELECT a, string_to_array(b COLLATE ctest_nondet, U&'\00E4b') FROM test6;
+ a | string_to_array 
+---+-----------------
+ 1 | {zy,c}
+ 2 | {zy,c}
+(2 rows)
+
+SELECT * FROM test6 WHERE b LIKE 'zyäbc' COLLATE ctest_det;
+ a |   b   
+---+-------
+ 1 | zyäbc
 (1 row)
 
-SELECT * FROM test6 WHERE b LIKE 'äbc' COLLATE ctest_nondet;
- a |  b  
----+-----
- 1 | äbc
- 2 | äbc
+SELECT * FROM test6 WHERE b LIKE 'zyäbc' COLLATE ctest_nondet;
+ a |   b   
+---+-------
+ 1 | zyäbc
+ 2 | zyäbc
 (2 rows)
 
 -- same with arrays
@@ -1643,7 +1713,11 @@ CREATE UNIQUE INDEX ON test3ci (x); -- error
 ERROR:  could not create unique index "test3ci_x_idx"
 DETAIL:  Key (x)=(abc) is duplicated.
 SELECT string_to_array('ABC,DEF,GHI' COLLATE case_insensitive, ',', 'abc');
-ERROR:  nondeterministic collations are not supported for substring searches
+ string_to_array 
+-----------------
+ {NULL,DEF,GHI}
+(1 row)
+
 SELECT string_to_array('ABCDEFGHI' COLLATE case_insensitive, NULL, 'b');
     string_to_array     
------------------------
@@ -1766,7 +1840,11 @@ CREATE UNIQUE INDEX ON test3bpci (x); -- error
 ERROR:  could not create unique index "test3bpci_x_idx"
 DETAIL:  Key (x)=(abc) is duplicated.
 SELECT string_to_array('ABC,DEF,GHI'::char(11) COLLATE case_insensitive, ',', 'abc');
-ERROR:  nondeterministic collations are not supported for substring searches
+ string_to_array 
+-----------------
+ {NULL,DEF,GHI}
+(1 row)
+
 SELECT string_to_array('ABCDEFGHI'::char(9) COLLATE case_insensitive, NULL, 'b');
     string_to_array     
------------------------
@@ -1895,6 +1973,30 @@ SELECT * FROM test4 WHERE b = 'Cote' COLLATE case_insensitive;
  1 | cote
 (1 row)
 
+CREATE TABLE test4nfd (a int, b text);
+INSERT INTO test4nfd VALUES (1, 'cote'), (2, 'côte'), (3, 'coté'), (4, 'côté');
+UPDATE test4nfd SET b = normalize(b, nfd);
+-- This shows why replace should be greedy.  Otherwise, in the NFD
+-- case, the match would stop before the decomposed accents, which
+-- would leave the accents in the results.
+SELECT a, b, replace(b COLLATE ignore_accents, 'co', 'ma') FROM test4;
+ a |  b   | replace 
+---+------+---------
+ 1 | cote | mate
+ 2 | côte | mate
+ 3 | coté | maté
+ 4 | côté | maté
+(4 rows)
+
+SELECT a, b, replace(b COLLATE ignore_accents, 'co', 'ma') FROM test4nfd;
+ a |  b   | replace 
+---+------+---------
+ 1 | cote | mate
+ 2 | côte | mate
+ 3 | coté | maté
+ 4 | côté | maté
+(4 rows)
+
 -- This is a tricky one.  A naive implementation would first test
 -- \00E4 matches \0061, which is true under ignore_accents, but then
 -- the rest of the string won't match anymore.  Therefore, the
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
index 5ee2da4e0e0..cd0353fd7c6 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -522,14 +522,28 @@ CREATE COLLATION ctest_nondet (provider = icu, locale = '', deterministic = fals
 CREATE TABLE test6 (a int, b text);
 -- same string in different normal forms
-INSERT INTO test6 VALUES (1, U&'\00E4bc');
-INSERT INTO test6 VALUES (2, U&'\0061\0308bc');
+INSERT INTO test6 VALUES (1, U&'zy\00E4bc');
+INSERT INTO test6 VALUES (2, U&'zy\0061\0308bc');
 SELECT * FROM test6;
 
-SELECT * FROM test6 WHERE b = 'äbc' COLLATE ctest_det;
-SELECT * FROM test6 WHERE b = 'äbc' COLLATE ctest_nondet;
+SELECT * FROM test6 WHERE b = 'zyäbc' COLLATE ctest_det;
+SELECT * FROM test6 WHERE b = 'zyäbc' COLLATE ctest_nondet;
 
-SELECT * FROM test6 WHERE b LIKE 'äbc' COLLATE ctest_det;
-SELECT * FROM test6 WHERE b LIKE 'äbc' COLLATE ctest_nondet;
+SELECT strpos(b COLLATE ctest_det, 'bc') FROM test6;
+SELECT strpos(b COLLATE ctest_nondet, 'bc') FROM test6;
+
+SELECT replace(b COLLATE ctest_det, U&'\00E4b', 'X') FROM test6;
+SELECT replace(b COLLATE ctest_nondet, U&'\00E4b', 'X') FROM test6;
+
+SELECT a, split_part(b COLLATE ctest_det, U&'\00E4b', 2) FROM test6;
+SELECT a, split_part(b COLLATE ctest_nondet, U&'\00E4b', 2) FROM test6;
+SELECT a, split_part(b COLLATE ctest_det, U&'\00E4b', -1) FROM test6;
+SELECT a, split_part(b COLLATE ctest_nondet, U&'\00E4b', -1) FROM test6;
+
+SELECT a, string_to_array(b COLLATE ctest_det, U&'\00E4b') FROM test6;
+SELECT a, string_to_array(b COLLATE ctest_nondet, U&'\00E4b') FROM test6;
+
+SELECT * FROM test6 WHERE b LIKE 'zyäbc' COLLATE ctest_det;
+SELECT * FROM test6 WHERE b LIKE 'zyäbc' COLLATE ctest_nondet;
 
 -- same with arrays
 CREATE TABLE test6a (a int, b text[]);
@@ -696,6 +710,16 @@ CREATE TABLE test4 (a int, b text);
 SELECT * FROM test4 WHERE b = 'Cote' COLLATE ignore_accents;  -- still case-sensitive
 SELECT * FROM test4 WHERE b = 'Cote' COLLATE case_insensitive;
 
+CREATE TABLE test4nfd (a int, b text);
+INSERT INTO test4nfd VALUES (1, 'cote'), (2, 'côte'), (3, 'coté'), (4, 'côté');
+UPDATE test4nfd SET b = normalize(b, nfd);
+
+-- This shows why replace should be greedy.  Otherwise, in the NFD
+-- case, the match would stop before the decomposed accents, which
+-- would leave the accents in the results.
+SELECT a, b, replace(b COLLATE ignore_accents, 'co', 'ma') FROM test4;
+SELECT a, b, replace(b COLLATE ignore_accents, 'co', 'ma') FROM test4nfd;
+
 -- This is a tricky one.  A naive implementation would first test
 -- \00E4 matches \0061, which is true under ignore_accents, but then
 -- the rest of the string won't match anymore.  Therefore, the
-- 
2.47.1