On Fri, 2025-01-17 at 16:34 -0800, Jeff Davis wrote: > v5 attached. v6 attached. I plan to commit this soon.
A couple of things to note: * The ICU API for lower/title/uppercasing is slightly different from folding. The former accept a locale, while the latter just has an option which is relevant only to languages 'az' and 'tr'. So the patch checks for those two languages to enable the option, so that ICU is consistently locale-aware for all the functions. I also added ICU tests. * I'm leaving out the normalization, which is not required for Unicode Default Caseless Matching, as mentioned in the last email. That simplifies the SQL function as well as the implementation. There was some discussion on normalization upthread. Regards, Jeff Davis
From b5e25e3b1b075c5b79682e3c812df973d9d22046 Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Wed, 11 Dec 2024 23:46:43 -0800 Subject: [PATCH v6] Add SQL function CASEFOLD(). --- CATVERSION --- Useful for case-insensitive string comparison. Avoids some of the edge-case problems with using LOWER() for that purpose. Discussion: https://postgr.es/m/a1886ddfcd8f60cb3e905c93009b646b4cfb74c5.camel%40j-davis.com Reviewed-by: Ian Lawrence Barwick --- doc/src/sgml/func.sgml | 44 +++++++++++- src/backend/utils/adt/formatting.c | 69 +++++++++++++++++++ src/backend/utils/adt/oracle_compat.c | 16 +++++ src/backend/utils/adt/pg_locale.c | 24 +++++++ src/backend/utils/adt/pg_locale_builtin.c | 10 +++ src/backend/utils/adt/pg_locale_icu.c | 58 ++++++++++++++++ src/include/catalog/catversion.h | 2 +- src/include/catalog/pg_proc.dat | 3 + src/include/utils/formatting.h | 1 + src/include/utils/pg_locale.h | 3 + .../regress/expected/collate.icu.utf8.out | 12 ++++ src/test/regress/expected/collate.utf8.out | 14 ++++ src/test/regress/sql/collate.icu.utf8.sql | 3 + src/test/regress/sql/collate.utf8.sql | 6 ++ 14 files changed, 262 insertions(+), 3 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 5678e7621a5..acc14934721 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -2596,7 +2596,7 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in <row> <entry role="func_table_entry"><para role="func_signature"> - <indexterm> + <indexterm id="function-lower"> <primary>lower</primary> </indexterm> <function>lower</function> ( <type>text</type> ) @@ -2657,7 +2657,7 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in <row> <entry role="func_table_entry"><para role="func_signature"> - <indexterm> + <indexterm id="function-normalize"> <primary>normalize</primary> </indexterm> <indexterm> @@ -3109,6 +3109,46 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least 
one null in </para></entry> </row> + <row> + <entry role="func_table_entry"><para role="func_signature"> + <indexterm> + <primary>casefold</primary> + </indexterm> + <function>casefold</function> ( <type>text</type> ) + <returnvalue>text</returnvalue> + </para> + <para> + Performs case folding of the input string according to the collation. + Case folding is similar to case conversion, but the purpose of case + folding is to facilitate case-insensitive comparison of strings, + whereas the purpose of case conversion is to convert to a particular + cased form. This function can only be used when the server encoding + is <literal>UTF8</literal>. + </para> + <para> + Ordinarily, case folding simply converts to lowercase, but there are a + few notable exceptions. For instance, the character + <literal>Σ</literal> (U+03A3) has two lowercase forms: + <literal>σ</literal> (U+03C3) and <literal>ς</literal> (U+03C2); case + folding in the <literal>PG_C_UTF8</literal> collation maps all three + forms to <literal>σ</literal>. + </para> + <para> + Another benefit of case folding is that the results don't change for + existing characters in new versions of Unicode. + </para> + <para> + <function>casefold</function> can be used for Unicode Default Caseless + Matching. It does not always preserve the normalized form of the + input string (see <xref linkend="function-normalize"/>). + </para> + <para> + The <literal>libc</literal> provider doesn't support case folding, so + <function>casefold</function> is identical to <xref + linkend="function-lower"/>. 
+ </para></entry> + </row> + <row> <entry role="func_table_entry"><para role="func_signature"> <indexterm> diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 7c4c4aa07d5..2720d3902ab 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -1819,6 +1819,75 @@ str_initcap(const char *buff, size_t nbytes, Oid collid) return result; } +/* + * collation-aware, wide-character-aware case folding + * + * We pass the number of bytes so we can pass varlena and char* + * to this function. The result is a palloc'd, null-terminated string. + */ +char * +str_casefold(const char *buff, size_t nbytes, Oid collid) +{ + char *result; + pg_locale_t mylocale; + + if (!buff) + return NULL; + + if (!OidIsValid(collid)) + { + /* + * This typically means that the parser could not resolve a conflict + * of implicit collations, so report it that way. + */ + ereport(ERROR, + (errcode(ERRCODE_INDETERMINATE_COLLATION), + errmsg("could not determine which collation to use for %s function", + "casefold()"), + errhint("Use the COLLATE clause to set the collation explicitly."))); + } + + if (GetDatabaseEncoding() != PG_UTF8) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Unicode case folding can only be performed if server encoding is UTF8"))); + + mylocale = pg_newlocale_from_collation(collid); + + /* C/POSIX collations use this path regardless of database encoding */ + if (mylocale->ctype_is_c) + { + result = asc_tolower(buff, nbytes); + } + else + { + const char *src = buff; + size_t srclen = nbytes; + size_t dstsize; + char *dst; + size_t needed; + + /* first try buffer of equal size plus terminating NUL */ + dstsize = srclen + 1; + dst = palloc(dstsize); + + needed = pg_strfold(dst, dstsize, src, srclen, mylocale); + if (needed + 1 > dstsize) + { + /* grow buffer if needed and retry */ + dstsize = needed + 1; + dst = repalloc(dst, dstsize); + needed = pg_strfold(dst, dstsize, src, srclen, mylocale); + 
Assert(needed + 1 <= dstsize); + } + + Assert(dst[needed] == '\0'); + result = dst; + } + + return result; +} + /* * ASCII-only lower function * diff --git a/src/backend/utils/adt/oracle_compat.c b/src/backend/utils/adt/oracle_compat.c index 2cba7cd1621..a24a2d208fb 100644 --- a/src/backend/utils/adt/oracle_compat.c +++ b/src/backend/utils/adt/oracle_compat.c @@ -126,6 +126,22 @@ initcap(PG_FUNCTION_ARGS) PG_RETURN_TEXT_P(result); } +Datum +casefold(PG_FUNCTION_ARGS) +{ + text *in_string = PG_GETARG_TEXT_PP(0); + char *out_string; + text *result; + + out_string = str_casefold(VARDATA_ANY(in_string), + VARSIZE_ANY_EXHDR(in_string), + PG_GET_COLLATION()); + result = cstring_to_text(out_string); + pfree(out_string); + + PG_RETURN_TEXT_P(result); +} + /******************************************************************** * diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 94444acd2c5..7d92f580a57 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -106,6 +106,8 @@ extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); +extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); extern size_t strlower_icu(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); @@ -113,6 +115,8 @@ extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); extern size_t strupper_icu(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); +extern size_t strfold_icu(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); extern size_t strlower_libc(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); @@ -1447,6 +1451,26 @@ pg_strupper(char *dst, 
size_t dstsize, const char *src, ssize_t srclen, return 0; /* keep compiler quiet */ } +size_t +pg_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen, + pg_locale_t locale) +{ + if (locale->provider == COLLPROVIDER_BUILTIN) + return strfold_builtin(dst, dstsize, src, srclen, locale); +#ifdef USE_ICU + else if (locale->provider == COLLPROVIDER_ICU) + return strfold_icu(dst, dstsize, src, srclen, locale); +#endif + /* for libc, just use strlower */ + else if (locale->provider == COLLPROVIDER_LIBC) + return strlower_libc(dst, dstsize, src, srclen, locale); + else + /* shouldn't happen */ + PGLOCALE_SUPPORT_ERROR(locale->provider); + + return 0; /* keep compiler quiet */ +} + /* * pg_strcoll * diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c index 436e32c0ca0..33ad20bbf07 100644 --- a/src/backend/utils/adt/pg_locale_builtin.c +++ b/src/backend/utils/adt/pg_locale_builtin.c @@ -31,6 +31,8 @@ extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); +extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); struct WordBoundaryState @@ -107,6 +109,14 @@ strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, locale->info.builtin.casemap_full); } +size_t +strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, + pg_locale_t locale) +{ + return unicode_strfold(dest, destsize, src, srclen, + locale->info.builtin.casemap_full); +} + pg_locale_t create_pg_locale_builtin(Oid collid, MemoryContext context) { diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c index 5185b0f7289..b0c73f2e43d 100644 --- a/src/backend/utils/adt/pg_locale_icu.c +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -54,6 +54,8 @@ extern size_t 
strtitle_icu(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); extern size_t strupper_icu(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); +extern size_t strfold_icu(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); #ifdef USE_ICU @@ -117,6 +119,10 @@ static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, const char *locale, UErrorCode *pErrorCode); +static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + const char *locale, + UErrorCode *pErrorCode); static const struct collate_methods collate_methods_icu = { .strncoll = strncoll_icu, @@ -439,6 +445,26 @@ strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, return result_len; } +size_t +strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, + pg_locale_t locale) +{ + int32_t len_uchar; + int32_t len_conv; + UChar *buff_uchar; + UChar *buff_conv; + size_t result_len; + + len_uchar = icu_to_uchar(&buff_uchar, src, srclen); + len_conv = icu_convert_case(u_strFoldCase_default, locale, + &buff_conv, buff_uchar, len_uchar); + result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv); + pfree(buff_uchar); + pfree(buff_conv); + + return result_len; +} + /* * strncoll_icu_utf8 * @@ -673,6 +699,38 @@ u_strToTitle_default_BI(UChar *dest, int32_t destCapacity, NULL, locale, pErrorCode); } +static int32_t +u_strFoldCase_default(UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + const char *locale, + UErrorCode *pErrorCode) +{ + uint32 options = U_FOLD_CASE_DEFAULT; + char lang[3]; + UErrorCode status; + + /* + * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case + * folding does not accept a locale. 
Instead it just supports a single + * option relevant to Turkic languages 'az' and 'tr'; check for those + * languages to enable the option. + */ + status = U_ZERO_ERROR; + uloc_getLanguage(locale, lang, 3, &status); + if (U_SUCCESS(status)) + { + /* + * The option name is confusing, but it causes u_strFoldCase to use + * the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT. + */ + if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0) + options = U_FOLD_CASE_EXCLUDE_SPECIAL_I; + } + + return u_strFoldCase(dest, destCapacity, src, srcLength, + options, pErrorCode); +} + /* * strncoll_icu * diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 586b83f2f4d..e3a308024de 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202501231 +#define CATALOG_VERSION_NO 202501232 #endif diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 18560755d26..2aafdbc3e93 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -3623,6 +3623,9 @@ { oid => '872', descr => 'capitalize each word', proname => 'initcap', prorettype => 'text', proargtypes => 'text', prosrc => 'initcap' }, +{ oid => '9569', descr => 'fold case', + proname => 'casefold', prorettype => 'text', proargtypes => 'text', + prosrc => 'casefold' }, { oid => '873', descr => 'left-pad string to length', proname => 'lpad', prorettype => 'text', proargtypes => 'text int4 text', prosrc => 'lpad' }, diff --git a/src/include/utils/formatting.h b/src/include/utils/formatting.h index 5fa49539aaa..835307dac09 100644 --- a/src/include/utils/formatting.h +++ b/src/include/utils/formatting.h @@ -21,6 +21,7 @@ extern char *str_tolower(const char *buff, size_t nbytes, Oid collid); extern char *str_toupper(const char *buff, size_t nbytes, Oid collid); extern char *str_initcap(const char *buff, size_t nbytes, Oid collid); +extern char 
*str_casefold(const char *buff, size_t nbytes, Oid collid); extern char *asc_tolower(const char *buff, size_t nbytes); extern char *asc_toupper(const char *buff, size_t nbytes); diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 2bc3a7df2d9..0d5f0513ceb 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -134,6 +134,9 @@ extern size_t pg_strtitle(char *dest, size_t destsize, extern size_t pg_strupper(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale); +extern size_t pg_strfold(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); extern int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale); extern int pg_strncoll(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale); diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out index d4f327636fd..d16b46feca7 100644 --- a/src/test/regress/expected/collate.icu.utf8.out +++ b/src/test/regress/expected/collate.icu.utf8.out @@ -255,6 +255,18 @@ SELECT a, x, y FROM collate_test10 ORDER BY lower(y), a; 1 | hij | hij (2 rows) +SELECT casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "en-x-icu"); + casefold +--------------------------------- + abcd 123 #$% ıiii̇ ss ss dždždž σσσ +(1 row) + +SELECT casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "tr-x-icu"); + casefold +--------------------------------- + abcd 123 #$% ıiıi ss ss dždždž σσσ +(1 row) + -- LIKE/ILIKE SELECT * FROM collate_test1 WHERE b LIKE 'abc'; a | b diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out index 8b7176a2756..5508622b16d 100644 --- a/src/test/regress/expected/collate.utf8.out +++ b/src/test/regress/expected/collate.utf8.out @@ -160,6 +160,13 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed t (1 row) +-- case folding +select casefold('AbCd 123 #$% 
ıiIİ ẞ ß DŽDždž Σσς' collate PG_C_UTF8); + casefold +------------------------------- + abcd 123 #$% ıiiİ ß ß dždždž σσσ +(1 row) + -- -- Test PG_UNICODE_FAST -- @@ -320,3 +327,10 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases re t (1 row) +-- case folding +select casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' collate PG_UNICODE_FAST); + casefold +--------------------------------- + abcd 123 #$% ıiii̇ ss ss dždždž σσσ +(1 row) + diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql index 5ee2da4e0e0..5495c2a1fe1 100644 --- a/src/test/regress/sql/collate.icu.utf8.sql +++ b/src/test/regress/sql/collate.icu.utf8.sql @@ -116,6 +116,9 @@ SELECT a, lower(x COLLATE "C"), lower(y COLLATE "C") FROM collate_test10; SELECT a, x, y FROM collate_test10 ORDER BY lower(y), a; +SELECT casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "en-x-icu"); +SELECT casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "tr-x-icu"); + -- LIKE/ILIKE SELECT * FROM collate_test1 WHERE b LIKE 'abc'; diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql index 46e9c5232ad..6c7c7aec9ec 100644 --- a/src/test/regress/sql/collate.utf8.sql +++ b/src/test/regress/sql/collate.utf8.sql @@ -81,6 +81,9 @@ SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8; SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8; SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed +-- case folding +select casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' collate PG_C_UTF8); + -- -- Test PG_UNICODE_FAST -- @@ -140,3 +143,6 @@ SELECT 'xAb' ~* '[W-Y]' COLLATE PG_UNICODE_FAST; SELECT 'xAb' !~* '[c-d]' COLLATE PG_UNICODE_FAST; SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_UNICODE_FAST; SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases reversed + +-- case folding +select casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' collate PG_UNICODE_FAST); -- 2.34.1