On Thu, 2025-04-17 at 06:58 -0700, Noah Misch wrote: > Should initcap_wbnext() pass in a locale-dependent "bool posix" > argument like > the others calls the commit changed?
Yes, I believe you are correct. Patch and tests attached. > Long-term, pg_u_isword() should have a "bool posix" argument. > Currently, only > tests call that function. If it got a non-test caller, > https://www.unicode.org/reports/tr18/#word would have pg_u_isword() > follow the > choice of posix compatibility like pg_u_isalnum() does. I based those functions on: https://www.unicode.org/reports/tr18/#Compatibility_Properties and the "word" class does not have a POSIX variant. But Postgres has two documented definitions for "word" characters: * for regexes, alnum + "_" * for INITCAP(), just alnum and the above definition doesn't match up with either one, which is why we don't use it. ICU INITCAP() uses the ICU definition of word boundaries, so doesn't match our documentation. We could adjust our documentation to allow for provider-dependent definitions of word characters, which might be a good idea. But that still doesn't quite capture ICU's more complex definition of word boundaries. Or, we could remove those unused functions for now, and figure out if there's a reason to add them back later. They are probably adding more confusion than anything. Regards, Jeff Davis
From ff3f5cc9e7a725e2c1d324c77dd534d77ea085a4 Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Sat, 19 Apr 2025 11:47:44 -0700 Subject: [PATCH v1] Fix INITCAP() word boundaries for PG_UNICODE_FAST. Word boundaries are based on whether a character is alphanumeric or not. For the PG_UNICODE_FAST collation, alphanumeric includes non-ASCII digits; whereas for the PG_C_UTF8 collation, it only includes digits 0-9. Pass down the right information from the pg_locale_t into initcap_wbnext to differentiate the behavior. Reported-by: Noah Misch <n...@leadboat.com> Discussion: https://postgr.es/m/20250417135841.33.nmi...@google.com --- src/backend/utils/adt/pg_locale_builtin.c | 4 +++- src/common/unicode/case_test.c | 13 ++++++++++++- src/test/regress/expected/collate.utf8.out | 8 ++++++-- src/test/regress/sql/collate.utf8.sql | 2 ++ 4 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c index 125b10ff7ab..f51768830cd 100644 --- a/src/backend/utils/adt/pg_locale_builtin.c +++ b/src/backend/utils/adt/pg_locale_builtin.c @@ -40,6 +40,7 @@ struct WordBoundaryState const char *str; size_t len; size_t offset; + bool posix; bool init; bool prev_alnum; }; @@ -58,7 +59,7 @@ initcap_wbnext(void *state) { pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str + wbstate->offset); - bool curr_alnum = pg_u_isalnum(u, true); + bool curr_alnum = pg_u_isalnum(u, wbstate->posix); if (!wbstate->init || curr_alnum != wbstate->prev_alnum) { @@ -92,6 +93,7 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, .str = src, .len = srclen, .offset = 0, + .posix = !locale->info.builtin.casemap_full, .init = false, .prev_alnum = false, }; diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c index f0b38b3bdd7..fdfb62e8552 100644 --- a/src/common/unicode/case_test.c +++ b/src/common/unicode/case_test.c @@ -41,6 +41,7 @@ struct WordBoundaryState const char *str; size_t len; size_t offset; + bool posix; bool init; bool prev_alnum; }; @@ -55,7 +56,7 @@ initcap_wbnext(void *state) { pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str + wbstate->offset); - bool curr_alnum = pg_u_isalnum(u, true); + bool curr_alnum = pg_u_isalnum(u, wbstate->posix); if (!wbstate->init || curr_alnum != wbstate->prev_alnum) { @@ -112,10 +113,13 @@ icu_test_full(char *str) char icu_upper[BUFSZ]; char icu_fold[BUFSZ]; UErrorCode status; + + /* full case mapping doesn't use posix semantics */ struct WordBoundaryState wbstate = { .str = str, .len = strlen(str), .offset = 0, + .posix = false, .init = false, .prev_alnum = false, }; @@ -344,6 +348,12 @@ test_convert_case() test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'"); test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς"); test_convert(tfunc_fold, "σςΣ ΣΣΣ", "σσσ σσσ"); + /* test that alphanumerics are word characters */ + test_convert(tfunc_title, "λλ", "Λλ"); + test_convert(tfunc_title, "1a", "1a"); + /* U+FF11 FULLWIDTH ONE is alphanumeric for full case mapping */ + test_convert(tfunc_title, "\uFF11a", "\uFF11a"); + #ifdef USE_ICU icu_test_full(""); @@ -354,6 +364,7 @@ test_convert_case() icu_test_full("abc 123xyz"); icu_test_full("σςΣ ΣΣΣ"); icu_test_full("ıiIİ"); + icu_test_full("\uFF11a"); /* test <alpha><iota_subscript><acute> */ icu_test_full("\u0391\u0345\u0301"); #endif diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out index 5508622b16d..0c3ab5c89b2 100644 --- a/src/test/regress/expected/collate.utf8.out +++ b/src/test/regress/expected/collate.utf8.out @@ -52,6 +52,7 @@ INSERT INTO test_pg_c_utf8 VALUES ('abc DEF 123abc'), ('ábc sßs ßss DÉF'), ('DŽxxDŽ džxxDž Džxxdž'), + (U&'Λλ 1a \FF11a'), ('ȺȺȺ'), ('ⱥⱥⱥ'), ('ⱥȺ'); @@ -67,10 +68,11 @@ SELECT abc DEF 123abc | abc def 123abc | Abc Def 123abc | ABC DEF 123ABC | 14 | 14 | 14 | 14 ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs ßss Déf | ÁBC SßS ßSS DÉF | 19 | 19 | 19 | 19 DŽxxDŽ džxxDž Džxxdž | džxxdž džxxdž džxxdž | DŽxxdž DŽxxdž DŽxxdž | DŽXXDŽ DŽXXDŽ DŽXXDŽ | 20 | 20 | 20 | 20 + Λλ 1a 1a | λλ 1a 1a | Λλ 1a 1A | ΛΛ 1A 1A | 12 | 12 | 12 | 12 ȺȺȺ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 6 | 9 | 8 | 6 ⱥⱥⱥ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 9 | 9 | 8 | 6 ⱥȺ | ⱥⱥ | Ⱥⱥ | ȺȺ | 5 | 6 | 5 | 4 -(6 rows) +(7 rows) DROP TABLE test_pg_c_utf8; -- negative test: Final_Sigma not used for builtin locale C.UTF-8 @@ -182,6 +184,7 @@ INSERT INTO test_pg_unicode_fast VALUES ('abc DEF 123abc'), ('ábc sßs ßss DÉF'), ('DŽxxDŽ džxxDž Džxxdž'), + (U&'Λλ 1a \FF11a'), ('ȺȺȺ'), ('ⱥⱥⱥ'), ('ⱥȺ'); @@ -197,10 +200,11 @@ SELECT abc DEF 123abc | abc def 123abc | Abc Def 123abc | ABC DEF 123ABC | 14 | 14 | 14 | 14 ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs Ssss Déf | ÁBC SSSS SSSS DÉF | 19 | 19 | 19 | 19 DŽxxDŽ džxxDž Džxxdž | džxxdž džxxdž džxxdž | Džxxdž Džxxdž Džxxdž | DŽXXDŽ DŽXXDŽ DŽXXDŽ | 20 | 20 | 20 | 20 + Λλ 1a 1a | λλ 1a 1a | Λλ 1a 1a | ΛΛ 1A 1A | 12 | 12 | 12 | 12 ȺȺȺ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 6 | 9 | 8 | 6 ⱥⱥⱥ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 9 | 9 | 8 | 6 ⱥȺ | ⱥⱥ | Ⱥⱥ | ȺȺ | 5 | 6 | 5 | 4 -(6 rows) +(7 rows) DROP TABLE test_pg_unicode_fast; -- test Final_Sigma diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql index 6c7c7aec9ec..d6d14220ab3 100644 --- a/src/test/regress/sql/collate.utf8.sql +++ b/src/test/regress/sql/collate.utf8.sql @@ -45,6 +45,7 @@ INSERT INTO test_pg_c_utf8 VALUES ('abc DEF 123abc'), ('ábc sßs ßss DÉF'), ('DŽxxDŽ džxxDž Džxxdž'), + (U&'Λλ 1a \FF11a'), ('ȺȺȺ'), ('ⱥⱥⱥ'), ('ⱥȺ'); @@ -100,6 +101,7 @@ INSERT INTO test_pg_unicode_fast VALUES ('abc DEF 123abc'), ('ábc sßs ßss DÉF'), ('DŽxxDŽ džxxDž Džxxdž'), + (U&'Λλ 1a \FF11a'), ('ȺȺȺ'), ('ⱥⱥⱥ'), ('ⱥȺ'); -- 2.34.1