Re: Unicode full case mapping: PG_UNICODE_FAST, and standard-compliant UCS_BASIC

Jeff Davis Sat, 19 Apr 2025 12:47:59 -0700

On Thu, 2025-04-17 at 06:58 -0700, Noah Misch wrote:
> Should initcap_wbnext() pass in a locale-dependent "bool posix"
> argument like
> the others calls the commit changed?


Yes, I believe you are correct. Patch and tests attached.

> Long-term, pg_u_isword() should have a "bool posix" argument. 
> Currently, only
> tests call that function.  If it got a non-test caller,
> https://www.unicode.org/reports/tr18/#word would have pg_u_isword()
> follow the
> choice of posix compatibility like pg_u_isalnum() does.

I based those functions on:

https://www.unicode.org/reports/tr18/#Compatibility_Properties

and the "word" class does not have a POSIX variant. But Postgres has
two documented definitions for "word" characters:

 * for regexes, alnum + "_"
 * for INITCAP(), just alnum

and the above definition doesn't match up with either one, which is why
we don't use it.

ICU INITCAP() uses the ICU definition of word boundaries, so doesn't
match our documentation.

We could adjust our documentation to allow for provider-dependent
definitions of word characters, which might be a good idea. But that
still doesn't quite capture ICU's more complex definition of word
boundaries.

Or, we could remove those unused functions for now, and figure out if
there's a reason to add them back later. They are probably adding more
confusion than anything.

Regards,
        Jeff Davis

From ff3f5cc9e7a725e2c1d324c77dd534d77ea085a4 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Sat, 19 Apr 2025 11:47:44 -0700
Subject: [PATCH v1] Fix INITCAP() word boundaries for PG_UNICODE_FAST.

Word boundaries are based on whether a character is alphanumeric or
not. For the PG_UNICODE_FAST collation, alphanumeric includes
non-ASCII digits; whereas for the PG_C_UTF8 collation, it only
includes digits 0-9. Pass down the right information from the
pg_locale_t into initcap_wbnext to differentiate the behavior.

Reported-by: Noah Misch <[email protected]>
Discussion: https://postgr.es/m/[email protected]
---
 src/backend/utils/adt/pg_locale_builtin.c  |  4 +++-
 src/common/unicode/case_test.c             | 13 ++++++++++++-
 src/test/regress/expected/collate.utf8.out |  8 ++++++--
 src/test/regress/sql/collate.utf8.sql      |  2 ++
 4 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 125b10ff7ab..f51768830cd 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -40,6 +40,7 @@ struct WordBoundaryState
 	const char *str;
 	size_t		len;
 	size_t		offset;
+	bool		posix;
 	bool		init;
 	bool		prev_alnum;
 };
@@ -58,7 +59,7 @@ initcap_wbnext(void *state)
 	{
 		pg_wchar	u = utf8_to_unicode((unsigned char *) wbstate->str +
 										wbstate->offset);
-		bool		curr_alnum = pg_u_isalnum(u, true);
+		bool		curr_alnum = pg_u_isalnum(u, wbstate->posix);
 
 		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
 		{
@@ -92,6 +93,7 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
 		.str = src,
 		.len = srclen,
 		.offset = 0,
+		.posix = !locale->info.builtin.casemap_full,
 		.init = false,
 		.prev_alnum = false,
 	};
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index f0b38b3bdd7..fdfb62e8552 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -41,6 +41,7 @@ struct WordBoundaryState
 	const char *str;
 	size_t		len;
 	size_t		offset;
+	bool		posix;
 	bool		init;
 	bool		prev_alnum;
 };
@@ -55,7 +56,7 @@ initcap_wbnext(void *state)
 	{
 		pg_wchar	u = utf8_to_unicode((unsigned char *) wbstate->str +
 										wbstate->offset);
-		bool		curr_alnum = pg_u_isalnum(u, true);
+		bool		curr_alnum = pg_u_isalnum(u, wbstate->posix);
 
 		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
 		{
@@ -112,10 +113,13 @@ icu_test_full(char *str)
 	char		icu_upper[BUFSZ];
 	char		icu_fold[BUFSZ];
 	UErrorCode	status;
+
+	/* full case mapping doesn't use posix semantics */
 	struct WordBoundaryState wbstate = {
 		.str = str,
 		.len = strlen(str),
 		.offset = 0,
+		.posix = false,
 		.init = false,
 		.prev_alnum = false,
 	};
@@ -344,6 +348,12 @@ test_convert_case()
 	test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
 	test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
 	test_convert(tfunc_fold, "σςΣ ΣΣΣ", "σσσ σσσ");
+	/* test that alphanumerics are word characters */
+	test_convert(tfunc_title, "λλ", "Λλ");
+	test_convert(tfunc_title, "1a", "1a");
+	/* U+FF11 FULLWIDTH ONE is alphanumeric for full case mapping */
+	test_convert(tfunc_title, "\uFF11a", "\uFF11a");
+
 
 #ifdef USE_ICU
 	icu_test_full("");
@@ -354,6 +364,7 @@ test_convert_case()
 	icu_test_full("abc 123xyz");
 	icu_test_full("σςΣ ΣΣΣ");
 	icu_test_full("ıiIİ");
+	icu_test_full("\uFF11a");
 	/* test <alpha><iota_subscript><acute> */
 	icu_test_full("\u0391\u0345\u0301");
 #endif
diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out
index 5508622b16d..0c3ab5c89b2 100644
--- a/src/test/regress/expected/collate.utf8.out
+++ b/src/test/regress/expected/collate.utf8.out
@@ -52,6 +52,7 @@ INSERT INTO test_pg_c_utf8 VALUES
   ('abc DEF 123abc'),
   ('ábc sßs ßss DÉF'),
   ('ǄxxǄ ǆxxǅ ǅxxǆ'),
+  (U&'Λλ 1a \FF11a'),
   ('ȺȺȺ'),
   ('ⱥⱥⱥ'),
   ('ⱥȺ');
@@ -67,10 +68,11 @@ SELECT
  abc DEF 123abc  | abc def 123abc  | Abc Def 123abc  | ABC DEF 123ABC  |      14 |            14 |              14 |            14
  ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs ßss Déf | ÁBC SßS ßSS DÉF |      19 |            19 |              19 |            19
  ǄxxǄ ǆxxǅ ǅxxǆ  | ǆxxǆ ǆxxǆ ǆxxǆ  | Ǆxxǆ Ǆxxǆ Ǆxxǆ  | ǄXXǄ ǄXXǄ ǄXXǄ  |      20 |            20 |              20 |            20
+ Λλ 1a １a       | λλ 1a １a       | Λλ 1a １A       | ΛΛ 1A １A       |      12 |            12 |              12 |            12
  ȺȺȺ             | ⱥⱥⱥ             | Ⱥⱥⱥ             | ȺȺȺ             |       6 |             9 |               8 |             6
  ⱥⱥⱥ             | ⱥⱥⱥ             | Ⱥⱥⱥ             | ȺȺȺ             |       9 |             9 |               8 |             6
  ⱥȺ              | ⱥⱥ              | Ⱥⱥ              | ȺȺ              |       5 |             6 |               5 |             4
-(6 rows)
+(7 rows)
 
 DROP TABLE test_pg_c_utf8;
 -- negative test: Final_Sigma not used for builtin locale C.UTF-8
@@ -182,6 +184,7 @@ INSERT INTO test_pg_unicode_fast VALUES
   ('abc DEF 123abc'),
   ('ábc sßs ßss DÉF'),
   ('ǄxxǄ ǆxxǅ ǅxxǆ'),
+  (U&'Λλ 1a \FF11a'),
   ('ȺȺȺ'),
   ('ⱥⱥⱥ'),
   ('ⱥȺ');
@@ -197,10 +200,11 @@ SELECT
  abc DEF 123abc  | abc def 123abc  | Abc Def 123abc   | ABC DEF 123ABC    |      14 |            14 |              14 |            14
  ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs Ssss Déf | ÁBC SSSS SSSS DÉF |      19 |            19 |              19 |            19
  ǄxxǄ ǆxxǅ ǅxxǆ  | ǆxxǆ ǆxxǆ ǆxxǆ  | ǅxxǆ ǅxxǆ ǅxxǆ   | ǄXXǄ ǄXXǄ ǄXXǄ    |      20 |            20 |              20 |            20
+ Λλ 1a １a       | λλ 1a １a       | Λλ 1a １a        | ΛΛ 1A １A         |      12 |            12 |              12 |            12
  ȺȺȺ             | ⱥⱥⱥ             | Ⱥⱥⱥ              | ȺȺȺ               |       6 |             9 |               8 |             6
  ⱥⱥⱥ             | ⱥⱥⱥ             | Ⱥⱥⱥ              | ȺȺȺ               |       9 |             9 |               8 |             6
  ⱥȺ              | ⱥⱥ              | Ⱥⱥ               | ȺȺ                |       5 |             6 |               5 |             4
-(6 rows)
+(7 rows)
 
 DROP TABLE test_pg_unicode_fast;
 -- test Final_Sigma
diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql
index 6c7c7aec9ec..d6d14220ab3 100644
--- a/src/test/regress/sql/collate.utf8.sql
+++ b/src/test/regress/sql/collate.utf8.sql
@@ -45,6 +45,7 @@ INSERT INTO test_pg_c_utf8 VALUES
   ('abc DEF 123abc'),
   ('ábc sßs ßss DÉF'),
   ('ǄxxǄ ǆxxǅ ǅxxǆ'),
+  (U&'Λλ 1a \FF11a'),
   ('ȺȺȺ'),
   ('ⱥⱥⱥ'),
   ('ⱥȺ');
@@ -100,6 +101,7 @@ INSERT INTO test_pg_unicode_fast VALUES
   ('abc DEF 123abc'),
   ('ábc sßs ßss DÉF'),
   ('ǄxxǄ ǆxxǅ ǅxxǆ'),
+  (U&'Λλ 1a \FF11a'),
   ('ȺȺȺ'),
   ('ⱥⱥⱥ'),
   ('ⱥȺ');
-- 
2.34.1

Re: Unicode full case mapping: PG_UNICODE_FAST, and standard-compliant UCS_BASIC

Reply via email to