On Mon, 2025-04-14 at 13:44 -0700, Noah Misch wrote: > v18 regc_pg_locale.c uses only the locale_t-extended forms, and it > aims not to > depend on LC_CTYPE. v17 used both tolower() and tolower_l(), but v18 > uses the > latter only.
Fixed in v2-0001 by rewording the comment slightly. > > However, I think v17's > concept of separate PG_REGEX_ symbols for the default-locale case is > still the > right thing for v18. In other words, this code should change to look > more > like v17, with the removal of non-locale_t calls being the main > change. I tried that in v2-0003, but I think it ended up worse. Most pg_wc_xyz() functions don't care if it's the default collation or not, so there are a lot of duplicate cases. The previous approach is still there as v2-0002. Regrads, Jeff Davis
From 9724181f715ce3468e9342763fadde450db08d26 Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Tue, 15 Apr 2025 15:04:37 -0700 Subject: [PATCH v2 1/3] Improve comment in regc_pg_locale.c. Reported-by: Noah Misch <n...@leadboat.com> Discussion: https://postgr.es/m/20250412123430.8c.nmi...@google.com --- src/backend/regex/regc_pg_locale.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index ed7411df83d..31d8ba0322a 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -21,22 +21,22 @@ #include "utils/pg_locale.h" /* - * To provide as much functionality as possible on a variety of platforms, - * without going so far as to implement everything from scratch, we use - * several implementation strategies depending on the situation: + * For the libc provider, to provide as much functionality as possible on a + * variety of platforms without going so far as to implement everything from + * scratch, we use several implementation strategies depending on the + * situation: * * 1. In C/POSIX collations, we use hard-wired code. We can't depend on * the <ctype.h> functions since those will obey LC_CTYPE. Note that these * collations don't give a fig about multibyte characters. * - * 2. In the "default" collation (which is supposed to obey LC_CTYPE): - * - * 2a. When working in UTF8 encoding, we use the <wctype.h> functions. + * 2. When working in UTF8 encoding, we use the <wctype.h> functions. * This assumes that every platform uses Unicode codepoints directly - * as the wchar_t representation of Unicode. On some platforms + * as the wchar_t representation of Unicode. (XXX: ICU makes this assumption + * even for non-UTF8 encodings, which may be a problem.) On some platforms * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF. * - * 2b. In all other encodings, we use the <ctype.h> functions for pg_wchar + * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar * values up to 255, and punt for values above that. This is 100% correct * only in single-byte encodings such as LATINn. However, non-Unicode * multibyte encodings are mostly Far Eastern character sets for which the @@ -46,14 +46,11 @@ * the platform's wchar_t representation matches what we do in pg_wchar * conversions. * - * 3. Here, we use the locale_t-extended forms of the <wctype.h> and <ctype.h> - * functions, under exactly the same cases as #2. - * - * There is one notable difference between cases 2 and 3: in the "default" - * collation we force ASCII letters to follow ASCII upcase/downcase rules, - * while in a non-default collation we just let the library functions do what - * they will. The case where this matters is treatment of I/i in Turkish, - * and the behavior is meant to match the upper()/lower() SQL functions. + * As a special case, in the "default" collation we force ASCII letters to + * follow ASCII upcase/downcase rules, while in a non-default collation we + * just let the library functions do what they will. The case where this + * matters is treatment of I/i in Turkish, and the behavior is meant to match + * the upper()/lower() SQL functions. * * We store the active collation setting in static variables. In principle * it could be passed down to here via the regex library's "struct vars" data -- 2.34.1
From d7e18174cb25fc981ba906101c3f4582fc41a826 Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Tue, 15 Apr 2025 15:05:08 -0700 Subject: [PATCH v2 2/3] Another unintentional behavior change in commit e9931bfb75. Reported-by: Noah Misch <n...@leadboat.com> Discussion: https://postgr.es/m/20250412123430.8c.nmi...@google.com --- src/backend/regex/regc_pg_locale.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index 31d8ba0322a..7377f459607 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -559,10 +559,16 @@ pg_wc_toupper(pg_wchar c) case PG_REGEX_STRATEGY_BUILTIN: return unicode_uppercase_simple(c); case PG_REGEX_STRATEGY_LIBC_WIDE: + /* force C behavior for ASCII characters, per comments above */ + if (pg_regex_locale->is_default && c <= (pg_wchar) 127) + return pg_ascii_toupper((unsigned char) c); if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return towupper_l((wint_t) c, pg_regex_locale->info.lt); /* FALL THRU */ case PG_REGEX_STRATEGY_LIBC_1BYTE: + /* force C behavior for ASCII characters, per comments above */ + if (pg_regex_locale->is_default && c <= (pg_wchar) 127) + return pg_ascii_toupper((unsigned char) c); if (c <= (pg_wchar) UCHAR_MAX) return toupper_l((unsigned char) c, pg_regex_locale->info.lt); return c; @@ -587,10 +593,16 @@ pg_wc_tolower(pg_wchar c) case PG_REGEX_STRATEGY_BUILTIN: return unicode_lowercase_simple(c); case PG_REGEX_STRATEGY_LIBC_WIDE: + /* force C behavior for ASCII characters, per comments above */ + if (pg_regex_locale->is_default && c <= (pg_wchar) 127) + return pg_ascii_tolower((unsigned char) c); if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return towlower_l((wint_t) c, pg_regex_locale->info.lt); /* FALL THRU */ case PG_REGEX_STRATEGY_LIBC_1BYTE: + /* force C behavior for ASCII characters, per comments above */ + if (pg_regex_locale->is_default && c <= (pg_wchar) 127) + return pg_ascii_tolower((unsigned char) c); if (c <= (pg_wchar) UCHAR_MAX) return tolower_l((unsigned char) c, pg_regex_locale->info.lt); return c; -- 2.34.1
From 280929ad8f70106be7bc4d59d90957ecf86a421c Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Tue, 15 Apr 2025 15:49:44 -0700 Subject: [PATCH v2 3/3] alternative approach --- src/backend/regex/regc_pg_locale.c | 60 +++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index 7377f459607..ec9bb8e460b 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -64,6 +64,8 @@ typedef enum { PG_REGEX_STRATEGY_C, /* C locale (encoding independent) */ PG_REGEX_STRATEGY_BUILTIN, /* built-in Unicode semantics */ + PG_REGEX_STRATEGY_DEF_WIDE, /* libc default, locale_t <wctype.h> functions */ + PG_REGEX_STRATEGY_DEF_1BYTE, /* libc default, locale_t <ctype.h> functions */ PG_REGEX_STRATEGY_LIBC_WIDE, /* Use locale_t <wctype.h> functions */ PG_REGEX_STRATEGY_LIBC_1BYTE, /* Use locale_t <ctype.h> functions */ PG_REGEX_STRATEGY_ICU, /* Use ICU uchar.h functions */ @@ -285,9 +287,19 @@ pg_set_regex_collation(Oid collation) { Assert(locale->provider == COLLPROVIDER_LIBC); if (GetDatabaseEncoding() == PG_UTF8) - strategy = PG_REGEX_STRATEGY_LIBC_WIDE; + { + if (locale->is_default) + strategy = PG_REGEX_STRATEGY_DEF_WIDE; + else + strategy = PG_REGEX_STRATEGY_LIBC_WIDE; + } else - strategy = PG_REGEX_STRATEGY_LIBC_1BYTE; + { + if (locale->is_default) + strategy = PG_REGEX_STRATEGY_DEF_1BYTE; + else + strategy = PG_REGEX_STRATEGY_LIBC_1BYTE; + } } } @@ -305,10 +317,12 @@ pg_wc_isdigit(pg_wchar c) (pg_char_properties[c] & PG_ISDIGIT)); case PG_REGEX_STRATEGY_BUILTIN: return pg_u_isdigit(c, !pg_regex_locale->info.builtin.casemap_full); + case PG_REGEX_STRATEGY_DEF_WIDE: case PG_REGEX_STRATEGY_LIBC_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswdigit_l((wint_t) c, pg_regex_locale->info.lt); /* FALL THRU */ + case PG_REGEX_STRATEGY_DEF_1BYTE: case PG_REGEX_STRATEGY_LIBC_1BYTE: return (c <= (pg_wchar) UCHAR_MAX && isdigit_l((unsigned char) c, pg_regex_locale->info.lt)); @@ -332,10 +346,12 @@ pg_wc_isalpha(pg_wchar c) (pg_char_properties[c] & PG_ISALPHA)); case PG_REGEX_STRATEGY_BUILTIN: return pg_u_isalpha(c); + case PG_REGEX_STRATEGY_DEF_WIDE: case PG_REGEX_STRATEGY_LIBC_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswalpha_l((wint_t) c, pg_regex_locale->info.lt); /* FALL THRU */ + case PG_REGEX_STRATEGY_DEF_1BYTE: case PG_REGEX_STRATEGY_LIBC_1BYTE: return (c <= (pg_wchar) UCHAR_MAX && isalpha_l((unsigned char) c, pg_regex_locale->info.lt)); @@ -359,10 +375,12 @@ pg_wc_isalnum(pg_wchar c) (pg_char_properties[c] & PG_ISALNUM)); case PG_REGEX_STRATEGY_BUILTIN: return pg_u_isalnum(c, !pg_regex_locale->info.builtin.casemap_full); + case PG_REGEX_STRATEGY_DEF_WIDE: case PG_REGEX_STRATEGY_LIBC_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswalnum_l((wint_t) c, pg_regex_locale->info.lt); /* FALL THRU */ + case PG_REGEX_STRATEGY_DEF_1BYTE: case PG_REGEX_STRATEGY_LIBC_1BYTE: return (c <= (pg_wchar) UCHAR_MAX && isalnum_l((unsigned char) c, pg_regex_locale->info.lt)); @@ -395,10 +413,12 @@ pg_wc_isupper(pg_wchar c) (pg_char_properties[c] & PG_ISUPPER)); case PG_REGEX_STRATEGY_BUILTIN: return pg_u_isupper(c); + case PG_REGEX_STRATEGY_DEF_WIDE: case PG_REGEX_STRATEGY_LIBC_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswupper_l((wint_t) c, pg_regex_locale->info.lt); /* FALL THRU */ + case PG_REGEX_STRATEGY_DEF_1BYTE: case PG_REGEX_STRATEGY_LIBC_1BYTE: return (c <= (pg_wchar) UCHAR_MAX && isupper_l((unsigned char) c, pg_regex_locale->info.lt)); @@ -422,10 +442,12 @@ pg_wc_islower(pg_wchar c) (pg_char_properties[c] & PG_ISLOWER)); case PG_REGEX_STRATEGY_BUILTIN: return pg_u_islower(c); + case PG_REGEX_STRATEGY_DEF_WIDE: case PG_REGEX_STRATEGY_LIBC_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswlower_l((wint_t) c, pg_regex_locale->info.lt); /* FALL THRU */ + case PG_REGEX_STRATEGY_DEF_1BYTE: case PG_REGEX_STRATEGY_LIBC_1BYTE: return (c <= (pg_wchar) UCHAR_MAX && islower_l((unsigned char) c, pg_regex_locale->info.lt)); @@ -449,10 +471,12 @@ pg_wc_isgraph(pg_wchar c) (pg_char_properties[c] & PG_ISGRAPH)); case PG_REGEX_STRATEGY_BUILTIN: return pg_u_isgraph(c); + case PG_REGEX_STRATEGY_DEF_WIDE: case PG_REGEX_STRATEGY_LIBC_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswgraph_l((wint_t) c, pg_regex_locale->info.lt); /* FALL THRU */ + case PG_REGEX_STRATEGY_DEF_1BYTE: case PG_REGEX_STRATEGY_LIBC_1BYTE: return (c <= (pg_wchar) UCHAR_MAX && isgraph_l((unsigned char) c, pg_regex_locale->info.lt)); @@ -476,10 +500,12 @@ pg_wc_isprint(pg_wchar c) (pg_char_properties[c] & PG_ISPRINT)); case PG_REGEX_STRATEGY_BUILTIN: return pg_u_isprint(c); + case PG_REGEX_STRATEGY_DEF_WIDE: case PG_REGEX_STRATEGY_LIBC_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswprint_l((wint_t) c, pg_regex_locale->info.lt); /* FALL THRU */ + case PG_REGEX_STRATEGY_DEF_1BYTE: case PG_REGEX_STRATEGY_LIBC_1BYTE: return (c <= (pg_wchar) UCHAR_MAX && isprint_l((unsigned char) c, pg_regex_locale->info.lt)); @@ -503,10 +529,12 @@ pg_wc_ispunct(pg_wchar c) (pg_char_properties[c] & PG_ISPUNCT)); case PG_REGEX_STRATEGY_BUILTIN: return pg_u_ispunct(c, !pg_regex_locale->info.builtin.casemap_full); + case PG_REGEX_STRATEGY_DEF_WIDE: case PG_REGEX_STRATEGY_LIBC_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswpunct_l((wint_t) c, pg_regex_locale->info.lt); /* FALL THRU */ + case PG_REGEX_STRATEGY_DEF_1BYTE: case PG_REGEX_STRATEGY_LIBC_1BYTE: return (c <= (pg_wchar) UCHAR_MAX && ispunct_l((unsigned char) c, pg_regex_locale->info.lt)); @@ -530,10 +558,12 @@ pg_wc_isspace(pg_wchar c) (pg_char_properties[c] & PG_ISSPACE)); case PG_REGEX_STRATEGY_BUILTIN: return pg_u_isspace(c); + case PG_REGEX_STRATEGY_DEF_WIDE: case PG_REGEX_STRATEGY_LIBC_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswspace_l((wint_t) c, pg_regex_locale->info.lt); /* FALL THRU */ + case PG_REGEX_STRATEGY_DEF_1BYTE: case PG_REGEX_STRATEGY_LIBC_1BYTE: return (c <= (pg_wchar) UCHAR_MAX && isspace_l((unsigned char) c, pg_regex_locale->info.lt)); @@ -558,17 +588,21 @@ pg_wc_toupper(pg_wchar c) return c; case PG_REGEX_STRATEGY_BUILTIN: return unicode_uppercase_simple(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: + case PG_REGEX_STRATEGY_DEF_WIDE: /* force C behavior for ASCII characters, per comments above */ - if (pg_regex_locale->is_default && c <= (pg_wchar) 127) + if (c <= (pg_wchar) 127) return pg_ascii_toupper((unsigned char) c); + /* FALL THRU */ + case PG_REGEX_STRATEGY_LIBC_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return towupper_l((wint_t) c, pg_regex_locale->info.lt); /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: + case PG_REGEX_STRATEGY_DEF_1BYTE: /* force C behavior for ASCII characters, per comments above */ - if (pg_regex_locale->is_default && c <= (pg_wchar) 127) + if (c <= (pg_wchar) 127) return pg_ascii_toupper((unsigned char) c); + /* FALL THRU */ + case PG_REGEX_STRATEGY_LIBC_1BYTE: if (c <= (pg_wchar) UCHAR_MAX) return toupper_l((unsigned char) c, pg_regex_locale->info.lt); return c; @@ -592,17 +626,21 @@ pg_wc_tolower(pg_wchar c) return c; case PG_REGEX_STRATEGY_BUILTIN: return unicode_lowercase_simple(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: + case PG_REGEX_STRATEGY_DEF_WIDE: /* force C behavior for ASCII characters, per comments above */ - if (pg_regex_locale->is_default && c <= (pg_wchar) 127) + if (c <= (pg_wchar) 127) return pg_ascii_tolower((unsigned char) c); + /* FALL THRU */ + case PG_REGEX_STRATEGY_LIBC_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return towlower_l((wint_t) c, pg_regex_locale->info.lt); /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: + case PG_REGEX_STRATEGY_DEF_1BYTE: /* force C behavior for ASCII characters, per comments above */ - if (pg_regex_locale->is_default && c <= (pg_wchar) 127) + if (c <= (pg_wchar) 127) return pg_ascii_tolower((unsigned char) c); + /* FALL THRU */ + case PG_REGEX_STRATEGY_LIBC_1BYTE: if (c <= (pg_wchar) UCHAR_MAX) return tolower_l((unsigned char) c, pg_regex_locale->info.lt); return c; @@ -751,9 +789,11 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) case PG_REGEX_STRATEGY_BUILTIN: max_chr = (pg_wchar) MAX_SIMPLE_CHR; break; + case PG_REGEX_STRATEGY_DEF_WIDE: case PG_REGEX_STRATEGY_LIBC_WIDE: max_chr = (pg_wchar) MAX_SIMPLE_CHR; break; + case PG_REGEX_STRATEGY_DEF_1BYTE: case PG_REGEX_STRATEGY_LIBC_1BYTE: #if MAX_SIMPLE_CHR >= UCHAR_MAX max_chr = (pg_wchar) UCHAR_MAX; -- 2.34.1