On Wed, 2025-01-15 at 12:42 -0800, Jeff Davis wrote: > > Here's v12 after committing a few of the earlier patches.
And here's v14, just a rebase. > I collected some performance numbers for a worst case on UTF8. I'm still inclined to think the method table is a good thing to do: (a) The performance cases I tried seem implausibly bad -- running character classification patterns over large fields consisting only of codepoints over U+07FF. (b) The method tables seem like a better code organization that separates the responsibilities of the provider from the calling code. It's also a requirement (or nearly so) if we want to provide some pluggability or support multiple library versions. It would be good to hear from others on these points, though. Regards, Jeff Davis
From e30915172c98616d0aec56f190dff48836760ccc Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Fri, 29 Nov 2024 09:37:43 -0800 Subject: [PATCH v14 1/4] Control ctype behavior internally with a method table. Previously, pattern matching and case mapping behavior branched based on the provider. Refactor to use a method table, which is less error-prone and easier to hook. --- src/backend/regex/regc_pg_locale.c | 377 +++++----------------- src/backend/utils/adt/like.c | 22 +- src/backend/utils/adt/like_support.c | 7 +- src/backend/utils/adt/pg_locale.c | 101 +++--- src/backend/utils/adt/pg_locale_builtin.c | 106 +++++- src/backend/utils/adt/pg_locale_icu.c | 109 ++++++- src/backend/utils/adt/pg_locale_libc.c | 279 +++++++++++++--- src/include/utils/pg_locale.h | 49 +++ src/tools/pgindent/typedefs.list | 1 - 9 files changed, 618 insertions(+), 433 deletions(-) diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index ed7411df83d..31b8f4a9478 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -63,18 +63,13 @@ * NB: the coding here assumes pg_wchar is an unsigned type. 
*/ -typedef enum -{ - PG_REGEX_STRATEGY_C, /* C locale (encoding independent) */ - PG_REGEX_STRATEGY_BUILTIN, /* built-in Unicode semantics */ - PG_REGEX_STRATEGY_LIBC_WIDE, /* Use locale_t <wctype.h> functions */ - PG_REGEX_STRATEGY_LIBC_1BYTE, /* Use locale_t <ctype.h> functions */ - PG_REGEX_STRATEGY_ICU, /* Use ICU uchar.h functions */ -} PG_Locale_Strategy; - -static PG_Locale_Strategy pg_regex_strategy; static pg_locale_t pg_regex_locale; +static struct pg_locale_struct dummy_c_locale = { + .collate_is_c = true, + .ctype_is_c = true, +}; + /* * Hard-wired character properties for C locale */ @@ -231,7 +226,6 @@ void pg_set_regex_collation(Oid collation) { pg_locale_t locale = 0; - PG_Locale_Strategy strategy; if (!OidIsValid(collation)) { @@ -252,8 +246,7 @@ pg_set_regex_collation(Oid collation) * catalog access is available, so we can't call * pg_newlocale_from_collation(). */ - strategy = PG_REGEX_STRATEGY_C; - locale = 0; + locale = &dummy_c_locale; } else { @@ -270,113 +263,41 @@ pg_set_regex_collation(Oid collation) * C/POSIX collations use this path regardless of database * encoding */ - strategy = PG_REGEX_STRATEGY_C; - locale = 0; - } - else if (locale->provider == COLLPROVIDER_BUILTIN) - { - Assert(GetDatabaseEncoding() == PG_UTF8); - strategy = PG_REGEX_STRATEGY_BUILTIN; - } -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - { - strategy = PG_REGEX_STRATEGY_ICU; - } -#endif - else - { - Assert(locale->provider == COLLPROVIDER_LIBC); - if (GetDatabaseEncoding() == PG_UTF8) - strategy = PG_REGEX_STRATEGY_LIBC_WIDE; - else - strategy = PG_REGEX_STRATEGY_LIBC_1BYTE; + locale = &dummy_c_locale; } } - pg_regex_strategy = strategy; pg_regex_locale = locale; } static int pg_wc_isdigit(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISDIGIT)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isdigit(c, !pg_regex_locale->info.builtin.casemap_full); - case 
PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswdigit_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isdigit_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isdigit(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISDIGIT)); + else + return pg_regex_locale->ctype->wc_isdigit(c, pg_regex_locale); } static int pg_wc_isalpha(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISALPHA)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isalpha(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswalpha_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isalpha_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isalpha(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISALPHA)); + else + return pg_regex_locale->ctype->wc_isalpha(c, pg_regex_locale); } static int pg_wc_isalnum(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISALNUM)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isalnum(c, !pg_regex_locale->info.builtin.casemap_full); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswalnum_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - 
return (c <= (pg_wchar) UCHAR_MAX && - isalnum_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isalnum(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISALNUM)); + else + return pg_regex_locale->ctype->wc_isalnum(c, pg_regex_locale); } static int @@ -391,219 +312,87 @@ pg_wc_isword(pg_wchar c) static int pg_wc_isupper(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISUPPER)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isupper(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswupper_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isupper_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isupper(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISUPPER)); + else + return pg_regex_locale->ctype->wc_isupper(c, pg_regex_locale); } static int pg_wc_islower(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISLOWER)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_islower(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswlower_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - islower_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_islower(c); -#endif - 
break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISLOWER)); + else + return pg_regex_locale->ctype->wc_islower(c, pg_regex_locale); } static int pg_wc_isgraph(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISGRAPH)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isgraph(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswgraph_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isgraph_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isgraph(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISGRAPH)); + else + return pg_regex_locale->ctype->wc_isgraph(c, pg_regex_locale); } static int pg_wc_isprint(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISPRINT)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isprint(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswprint_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isprint_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isprint(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISPRINT)); + else + return pg_regex_locale->ctype->wc_isprint(c, 
pg_regex_locale); } static int pg_wc_ispunct(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISPUNCT)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_ispunct(c, !pg_regex_locale->info.builtin.casemap_full); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswpunct_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - ispunct_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_ispunct(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISPUNCT)); + else + return pg_regex_locale->ctype->wc_ispunct(c, pg_regex_locale); } static int pg_wc_isspace(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISSPACE)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isspace(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswspace_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isspace_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isspace(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISSPACE)); + else + return pg_regex_locale->ctype->wc_isspace(c, pg_regex_locale); } static pg_wchar pg_wc_toupper(pg_wchar c) { - switch (pg_regex_strategy) + if (pg_regex_locale->ctype_is_c) { - case PG_REGEX_STRATEGY_C: - if (c <= (pg_wchar) 127) - return 
pg_ascii_toupper((unsigned char) c); - return c; - case PG_REGEX_STRATEGY_BUILTIN: - return unicode_uppercase_simple(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return towupper_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - if (c <= (pg_wchar) UCHAR_MAX) - return toupper_l((unsigned char) c, pg_regex_locale->info.lt); - return c; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_toupper(c); -#endif - break; + if (c <= (pg_wchar) 127) + return pg_ascii_toupper((unsigned char) c); + return c; } - return 0; /* can't get here, but keep compiler quiet */ + else + return pg_regex_locale->ctype->wc_toupper(c, pg_regex_locale); } static pg_wchar pg_wc_tolower(pg_wchar c) { - switch (pg_regex_strategy) + if (pg_regex_locale->ctype_is_c) { - case PG_REGEX_STRATEGY_C: - if (c <= (pg_wchar) 127) - return pg_ascii_tolower((unsigned char) c); - return c; - case PG_REGEX_STRATEGY_BUILTIN: - return unicode_lowercase_simple(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return towlower_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - if (c <= (pg_wchar) UCHAR_MAX) - return tolower_l((unsigned char) c, pg_regex_locale->info.lt); - return c; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_tolower(c); -#endif - break; + if (c <= (pg_wchar) 127) + return pg_ascii_tolower((unsigned char) c); + return c; } - return 0; /* can't get here, but keep compiler quiet */ + else + return pg_regex_locale->ctype->wc_tolower(c, pg_regex_locale); } @@ -729,37 +518,25 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) * would always be true for production values of MAX_SIMPLE_CHR, but it's * useful to allow it to be small for testing purposes.) 
*/ - switch (pg_regex_strategy) + if (pg_regex_locale->ctype_is_c) { - case PG_REGEX_STRATEGY_C: #if MAX_SIMPLE_CHR >= 127 - max_chr = (pg_wchar) 127; - pcc->cv.cclasscode = -1; + max_chr = (pg_wchar) 127; + pcc->cv.cclasscode = -1; #else - max_chr = (pg_wchar) MAX_SIMPLE_CHR; + max_chr = (pg_wchar) MAX_SIMPLE_CHR; #endif - break; - case PG_REGEX_STRATEGY_BUILTIN: - max_chr = (pg_wchar) MAX_SIMPLE_CHR; - break; - case PG_REGEX_STRATEGY_LIBC_WIDE: - max_chr = (pg_wchar) MAX_SIMPLE_CHR; - break; - case PG_REGEX_STRATEGY_LIBC_1BYTE: -#if MAX_SIMPLE_CHR >= UCHAR_MAX - max_chr = (pg_wchar) UCHAR_MAX; + } + else + { + if (pg_regex_locale->ctype->max_chr != 0 && + pg_regex_locale->ctype->max_chr <= MAX_SIMPLE_CHR) + { + max_chr = pg_regex_locale->ctype->max_chr; pcc->cv.cclasscode = -1; -#else - max_chr = (pg_wchar) MAX_SIMPLE_CHR; -#endif - break; - case PG_REGEX_STRATEGY_ICU: + } + else max_chr = (pg_wchar) MAX_SIMPLE_CHR; - break; - default: - Assert(false); - max_chr = 0; /* can't get here, but keep compiler quiet */ - break; } /* diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c index 7f4cf614585..4216ac17f43 100644 --- a/src/backend/utils/adt/like.c +++ b/src/backend/utils/adt/like.c @@ -98,7 +98,7 @@ SB_lower_char(unsigned char c, pg_locale_t locale) else if (locale->is_default) return pg_tolower(c); else - return tolower_l(c, locale->info.lt); + return char_tolower(c, locale); } @@ -209,7 +209,17 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation) * way. 
*/ - if (pg_database_encoding_max_length() > 1 || (locale->provider == COLLPROVIDER_ICU)) + if (locale->ctype_is_c || + (char_tolower_enabled(locale) && + pg_database_encoding_max_length() == 1)) + { + p = VARDATA_ANY(pat); + plen = VARSIZE_ANY_EXHDR(pat); + s = VARDATA_ANY(str); + slen = VARSIZE_ANY_EXHDR(str); + return SB_IMatchText(s, slen, p, plen, locale); + } + else { pat = DatumGetTextPP(DirectFunctionCall1Coll(lower, collation, PointerGetDatum(pat))); @@ -224,14 +234,6 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation) else return MB_MatchText(s, slen, p, plen, 0); } - else - { - p = VARDATA_ANY(pat); - plen = VARSIZE_ANY_EXHDR(pat); - s = VARDATA_ANY(str); - slen = VARSIZE_ANY_EXHDR(str); - return SB_IMatchText(s, slen, p, plen, locale); - } } /* diff --git a/src/backend/utils/adt/like_support.c b/src/backend/utils/adt/like_support.c index 8fdc677371f..999f23f86d5 100644 --- a/src/backend/utils/adt/like_support.c +++ b/src/backend/utils/adt/like_support.c @@ -1495,13 +1495,8 @@ pattern_char_isalpha(char c, bool is_multibyte, { if (locale->ctype_is_c) return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); - else if (is_multibyte && IS_HIGHBIT_SET(c)) - return true; - else if (locale->provider != COLLPROVIDER_LIBC) - return IS_HIGHBIT_SET(c) || - (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); else - return isalpha_l((unsigned char) c, locale->info.lt); + return char_is_cased(c, locale); } diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 94444acd2c5..5b78237f72e 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -100,27 +100,6 @@ extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context); extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context); extern char *get_collation_actual_version_libc(const char *collcollate); -extern size_t strlower_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); 
-extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); - -extern size_t strlower_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); - -extern size_t strlower_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); - /* GUC settings */ char *locale_messages; char *locale_monetary; @@ -1232,6 +1211,9 @@ create_pg_locale(Oid collid, MemoryContext context) Assert((result->collate_is_c && result->collate == NULL) || (!result->collate_is_c && result->collate != NULL)); + Assert((result->ctype_is_c && result->ctype == NULL) || + (!result->ctype_is_c && result->ctype != NULL)); + datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion, &isnull); if (!isnull) @@ -1394,57 +1376,21 @@ size_t pg_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale) { - if (locale->provider == COLLPROVIDER_BUILTIN) - return strlower_builtin(dst, dstsize, src, srclen, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - return strlower_icu(dst, dstsize, src, srclen, locale); -#endif - else if (locale->provider == COLLPROVIDER_LIBC) - return strlower_libc(dst, dstsize, src, srclen, locale); - else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return 0; /* keep compiler quiet */ + return locale->ctype->strlower(dst, dstsize, 
src, srclen, locale); } size_t pg_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale) { - if (locale->provider == COLLPROVIDER_BUILTIN) - return strtitle_builtin(dst, dstsize, src, srclen, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - return strtitle_icu(dst, dstsize, src, srclen, locale); -#endif - else if (locale->provider == COLLPROVIDER_LIBC) - return strtitle_libc(dst, dstsize, src, srclen, locale); - else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return 0; /* keep compiler quiet */ + return locale->ctype->strtitle(dst, dstsize, src, srclen, locale); } size_t pg_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale) { - if (locale->provider == COLLPROVIDER_BUILTIN) - return strupper_builtin(dst, dstsize, src, srclen, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - return strupper_icu(dst, dstsize, src, srclen, locale); -#endif - else if (locale->provider == COLLPROVIDER_LIBC) - return strupper_libc(dst, dstsize, src, srclen, locale); - else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return 0; /* keep compiler quiet */ + return locale->ctype->strupper(dst, dstsize, src, srclen, locale); } /* @@ -1581,6 +1527,41 @@ pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, return locale->collate->strnxfrm_prefix(dest, destsize, src, srclen, locale); } +/* + * char_is_cased() + * + * Fuzzy test of whether the given char is case-varying or not. The argument + * is a single byte, so in a multibyte encoding, just assume any non-ASCII + * char is case-varying. + */ +bool +char_is_cased(char ch, pg_locale_t locale) +{ + return locale->ctype->char_is_cased(ch, locale); +} + +/* + * char_tolower_enabled() + * + * Does the provider support char_tolower()? 
+ */ +bool +char_tolower_enabled(pg_locale_t locale) +{ + return (locale->ctype->char_tolower != NULL); +} + +/* + * char_tolower() + * + * Convert char (single-byte encoding) to lowercase. + */ +char +char_tolower(unsigned char ch, pg_locale_t locale) +{ + return locale->ctype->char_tolower(ch, locale); +} + /* * Return required encoding ID for the given locale, or -1 if any encoding is * valid for the locale. diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c index 436e32c0ca0..5f43658ab5b 100644 --- a/src/backend/utils/adt/pg_locale_builtin.c +++ b/src/backend/utils/adt/pg_locale_builtin.c @@ -25,13 +25,6 @@ extern pg_locale_t create_pg_locale_builtin(Oid collid, MemoryContext context); extern char *get_collation_actual_version_builtin(const char *collcollate); -extern size_t strlower_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); - struct WordBoundaryState { @@ -74,7 +67,7 @@ initcap_wbnext(void *state) return wbstate->len; } -size_t +static size_t strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -82,7 +75,7 @@ strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, locale->info.builtin.casemap_full); } -size_t +static size_t strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -99,7 +92,7 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, initcap_wbnext, &wbstate); } -size_t +static size_t strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -107,6 +100,97 @@ strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, 
locale->info.builtin.casemap_full); } +static bool +wc_isdigit_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isdigit(wc, !locale->info.builtin.casemap_full); +} + +static bool +wc_isalpha_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isalpha(wc); +} + +static bool +wc_isalnum_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isalnum(wc, !locale->info.builtin.casemap_full); +} + +static bool +wc_isupper_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isupper(wc); +} + +static bool +wc_islower_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_islower(wc); +} + +static bool +wc_isgraph_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isgraph(wc); +} + +static bool +wc_isprint_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isprint(wc); +} + +static bool +wc_ispunct_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_ispunct(wc, !locale->info.builtin.casemap_full); +} + +static bool +wc_isspace_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isspace(wc); +} + +static bool +char_is_cased_builtin(char ch, pg_locale_t locale) +{ + return IS_HIGHBIT_SET(ch) || + (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); +} + +static pg_wchar +wc_toupper_builtin(pg_wchar wc, pg_locale_t locale) +{ + return unicode_uppercase_simple(wc); +} + +static pg_wchar +wc_tolower_builtin(pg_wchar wc, pg_locale_t locale) +{ + return unicode_lowercase_simple(wc); +} + +static const struct ctype_methods ctype_methods_builtin = { + .strlower = strlower_builtin, + .strtitle = strtitle_builtin, + .strupper = strupper_builtin, + .wc_isdigit = wc_isdigit_builtin, + .wc_isalpha = wc_isalpha_builtin, + .wc_isalnum = wc_isalnum_builtin, + .wc_isupper = wc_isupper_builtin, + .wc_islower = wc_islower_builtin, + .wc_isgraph = wc_isgraph_builtin, + .wc_isprint = wc_isprint_builtin, + .wc_ispunct = wc_ispunct_builtin, + .wc_isspace = wc_isspace_builtin, + .char_is_cased = char_is_cased_builtin, + .wc_tolower = 
wc_tolower_builtin, + .wc_toupper = wc_toupper_builtin, +}; + pg_locale_t create_pg_locale_builtin(Oid collid, MemoryContext context) { @@ -150,6 +234,8 @@ create_pg_locale_builtin(Oid collid, MemoryContext context) result->deterministic = true; result->collate_is_c = true; result->ctype_is_c = (strcmp(locstr, "C") == 0); + if (!result->ctype_is_c) + result->ctype = &ctype_methods_builtin; return result; } diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c index 5185b0f7289..3e9a2e0cfaa 100644 --- a/src/backend/utils/adt/pg_locale_icu.c +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -48,17 +48,17 @@ #define TEXTBUFLEN 1024 extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context); -extern size_t strlower_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); #ifdef USE_ICU extern UCollator *pg_ucol_open(const char *loc_str); +static size_t strlower_icu(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); +static size_t strtitle_icu(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); +static size_t strupper_icu(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); static int strncoll_icu(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale); @@ -118,6 +118,25 @@ static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity, const char *locale, UErrorCode *pErrorCode); +static bool +char_is_cased_icu(char ch, pg_locale_t locale) +{ + return IS_HIGHBIT_SET(ch) || + (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); +} + +static pg_wchar +toupper_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_toupper(wc); +} + +static pg_wchar 
+tolower_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_tolower(wc); +} + static const struct collate_methods collate_methods_icu = { .strncoll = strncoll_icu, .strnxfrm = strnxfrm_icu, @@ -136,6 +155,77 @@ static const struct collate_methods collate_methods_icu_utf8 = { .strxfrm_is_safe = true, }; +static bool +wc_isdigit_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isdigit(wc); +} + +static bool +wc_isalpha_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isalpha(wc); +} + +static bool +wc_isalnum_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isalnum(wc); +} + +static bool +wc_isupper_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isupper(wc); +} + +static bool +wc_islower_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_islower(wc); +} + +static bool +wc_isgraph_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isgraph(wc); +} + +static bool +wc_isprint_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isprint(wc); +} + +static bool +wc_ispunct_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_ispunct(wc); +} + +static bool +wc_isspace_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isspace(wc); +} + +static const struct ctype_methods ctype_methods_icu = { + .strlower = strlower_icu, + .strtitle = strtitle_icu, + .strupper = strupper_icu, + .wc_isdigit = wc_isdigit_icu, + .wc_isalpha = wc_isalpha_icu, + .wc_isalnum = wc_isalnum_icu, + .wc_isupper = wc_isupper_icu, + .wc_islower = wc_islower_icu, + .wc_isgraph = wc_isgraph_icu, + .wc_isprint = wc_isprint_icu, + .wc_ispunct = wc_ispunct_icu, + .wc_isspace = wc_isspace_icu, + .char_is_cased = char_is_cased_icu, + .wc_toupper = toupper_icu, + .wc_tolower = tolower_icu, +}; #endif pg_locale_t @@ -206,6 +296,7 @@ create_pg_locale_icu(Oid collid, MemoryContext context) result->collate = &collate_methods_icu_utf8; else result->collate = &collate_methods_icu; + result->ctype = &ctype_methods_icu; return result; #else @@ -379,7 +470,7 @@ make_icu_collator(const char *iculocstr, const char 
*icurules) } } -size_t +static size_t strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -399,7 +490,7 @@ strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, return result_len; } -size_t +static size_t strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -419,7 +510,7 @@ strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, return result_len; } -size_t +static size_t strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c index 8f9a8637897..1144c6ff304 100644 --- a/src/backend/utils/adt/pg_locale_libc.c +++ b/src/backend/utils/adt/pg_locale_libc.c @@ -43,13 +43,6 @@ extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context); -extern size_t strlower_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); - static int strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale); @@ -86,6 +79,239 @@ static size_t strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale); +static bool +wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isdigit_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isalpha_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isalpha_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isalnum_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isalnum_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isupper_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isupper_l((unsigned 
char) wc, locale->info.lt); +} + +static bool +wc_islower_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return islower_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isgraph_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isgraph_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isprint_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isprint_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_ispunct_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return ispunct_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isspace_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswdigit_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswalpha_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswalnum_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswupper_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswlower_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswgraph_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswprint_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswpunct_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswspace_l((wint_t) wc, locale->info.lt); +} + +static char +char_tolower_libc(unsigned char ch, pg_locale_t locale) +{ + Assert(pg_database_encoding_max_length() == 1); + return tolower_l(ch, locale->info.lt); +} + +static bool 
+char_is_cased_libc(char ch, pg_locale_t locale) +{ + bool is_multibyte = pg_database_encoding_max_length() > 1; + + if (is_multibyte && IS_HIGHBIT_SET(ch)) + return true; + else + return isalpha_l((unsigned char) ch, locale->info.lt); +} + +static pg_wchar +toupper_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + Assert(GetDatabaseEncoding() != PG_UTF8); + + if (wc <= (pg_wchar) UCHAR_MAX) + return toupper_l((unsigned char) wc, locale->info.lt); + else + return wc; +} + +static pg_wchar +toupper_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + Assert(GetDatabaseEncoding() == PG_UTF8); + + if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF) + return towupper_l((wint_t) wc, locale->info.lt); + else + return wc; +} + +static pg_wchar +tolower_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + Assert(GetDatabaseEncoding() != PG_UTF8); + + if (wc <= (pg_wchar) UCHAR_MAX) + return tolower_l((unsigned char) wc, locale->info.lt); + else + return wc; +} + +static pg_wchar +tolower_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + Assert(GetDatabaseEncoding() == PG_UTF8); + + if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF) + return towlower_l((wint_t) wc, locale->info.lt); + else + return wc; +} + +static const struct ctype_methods ctype_methods_libc_sb = { + .strlower = strlower_libc_sb, + .strtitle = strtitle_libc_sb, + .strupper = strupper_libc_sb, + .wc_isdigit = wc_isdigit_libc_sb, + .wc_isalpha = wc_isalpha_libc_sb, + .wc_isalnum = wc_isalnum_libc_sb, + .wc_isupper = wc_isupper_libc_sb, + .wc_islower = wc_islower_libc_sb, + .wc_isgraph = wc_isgraph_libc_sb, + .wc_isprint = wc_isprint_libc_sb, + .wc_ispunct = wc_ispunct_libc_sb, + .wc_isspace = wc_isspace_libc_sb, + .char_is_cased = char_is_cased_libc, + .char_tolower = char_tolower_libc, + .wc_toupper = toupper_libc_sb, + .wc_tolower = tolower_libc_sb, + .max_chr = UCHAR_MAX, +}; + +/* + * Non-UTF8 multibyte encodings use multibyte semantics for case mapping, but + * single-byte semantics for pattern matching. 
+ */ +static const struct ctype_methods ctype_methods_libc_other_mb = { + .strlower = strlower_libc_mb, + .strtitle = strtitle_libc_mb, + .strupper = strupper_libc_mb, + .wc_isdigit = wc_isdigit_libc_sb, + .wc_isalpha = wc_isalpha_libc_sb, + .wc_isalnum = wc_isalnum_libc_sb, + .wc_isupper = wc_isupper_libc_sb, + .wc_islower = wc_islower_libc_sb, + .wc_isgraph = wc_isgraph_libc_sb, + .wc_isprint = wc_isprint_libc_sb, + .wc_ispunct = wc_ispunct_libc_sb, + .wc_isspace = wc_isspace_libc_sb, + .char_is_cased = char_is_cased_libc, + .char_tolower = char_tolower_libc, + .wc_toupper = toupper_libc_sb, + .wc_tolower = tolower_libc_sb, + .max_chr = UCHAR_MAX, +}; + +static const struct ctype_methods ctype_methods_libc_utf8 = { + .strlower = strlower_libc_mb, + .strtitle = strtitle_libc_mb, + .strupper = strupper_libc_mb, + .wc_isdigit = wc_isdigit_libc_mb, + .wc_isalpha = wc_isalpha_libc_mb, + .wc_isalnum = wc_isalnum_libc_mb, + .wc_isupper = wc_isupper_libc_mb, + .wc_islower = wc_islower_libc_mb, + .wc_isgraph = wc_isgraph_libc_mb, + .wc_isprint = wc_isprint_libc_mb, + .wc_ispunct = wc_ispunct_libc_mb, + .wc_isspace = wc_isspace_libc_mb, + .char_is_cased = char_is_cased_libc, + .char_tolower = char_tolower_libc, + .wc_toupper = toupper_libc_mb, + .wc_tolower = tolower_libc_mb, +}; + static const struct collate_methods collate_methods_libc = { .strncoll = strncoll_libc, .strnxfrm = strnxfrm_libc, @@ -120,36 +346,6 @@ static const struct collate_methods collate_methods_libc_win32_utf8 = { }; #endif -size_t -strlower_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale) -{ - if (pg_database_encoding_max_length() > 1) - return strlower_libc_mb(dst, dstsize, src, srclen, locale); - else - return strlower_libc_sb(dst, dstsize, src, srclen, locale); -} - -size_t -strtitle_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale) -{ - if (pg_database_encoding_max_length() > 1) - return strtitle_libc_mb(dst, dstsize, src, 
srclen, locale); - else - return strtitle_libc_sb(dst, dstsize, src, srclen, locale); -} - -size_t -strupper_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale) -{ - if (pg_database_encoding_max_length() > 1) - return strupper_libc_mb(dst, dstsize, src, srclen, locale); - else - return strupper_libc_sb(dst, dstsize, src, srclen, locale); -} - static size_t strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) @@ -482,6 +678,15 @@ create_pg_locale_libc(Oid collid, MemoryContext context) #endif result->collate = &collate_methods_libc; } + if (!result->ctype_is_c) + { + if (GetDatabaseEncoding() == PG_UTF8) + result->ctype = &ctype_methods_libc_utf8; + else if (pg_database_encoding_max_length() > 1) + result->ctype = &ctype_methods_libc_other_mb; + else + result->ctype = &ctype_methods_libc_sb; + } return result; } diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 2bc3a7df2d9..cac05c69d34 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -12,6 +12,8 @@ #ifndef _PG_LOCALE_ #define _PG_LOCALE_ +#include "mb/pg_wchar.h" + #ifdef USE_ICU #include <unicode/ucol.h> #endif @@ -77,6 +79,49 @@ struct collate_methods bool strxfrm_is_safe; }; +struct ctype_methods +{ + /* case mapping: LOWER()/INITCAP()/UPPER() */ + size_t (*strlower) (char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); + size_t (*strtitle) (char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); + size_t (*strupper) (char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); + + /* required */ + bool (*wc_isdigit) (pg_wchar wc, pg_locale_t locale); + bool (*wc_isalpha) (pg_wchar wc, pg_locale_t locale); + bool (*wc_isalnum) (pg_wchar wc, pg_locale_t locale); + bool (*wc_isupper) (pg_wchar wc, pg_locale_t locale); + bool (*wc_islower) (pg_wchar wc, pg_locale_t locale); + bool 
(*wc_isgraph) (pg_wchar wc, pg_locale_t locale); + bool (*wc_isprint) (pg_wchar wc, pg_locale_t locale); + bool (*wc_ispunct) (pg_wchar wc, pg_locale_t locale); + bool (*wc_isspace) (pg_wchar wc, pg_locale_t locale); + pg_wchar (*wc_toupper) (pg_wchar wc, pg_locale_t locale); + pg_wchar (*wc_tolower) (pg_wchar wc, pg_locale_t locale); + + /* required */ + bool (*char_is_cased) (char ch, pg_locale_t locale); + + /* + * Optional. If defined, will only be called for single-byte encodings. If + * not defined, or if the encoding is multibyte, will fall back to + * pg_strlower(). + */ + char (*char_tolower) (unsigned char ch, pg_locale_t locale); + + /* + * For regex and pattern matching efficiency, the maximum char value + * supported by the above methods. If zero, limit is set by regex code. + */ + pg_wchar max_chr; +}; + /* * We use a discriminated union to hold either a locale_t or an ICU collator. * pg_locale_t is occasionally checked for truth, so make it a pointer. @@ -102,6 +147,7 @@ struct pg_locale_struct bool is_default; const struct collate_methods *collate; /* NULL if collate_is_c */ + const struct ctype_methods *ctype; /* NULL if ctype_is_c */ union { @@ -125,6 +171,9 @@ extern void init_database_collation(void); extern pg_locale_t pg_newlocale_from_collation(Oid collid); extern char *get_collation_actual_version(char collprovider, const char *collcollate); +extern bool char_is_cased(char ch, pg_locale_t locale); +extern bool char_tolower_enabled(pg_locale_t locale); +extern char char_tolower(unsigned char ch, pg_locale_t locale); extern size_t pg_strlower(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 668bddbfcd7..9aa19f88b5b 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1829,7 +1829,6 @@ PGTargetServerType PGTernaryBool PGTransactionStatusType PGVerbosity -PG_Locale_Strategy PG_Lock_Status 
PG_init_t PGcancel -- 2.34.1
From 6f434248cdabd9e9ada75b6eabe62e77f5a22e6a Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Mon, 7 Oct 2024 12:51:27 -0700 Subject: [PATCH v14 2/4] Remove provider field from pg_locale_t. The behavior of pg_locale_t is entirely specified by methods, so a separate provider field is no longer necessary. --- src/backend/utils/adt/pg_locale_builtin.c | 1 - src/backend/utils/adt/pg_locale_icu.c | 11 ----------- src/backend/utils/adt/pg_locale_libc.c | 6 ------ src/include/utils/pg_locale.h | 1 - 4 files changed, 19 deletions(-) diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c index 5f43658ab5b..9ea5a461e84 100644 --- a/src/backend/utils/adt/pg_locale_builtin.c +++ b/src/backend/utils/adt/pg_locale_builtin.c @@ -230,7 +230,6 @@ create_pg_locale_builtin(Oid collid, MemoryContext context) result->info.builtin.locale = MemoryContextStrdup(context, locstr); result->info.builtin.casemap_full = (strcmp(locstr, "PG_UNICODE_FAST") == 0); - result->provider = COLLPROVIDER_BUILTIN; result->deterministic = true; result->collate_is_c = true; result->ctype_is_c = (strcmp(locstr, "C") == 0); diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c index 3e9a2e0cfaa..e4f0398c217 100644 --- a/src/backend/utils/adt/pg_locale_icu.c +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -288,7 +288,6 @@ create_pg_locale_icu(Oid collid, MemoryContext context) result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); result->info.icu.locale = MemoryContextStrdup(context, iculocstr); result->info.icu.ucol = collator; - result->provider = COLLPROVIDER_ICU; result->deterministic = deterministic; result->collate_is_c = false; result->ctype_is_c = false; @@ -545,8 +544,6 @@ strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2 int result; UErrorCode status; - Assert(locale->provider == COLLPROVIDER_ICU); - Assert(GetDatabaseEncoding() == PG_UTF8); status = 
U_ZERO_ERROR; @@ -574,8 +571,6 @@ strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, size_t uchar_bsize; Size result_bsize; - Assert(locale->provider == COLLPROVIDER_ICU); - init_icu_converter(); ulen = uchar_length(icu_converter, src, srclen); @@ -620,8 +615,6 @@ strnxfrm_prefix_icu_utf8(char *dest, size_t destsize, uint32_t state[2]; UErrorCode status; - Assert(locale->provider == COLLPROVIDER_ICU); - Assert(GetDatabaseEncoding() == PG_UTF8); uiter_setUTF8(&iter, src, srclen); @@ -788,8 +781,6 @@ strncoll_icu(const char *arg1, ssize_t len1, *uchar2; int result; - Assert(locale->provider == COLLPROVIDER_ICU); - /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */ #ifdef HAVE_UCOL_STRCOLLUTF8 Assert(GetDatabaseEncoding() != PG_UTF8); @@ -838,8 +829,6 @@ strnxfrm_prefix_icu(char *dest, size_t destsize, size_t uchar_bsize; Size result_bsize; - Assert(locale->provider == COLLPROVIDER_ICU); - /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */ Assert(GetDatabaseEncoding() != PG_UTF8); diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c index 1144c6ff304..1582f8cdd2a 100644 --- a/src/backend/utils/adt/pg_locale_libc.c +++ b/src/backend/utils/adt/pg_locale_libc.c @@ -662,7 +662,6 @@ create_pg_locale_libc(Oid collid, MemoryContext context) loc = make_libc_collator(collate, ctype); result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); - result->provider = COLLPROVIDER_LIBC; result->deterministic = true; result->collate_is_c = (strcmp(collate, "C") == 0) || (strcmp(collate, "POSIX") == 0); @@ -782,8 +781,6 @@ strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, const char *arg2n; int result; - Assert(locale->provider == COLLPROVIDER_LIBC); - if (bufsize1 + bufsize2 > TEXTBUFLEN) buf = palloc(bufsize1 + bufsize2); @@ -838,8 +835,6 @@ strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, size_t bufsize = srclen + 1; 
size_t result; - Assert(locale->provider == COLLPROVIDER_LIBC); - if (srclen == -1) return strxfrm_l(dest, src, destsize, locale->info.lt); @@ -948,7 +943,6 @@ strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2, int r; int result; - Assert(locale->provider == COLLPROVIDER_LIBC); Assert(GetDatabaseEncoding() == PG_UTF8); if (len1 == -1) diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index cac05c69d34..11e1810eeb8 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -140,7 +140,6 @@ struct ctype_methods */ struct pg_locale_struct { - char provider; bool deterministic; bool collate_is_c; bool ctype_is_c; -- 2.34.1
From edf86ee0af1a36ef379118b84c3cef65b71ff9c5 Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Mon, 7 Oct 2024 13:36:44 -0700 Subject: [PATCH v14 3/4] Make provider data in pg_locale_t an opaque pointer. --- src/backend/utils/adt/pg_locale_builtin.c | 49 +++++-- src/backend/utils/adt/pg_locale_icu.c | 40 ++++-- src/backend/utils/adt/pg_locale_libc.c | 167 +++++++++++++++------- src/include/utils/pg_locale.h | 17 +-- 4 files changed, 192 insertions(+), 81 deletions(-) diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c index 9ea5a461e84..de328e05a78 100644 --- a/src/backend/utils/adt/pg_locale_builtin.c +++ b/src/backend/utils/adt/pg_locale_builtin.c @@ -26,6 +26,12 @@ extern pg_locale_t create_pg_locale_builtin(Oid collid, MemoryContext context); extern char *get_collation_actual_version_builtin(const char *collcollate); +struct builtin_provider +{ + const char *locale; + bool casemap_full; +}; + struct WordBoundaryState { const char *str; @@ -71,14 +77,19 @@ static size_t strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { + struct builtin_provider *builtin; + + builtin = (struct builtin_provider *) locale->provider_data; + return unicode_strlower(dest, destsize, src, srclen, - locale->info.builtin.casemap_full); + builtin->casemap_full); } static size_t strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { + struct builtin_provider *builtin; struct WordBoundaryState wbstate = { .str = src, .len = srclen, @@ -87,8 +98,10 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, .prev_alnum = false, }; + builtin = (struct builtin_provider *) locale->provider_data; + return unicode_strtitle(dest, destsize, src, srclen, - locale->info.builtin.casemap_full, + builtin->casemap_full, initcap_wbnext, &wbstate); } @@ -96,14 +109,22 @@ static size_t strupper_builtin(char *dest, size_t 
destsize, const char *src, ssize_t srclen, pg_locale_t locale) { + struct builtin_provider *builtin; + + builtin = (struct builtin_provider *) locale->provider_data; + return unicode_strupper(dest, destsize, src, srclen, - locale->info.builtin.casemap_full); + builtin->casemap_full); } static bool wc_isdigit_builtin(pg_wchar wc, pg_locale_t locale) { - return pg_u_isdigit(wc, !locale->info.builtin.casemap_full); + struct builtin_provider *builtin; + + builtin = (struct builtin_provider *) locale->provider_data; + + return pg_u_isdigit(wc, !builtin->casemap_full); } static bool @@ -115,7 +136,11 @@ wc_isalpha_builtin(pg_wchar wc, pg_locale_t locale) static bool wc_isalnum_builtin(pg_wchar wc, pg_locale_t locale) { - return pg_u_isalnum(wc, !locale->info.builtin.casemap_full); + struct builtin_provider *builtin; + + builtin = (struct builtin_provider *) locale->provider_data; + + return pg_u_isalnum(wc, !builtin->casemap_full); } static bool @@ -145,7 +170,11 @@ wc_isprint_builtin(pg_wchar wc, pg_locale_t locale) static bool wc_ispunct_builtin(pg_wchar wc, pg_locale_t locale) { - return pg_u_ispunct(wc, !locale->info.builtin.casemap_full); + struct builtin_provider *builtin; + + builtin = (struct builtin_provider *) locale->provider_data; + + return pg_u_ispunct(wc, !builtin->casemap_full); } static bool @@ -195,6 +224,7 @@ pg_locale_t create_pg_locale_builtin(Oid collid, MemoryContext context) { const char *locstr; + struct builtin_provider *builtin; pg_locale_t result; if (collid == DEFAULT_COLLATION_OID) @@ -228,8 +258,11 @@ create_pg_locale_builtin(Oid collid, MemoryContext context) result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); - result->info.builtin.locale = MemoryContextStrdup(context, locstr); - result->info.builtin.casemap_full = (strcmp(locstr, "PG_UNICODE_FAST") == 0); + builtin = MemoryContextAllocZero(context, sizeof(struct builtin_provider)); + builtin->locale = MemoryContextStrdup(context, locstr); + builtin->casemap_full = 
(strcmp(locstr, "PG_UNICODE_FAST") == 0); + result->provider_data = (void *) builtin; + result->deterministic = true; result->collate_is_c = true; result->ctype_is_c = (strcmp(locstr, "C") == 0); diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c index e4f0398c217..7bd58f26c44 100644 --- a/src/backend/utils/adt/pg_locale_icu.c +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -51,6 +51,12 @@ extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context); #ifdef USE_ICU +struct icu_provider +{ + const char *locale; + UCollator *ucol; +}; + extern UCollator *pg_ucol_open(const char *loc_str); static size_t strlower_icu(char *dst, size_t dstsize, const char *src, @@ -235,6 +241,7 @@ create_pg_locale_icu(Oid collid, MemoryContext context) bool deterministic; const char *iculocstr; const char *icurules = NULL; + struct icu_provider *icu; UCollator *collator; pg_locale_t result; @@ -286,8 +293,12 @@ create_pg_locale_icu(Oid collid, MemoryContext context) collator = make_icu_collator(iculocstr, icurules); result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); - result->info.icu.locale = MemoryContextStrdup(context, iculocstr); - result->info.icu.ucol = collator; + + icu = MemoryContextAllocZero(context, sizeof(struct icu_provider)); + icu->locale = MemoryContextStrdup(context, iculocstr); + icu->ucol = collator; + result->provider_data = (void *) icu; + result->deterministic = deterministic; result->collate_is_c = false; result->ctype_is_c = false; @@ -543,11 +554,12 @@ strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2 { int result; UErrorCode status; + struct icu_provider *icu = (struct icu_provider *) locale->provider_data; Assert(GetDatabaseEncoding() == PG_UTF8); status = U_ZERO_ERROR; - result = ucol_strcollUTF8(locale->info.icu.ucol, + result = ucol_strcollUTF8(icu->ucol, arg1, len1, arg2, len2, &status); @@ -571,6 +583,8 @@ strnxfrm_icu(char *dest, size_t destsize, 
const char *src, ssize_t srclen, size_t uchar_bsize; Size result_bsize; + struct icu_provider *icu = (struct icu_provider *) locale->provider_data; + init_icu_converter(); ulen = uchar_length(icu_converter, src, srclen); @@ -584,7 +598,7 @@ strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen); - result_bsize = ucol_getSortKey(locale->info.icu.ucol, + result_bsize = ucol_getSortKey(icu->ucol, uchar, ulen, (uint8_t *) dest, destsize); @@ -615,12 +629,14 @@ strnxfrm_prefix_icu_utf8(char *dest, size_t destsize, uint32_t state[2]; UErrorCode status; + struct icu_provider *icu = (struct icu_provider *) locale->provider_data; + Assert(GetDatabaseEncoding() == PG_UTF8); uiter_setUTF8(&iter, src, srclen); state[0] = state[1] = 0; /* won't need that again */ status = U_ZERO_ERROR; - result = ucol_nextSortKeyPart(locale->info.icu.ucol, + result = ucol_nextSortKeyPart(icu->ucol, &iter, state, (uint8_t *) dest, @@ -727,11 +743,13 @@ icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale, UErrorCode status; int32_t len_dest; + struct icu_provider *icu = (struct icu_provider *) mylocale->provider_data; + len_dest = len_source; /* try first with same length */ *buff_dest = palloc(len_dest * sizeof(**buff_dest)); status = U_ZERO_ERROR; len_dest = func(*buff_dest, len_dest, buff_source, len_source, - mylocale->info.icu.locale, &status); + icu->locale, &status); if (status == U_BUFFER_OVERFLOW_ERROR) { /* try again with adjusted length */ @@ -739,7 +757,7 @@ icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale, *buff_dest = palloc(len_dest * sizeof(**buff_dest)); status = U_ZERO_ERROR; len_dest = func(*buff_dest, len_dest, buff_source, len_source, - mylocale->info.icu.locale, &status); + icu->locale, &status); } if (U_FAILURE(status)) ereport(ERROR, @@ -781,6 +799,8 @@ strncoll_icu(const char *arg1, ssize_t len1, *uchar2; int result; + struct icu_provider *icu = (struct icu_provider 
*) locale->provider_data; + /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */ #ifdef HAVE_UCOL_STRCOLLUTF8 Assert(GetDatabaseEncoding() != PG_UTF8); @@ -803,7 +823,7 @@ strncoll_icu(const char *arg1, ssize_t len1, ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1); ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2); - result = ucol_strcoll(locale->info.icu.ucol, + result = ucol_strcoll(icu->ucol, uchar1, ulen1, uchar2, ulen2); @@ -829,6 +849,8 @@ strnxfrm_prefix_icu(char *dest, size_t destsize, size_t uchar_bsize; Size result_bsize; + struct icu_provider *icu = (struct icu_provider *) locale->provider_data; + /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */ Assert(GetDatabaseEncoding() != PG_UTF8); @@ -848,7 +870,7 @@ strnxfrm_prefix_icu(char *dest, size_t destsize, uiter_setString(&iter, uchar, ulen); state[0] = state[1] = 0; /* won't need that again */ status = U_ZERO_ERROR; - result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol, + result_bsize = ucol_nextSortKeyPart(icu->ucol, &iter, state, (uint8_t *) dest, diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c index 1582f8cdd2a..d357962ebdf 100644 --- a/src/backend/utils/adt/pg_locale_libc.c +++ b/src/backend/utils/adt/pg_locale_libc.c @@ -1,3 +1,4 @@ + /*----------------------------------------------------------------------- * * PostgreSQL locale utilities for libc @@ -41,6 +42,11 @@ */ #define TEXTBUFLEN 1024 +struct libc_provider +{ + locale_t lt; +}; + extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context); static int strncoll_libc(const char *arg1, ssize_t len1, @@ -82,116 +88,154 @@ static size_t strupper_libc_mb(char *dest, size_t destsize, static bool wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale) { - return isdigit_l((unsigned char) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return isdigit_l((unsigned 
char) wc, libc->lt); } static bool wc_isalpha_libc_sb(pg_wchar wc, pg_locale_t locale) { - return isalpha_l((unsigned char) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return isalpha_l((unsigned char) wc, libc->lt); } static bool wc_isalnum_libc_sb(pg_wchar wc, pg_locale_t locale) { - return isalnum_l((unsigned char) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return isalnum_l((unsigned char) wc, libc->lt); } static bool wc_isupper_libc_sb(pg_wchar wc, pg_locale_t locale) { - return isupper_l((unsigned char) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return isupper_l((unsigned char) wc, libc->lt); } static bool wc_islower_libc_sb(pg_wchar wc, pg_locale_t locale) { - return islower_l((unsigned char) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return islower_l((unsigned char) wc, libc->lt); } static bool wc_isgraph_libc_sb(pg_wchar wc, pg_locale_t locale) { - return isgraph_l((unsigned char) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return isgraph_l((unsigned char) wc, libc->lt); } static bool wc_isprint_libc_sb(pg_wchar wc, pg_locale_t locale) { - return isprint_l((unsigned char) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return isprint_l((unsigned char) wc, libc->lt); } static bool wc_ispunct_libc_sb(pg_wchar wc, pg_locale_t locale) { - return ispunct_l((unsigned char) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return ispunct_l((unsigned char) wc, libc->lt); } static bool wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale) { - return isspace_l((unsigned char) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) 
locale->provider_data; + + return isspace_l((unsigned char) wc, libc->lt); } static bool wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale) { - return iswdigit_l((wint_t) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return iswdigit_l((wint_t) wc, libc->lt); } static bool wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale) { - return iswalpha_l((wint_t) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return iswalpha_l((wint_t) wc, libc->lt); } static bool wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale) { - return iswalnum_l((wint_t) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return iswalnum_l((wint_t) wc, libc->lt); } static bool wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale) { - return iswupper_l((wint_t) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return iswupper_l((wint_t) wc, libc->lt); } static bool wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale) { - return iswlower_l((wint_t) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return iswlower_l((wint_t) wc, libc->lt); } static bool wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale) { - return iswgraph_l((wint_t) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return iswgraph_l((wint_t) wc, libc->lt); } static bool wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale) { - return iswprint_l((wint_t) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return iswprint_l((wint_t) wc, libc->lt); } static bool wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale) { - return iswpunct_l((wint_t) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return 
iswpunct_l((wint_t) wc, libc->lt); } static bool wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale) { - return iswspace_l((wint_t) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return iswspace_l((wint_t) wc, libc->lt); } static char char_tolower_libc(unsigned char ch, pg_locale_t locale) { + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + Assert(pg_database_encoding_max_length() == 1); - return tolower_l(ch, locale->info.lt); + return tolower_l(ch, libc->lt); } static bool @@ -199,19 +243,23 @@ char_is_cased_libc(char ch, pg_locale_t locale) { bool is_multibyte = pg_database_encoding_max_length() > 1; + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + if (is_multibyte && IS_HIGHBIT_SET(ch)) return true; else - return isalpha_l((unsigned char) ch, locale->info.lt); + return isalpha_l((unsigned char) ch, libc->lt); } static pg_wchar toupper_libc_sb(pg_wchar wc, pg_locale_t locale) { + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + Assert(GetDatabaseEncoding() != PG_UTF8); if (wc <= (pg_wchar) UCHAR_MAX) - return toupper_l((unsigned char) wc, locale->info.lt); + return toupper_l((unsigned char) wc, libc->lt); else return wc; } @@ -219,10 +267,12 @@ toupper_libc_sb(pg_wchar wc, pg_locale_t locale) static pg_wchar toupper_libc_mb(pg_wchar wc, pg_locale_t locale) { + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + Assert(GetDatabaseEncoding() == PG_UTF8); if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF) - return towupper_l((wint_t) wc, locale->info.lt); + return towupper_l((wint_t) wc, libc->lt); else return wc; } @@ -230,10 +280,12 @@ toupper_libc_mb(pg_wchar wc, pg_locale_t locale) static pg_wchar tolower_libc_sb(pg_wchar wc, pg_locale_t locale) { + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + Assert(GetDatabaseEncoding() != PG_UTF8); if (wc <= 
(pg_wchar) UCHAR_MAX) - return tolower_l((unsigned char) wc, locale->info.lt); + return tolower_l((unsigned char) wc, libc->lt); else return wc; } @@ -241,10 +293,12 @@ tolower_libc_sb(pg_wchar wc, pg_locale_t locale) static pg_wchar tolower_libc_mb(pg_wchar wc, pg_locale_t locale) { + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + Assert(GetDatabaseEncoding() == PG_UTF8); if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF) - return towlower_l((wint_t) wc, locale->info.lt); + return towlower_l((wint_t) wc, libc->lt); else return wc; } @@ -355,7 +409,7 @@ strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, if (srclen + 1 <= destsize) { - locale_t loc = locale->info.lt; + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; char *p; if (srclen + 1 > destsize) @@ -376,7 +430,7 @@ strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, if (locale->is_default) *p = pg_tolower((unsigned char) *p); else - *p = tolower_l((unsigned char) *p, loc); + *p = tolower_l((unsigned char) *p, libc->lt); } } @@ -387,7 +441,8 @@ static size_t strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { - locale_t loc = locale->info.lt; + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + size_t result_size; wchar_t *workspace; char *result; @@ -409,7 +464,7 @@ strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, char2wchar(workspace, srclen + 1, src, srclen, locale); for (curr_char = 0; workspace[curr_char] != 0; curr_char++) - workspace[curr_char] = towlower_l(workspace[curr_char], loc); + workspace[curr_char] = towlower_l(workspace[curr_char], libc->lt); /* * Make result large enough; case change might change number of bytes @@ -440,7 +495,7 @@ strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, if (srclen + 1 <= destsize) { - locale_t loc = locale->info.lt; 
+ struct libc_provider *libc = (struct libc_provider *) locale->provider_data; int wasalnum = false; char *p; @@ -466,11 +521,11 @@ strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, else { if (wasalnum) - *p = tolower_l((unsigned char) *p, loc); + *p = tolower_l((unsigned char) *p, libc->lt); else - *p = toupper_l((unsigned char) *p, loc); + *p = toupper_l((unsigned char) *p, libc->lt); } - wasalnum = isalnum_l((unsigned char) *p, loc); + wasalnum = isalnum_l((unsigned char) *p, libc->lt); } } @@ -481,7 +536,8 @@ static size_t strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { - locale_t loc = locale->info.lt; + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + int wasalnum = false; size_t result_size; wchar_t *workspace; @@ -506,10 +562,10 @@ strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, for (curr_char = 0; workspace[curr_char] != 0; curr_char++) { if (wasalnum) - workspace[curr_char] = towlower_l(workspace[curr_char], loc); + workspace[curr_char] = towlower_l(workspace[curr_char], libc->lt); else - workspace[curr_char] = towupper_l(workspace[curr_char], loc); - wasalnum = iswalnum_l(workspace[curr_char], loc); + workspace[curr_char] = towupper_l(workspace[curr_char], libc->lt); + wasalnum = iswalnum_l(workspace[curr_char], libc->lt); } /* @@ -541,7 +597,7 @@ strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, if (srclen + 1 <= destsize) { - locale_t loc = locale->info.lt; + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; char *p; memcpy(dest, src, srclen); @@ -559,7 +615,7 @@ strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, if (locale->is_default) *p = pg_toupper((unsigned char) *p); else - *p = toupper_l((unsigned char) *p, loc); + *p = toupper_l((unsigned char) *p, libc->lt); } } @@ -570,7 +626,8 @@ static size_t strupper_libc_mb(char 
*dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { - locale_t loc = locale->info.lt; + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + size_t result_size; wchar_t *workspace; char *result; @@ -592,7 +649,7 @@ strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, char2wchar(workspace, srclen + 1, src, srclen, locale); for (curr_char = 0; workspace[curr_char] != 0; curr_char++) - workspace[curr_char] = towupper_l(workspace[curr_char], loc); + workspace[curr_char] = towupper_l(workspace[curr_char], libc->lt); /* * Make result large enough; case change might change number of bytes @@ -620,6 +677,7 @@ create_pg_locale_libc(Oid collid, MemoryContext context) const char *collate; const char *ctype; locale_t loc; + struct libc_provider *libc; pg_locale_t result; if (collid == DEFAULT_COLLATION_OID) @@ -658,16 +716,19 @@ create_pg_locale_libc(Oid collid, MemoryContext context) ReleaseSysCache(tp); } - loc = make_libc_collator(collate, ctype); result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); + + libc = MemoryContextAllocZero(context, sizeof(struct libc_provider)); + libc->lt = loc; + result->provider_data = (void *) libc; + result->deterministic = true; result->collate_is_c = (strcmp(collate, "C") == 0) || (strcmp(collate, "POSIX") == 0); result->ctype_is_c = (strcmp(ctype, "C") == 0) || (strcmp(ctype, "POSIX") == 0); - result->info.lt = loc; if (!result->collate_is_c) { #ifdef WIN32 @@ -781,6 +842,8 @@ strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, const char *arg2n; int result; + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + if (bufsize1 + bufsize2 > TEXTBUFLEN) buf = palloc(bufsize1 + bufsize2); @@ -811,7 +874,7 @@ strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, arg2n = buf2; } - result = strcoll_l(arg1n, arg2n, locale->info.lt); + result = strcoll_l(arg1n, arg2n, 
libc->lt); if (buf != sbuf) pfree(buf); @@ -835,8 +898,10 @@ strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, size_t bufsize = srclen + 1; size_t result; + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + if (srclen == -1) - return strxfrm_l(dest, src, destsize, locale->info.lt); + return strxfrm_l(dest, src, destsize, libc->lt); if (bufsize > TEXTBUFLEN) buf = palloc(bufsize); @@ -845,7 +910,7 @@ strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, memcpy(buf, src, srclen); buf[srclen] = '\0'; - result = strxfrm_l(dest, buf, destsize, locale->info.lt); + result = strxfrm_l(dest, buf, destsize, libc->lt); if (buf != sbuf) pfree(buf); @@ -943,6 +1008,8 @@ strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2, int r; int result; + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + Assert(GetDatabaseEncoding() == PG_UTF8); if (len1 == -1) @@ -987,7 +1054,7 @@ strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2, ((LPWSTR) a2p)[r] = 0; errno = 0; - result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt); + result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, libc->lt); if (result == 2147483647) /* _NLSCMPERROR; missing from mingw headers */ ereport(ERROR, (errmsg("could not compare Unicode strings: %m"))); @@ -1116,8 +1183,10 @@ wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale) } else { + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + /* Use wcstombs_l for nondefault locales */ - result = wcstombs_l(to, from, tolen, locale->info.lt); + result = wcstombs_l(to, from, tolen, libc->lt); } return result; @@ -1176,8 +1245,10 @@ char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, } else { + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + /* Use mbstowcs_l for nondefault locales */ - result = mbstowcs_l(to, str, tolen, 
locale->info.lt); + result = mbstowcs_l(to, str, tolen, libc->lt); } pfree(str); diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 11e1810eeb8..74dd8435a6b 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -148,22 +148,7 @@ struct pg_locale_struct const struct collate_methods *collate; /* NULL if collate_is_c */ const struct ctype_methods *ctype; /* NULL if ctype_is_c */ - union - { - struct - { - const char *locale; - bool casemap_full; - } builtin; - locale_t lt; -#ifdef USE_ICU - struct - { - const char *locale; - UCollator *ucol; - } icu; -#endif - } info; + void *provider_data; }; extern void init_database_collation(void); -- 2.34.1
From c7abdaf4198e6e8d8e812523f82663fb3bede1e7 Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Wed, 9 Oct 2024 10:00:58 -0700 Subject: [PATCH v14 4/4] Don't include ICU headers in pg_locale.h. --- src/backend/commands/collationcmds.c | 4 ++++ src/backend/utils/adt/formatting.c | 4 ---- src/backend/utils/adt/pg_locale.c | 4 ++++ src/backend/utils/adt/pg_locale_icu.c | 1 + src/backend/utils/adt/varlena.c | 4 ++++ src/include/utils/pg_locale.h | 4 ---- 6 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c index 8acbfbbeda0..a57fe93c387 100644 --- a/src/backend/commands/collationcmds.c +++ b/src/backend/commands/collationcmds.c @@ -14,6 +14,10 @@ */ #include "postgres.h" +#ifdef USE_ICU +#include <unicode/ucol.h> +#endif + #include "access/htup_details.h" #include "access/table.h" #include "access/xact.h" diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 3960235e14e..2ba4ca7f0f2 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -71,10 +71,6 @@ #include <limits.h> #include <wctype.h> -#ifdef USE_ICU -#include <unicode/ustring.h> -#endif - #include "catalog/pg_collation.h" #include "catalog/pg_type.h" #include "common/int.h" diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 5b78237f72e..f73888de68c 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -54,6 +54,10 @@ #include <time.h> +#ifdef USE_ICU +#include <unicode/ucol.h> +#endif + #include "access/htup_details.h" #include "catalog/pg_collation.h" #include "catalog/pg_database.h" diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c index 7bd58f26c44..0469c52b669 100644 --- a/src/backend/utils/adt/pg_locale_icu.c +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -13,6 +13,7 @@ #ifdef USE_ICU #include <unicode/ucnv.h> +#include 
<unicode/ucol.h> #include <unicode/ustring.h> /* diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 34796f2e27c..c57262e1888 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -17,6 +17,10 @@ #include <ctype.h> #include <limits.h> +#ifdef USE_ICU +#include <unicode/uchar.h> +#endif + #include "access/detoast.h" #include "access/toast_compression.h" #include "catalog/pg_collation.h" diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 74dd8435a6b..acb4890a78a 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -14,10 +14,6 @@ #include "mb/pg_wchar.h" -#ifdef USE_ICU -#include <unicode/ucol.h> -#endif - /* use for libc locale names */ #define LOCALE_NAME_BUFLEN 128 -- 2.34.1