> I'm still inlined to think the method table is a good thing to do: > > (a) The performance cases I tried seem implausibly bad -- running > character classification patterns over large fields consisting only > of > codepoints over U+07FF. > > (b) The method tables seem like a better code organization that > separates the responsibilities of the provider from the calling code. > It's also a requirement (or nearly so) if we want to provide some > pluggability or support multiple library versions. > > It would be good to hear from others on these points, though.
Attached v15. Just a rebase. I'd still like some input here. We could either: * commit this on the grounds that it's a desirable code improvement and the worst-case regression isn't a major concern; or * wait until v19 when we might have a more compelling use for the method table (e.g. pluggable provider or multilib) Regards, Jeff Davis
From 7abb1f6c9f8c845c20d6baf648e4bba075f4a3cb Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Fri, 29 Nov 2024 09:37:43 -0800 Subject: [PATCH v15 1/4] Control ctype behavior internally with a method table. Previously, pattern matching and case mapping behavior branched based on the provider. Refactor to use a method table, which is less error-prone and easier to hook. --- src/backend/regex/regc_pg_locale.c | 377 +++++----------------- src/backend/utils/adt/like.c | 22 +- src/backend/utils/adt/like_support.c | 7 +- src/backend/utils/adt/pg_locale.c | 121 +++---- src/backend/utils/adt/pg_locale_builtin.c | 111 ++++++- src/backend/utils/adt/pg_locale_icu.c | 116 ++++++- src/backend/utils/adt/pg_locale_libc.c | 279 +++++++++++++--- src/include/utils/pg_locale.h | 52 +++ src/tools/pgindent/typedefs.list | 1 - 9 files changed, 630 insertions(+), 456 deletions(-) diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index ed7411df83d..31b8f4a9478 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -63,18 +63,13 @@ * NB: the coding here assumes pg_wchar is an unsigned type. */ -typedef enum -{ - PG_REGEX_STRATEGY_C, /* C locale (encoding independent) */ - PG_REGEX_STRATEGY_BUILTIN, /* built-in Unicode semantics */ - PG_REGEX_STRATEGY_LIBC_WIDE, /* Use locale_t <wctype.h> functions */ - PG_REGEX_STRATEGY_LIBC_1BYTE, /* Use locale_t <ctype.h> functions */ - PG_REGEX_STRATEGY_ICU, /* Use ICU uchar.h functions */ -} PG_Locale_Strategy; - -static PG_Locale_Strategy pg_regex_strategy; static pg_locale_t pg_regex_locale; +static struct pg_locale_struct dummy_c_locale = { + .collate_is_c = true, + .ctype_is_c = true, +}; + /* * Hard-wired character properties for C locale */ @@ -231,7 +226,6 @@ void pg_set_regex_collation(Oid collation) { pg_locale_t locale = 0; - PG_Locale_Strategy strategy; if (!OidIsValid(collation)) { @@ -252,8 +246,7 @@ pg_set_regex_collation(Oid collation) * catalog access is available, so we can't call * pg_newlocale_from_collation(). */ - strategy = PG_REGEX_STRATEGY_C; - locale = 0; + locale = &dummy_c_locale; } else { @@ -270,113 +263,41 @@ pg_set_regex_collation(Oid collation) * C/POSIX collations use this path regardless of database * encoding */ - strategy = PG_REGEX_STRATEGY_C; - locale = 0; - } - else if (locale->provider == COLLPROVIDER_BUILTIN) - { - Assert(GetDatabaseEncoding() == PG_UTF8); - strategy = PG_REGEX_STRATEGY_BUILTIN; - } -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - { - strategy = PG_REGEX_STRATEGY_ICU; - } -#endif - else - { - Assert(locale->provider == COLLPROVIDER_LIBC); - if (GetDatabaseEncoding() == PG_UTF8) - strategy = PG_REGEX_STRATEGY_LIBC_WIDE; - else - strategy = PG_REGEX_STRATEGY_LIBC_1BYTE; + locale = &dummy_c_locale; } } - pg_regex_strategy = strategy; pg_regex_locale = locale; } static int pg_wc_isdigit(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISDIGIT)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isdigit(c, !pg_regex_locale->info.builtin.casemap_full); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswdigit_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isdigit_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isdigit(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISDIGIT)); + else + return pg_regex_locale->ctype->wc_isdigit(c, pg_regex_locale); } static int pg_wc_isalpha(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISALPHA)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isalpha(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswalpha_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isalpha_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isalpha(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISALPHA)); + else + return pg_regex_locale->ctype->wc_isalpha(c, pg_regex_locale); } static int pg_wc_isalnum(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISALNUM)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isalnum(c, !pg_regex_locale->info.builtin.casemap_full); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswalnum_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isalnum_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isalnum(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISALNUM)); + else + return pg_regex_locale->ctype->wc_isalnum(c, pg_regex_locale); } static int @@ -391,219 +312,87 @@ pg_wc_isword(pg_wchar c) static int pg_wc_isupper(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISUPPER)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isupper(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswupper_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isupper_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isupper(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISUPPER)); + else + return pg_regex_locale->ctype->wc_isupper(c, pg_regex_locale); } static int pg_wc_islower(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISLOWER)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_islower(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswlower_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - islower_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_islower(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISLOWER)); + else + return pg_regex_locale->ctype->wc_islower(c, pg_regex_locale); } static int pg_wc_isgraph(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISGRAPH)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isgraph(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswgraph_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isgraph_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isgraph(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISGRAPH)); + else + return pg_regex_locale->ctype->wc_isgraph(c, pg_regex_locale); } static int pg_wc_isprint(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISPRINT)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isprint(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswprint_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isprint_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isprint(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISPRINT)); + else + return pg_regex_locale->ctype->wc_isprint(c, pg_regex_locale); } static int pg_wc_ispunct(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISPUNCT)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_ispunct(c, !pg_regex_locale->info.builtin.casemap_full); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswpunct_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - ispunct_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_ispunct(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISPUNCT)); + else + return pg_regex_locale->ctype->wc_ispunct(c, pg_regex_locale); } static int pg_wc_isspace(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISSPACE)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isspace(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswspace_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isspace_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isspace(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISSPACE)); + else + return pg_regex_locale->ctype->wc_isspace(c, pg_regex_locale); } static pg_wchar pg_wc_toupper(pg_wchar c) { - switch (pg_regex_strategy) + if (pg_regex_locale->ctype_is_c) { - case PG_REGEX_STRATEGY_C: - if (c <= (pg_wchar) 127) - return pg_ascii_toupper((unsigned char) c); - return c; - case PG_REGEX_STRATEGY_BUILTIN: - return unicode_uppercase_simple(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return towupper_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - if (c <= (pg_wchar) UCHAR_MAX) - return toupper_l((unsigned char) c, pg_regex_locale->info.lt); - return c; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_toupper(c); -#endif - break; + if (c <= (pg_wchar) 127) + return pg_ascii_toupper((unsigned char) c); + return c; } - return 0; /* can't get here, but keep compiler quiet */ + else + return pg_regex_locale->ctype->wc_toupper(c, pg_regex_locale); } static pg_wchar pg_wc_tolower(pg_wchar c) { - switch (pg_regex_strategy) + if (pg_regex_locale->ctype_is_c) { - case PG_REGEX_STRATEGY_C: - if (c <= (pg_wchar) 127) - return pg_ascii_tolower((unsigned char) c); - return c; - case PG_REGEX_STRATEGY_BUILTIN: - return unicode_lowercase_simple(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return towlower_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - if (c <= (pg_wchar) UCHAR_MAX) - return tolower_l((unsigned char) c, pg_regex_locale->info.lt); - return c; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_tolower(c); -#endif - break; + if (c <= (pg_wchar) 127) + return pg_ascii_tolower((unsigned char) c); + return c; } - return 0; /* can't get here, but keep compiler quiet */ + else + return pg_regex_locale->ctype->wc_tolower(c, pg_regex_locale); } @@ -729,37 +518,25 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) * would always be true for production values of MAX_SIMPLE_CHR, but it's * useful to allow it to be small for testing purposes.) */ - switch (pg_regex_strategy) + if (pg_regex_locale->ctype_is_c) { - case PG_REGEX_STRATEGY_C: #if MAX_SIMPLE_CHR >= 127 - max_chr = (pg_wchar) 127; - pcc->cv.cclasscode = -1; + max_chr = (pg_wchar) 127; + pcc->cv.cclasscode = -1; #else - max_chr = (pg_wchar) MAX_SIMPLE_CHR; + max_chr = (pg_wchar) MAX_SIMPLE_CHR; #endif - break; - case PG_REGEX_STRATEGY_BUILTIN: - max_chr = (pg_wchar) MAX_SIMPLE_CHR; - break; - case PG_REGEX_STRATEGY_LIBC_WIDE: - max_chr = (pg_wchar) MAX_SIMPLE_CHR; - break; - case PG_REGEX_STRATEGY_LIBC_1BYTE: -#if MAX_SIMPLE_CHR >= UCHAR_MAX - max_chr = (pg_wchar) UCHAR_MAX; + } + else + { + if (pg_regex_locale->ctype->max_chr != 0 && + pg_regex_locale->ctype->max_chr <= MAX_SIMPLE_CHR) + { + max_chr = pg_regex_locale->ctype->max_chr; pcc->cv.cclasscode = -1; -#else - max_chr = (pg_wchar) MAX_SIMPLE_CHR; -#endif - break; - case PG_REGEX_STRATEGY_ICU: + } + else max_chr = (pg_wchar) MAX_SIMPLE_CHR; - break; - default: - Assert(false); - max_chr = 0; /* can't get here, but keep compiler quiet */ - break; } /* diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c index 7f4cf614585..4216ac17f43 100644 --- a/src/backend/utils/adt/like.c +++ b/src/backend/utils/adt/like.c @@ -98,7 +98,7 @@ SB_lower_char(unsigned char c, pg_locale_t locale) else if (locale->is_default) return pg_tolower(c); else - return tolower_l(c, locale->info.lt); + return char_tolower(c, locale); } @@ -209,7 +209,17 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation) * way. */ - if (pg_database_encoding_max_length() > 1 || (locale->provider == COLLPROVIDER_ICU)) + if (locale->ctype_is_c || + (char_tolower_enabled(locale) && + pg_database_encoding_max_length() == 1)) + { + p = VARDATA_ANY(pat); + plen = VARSIZE_ANY_EXHDR(pat); + s = VARDATA_ANY(str); + slen = VARSIZE_ANY_EXHDR(str); + return SB_IMatchText(s, slen, p, plen, locale); + } + else { pat = DatumGetTextPP(DirectFunctionCall1Coll(lower, collation, PointerGetDatum(pat))); @@ -224,14 +234,6 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation) else return MB_MatchText(s, slen, p, plen, 0); } - else - { - p = VARDATA_ANY(pat); - plen = VARSIZE_ANY_EXHDR(pat); - s = VARDATA_ANY(str); - slen = VARSIZE_ANY_EXHDR(str); - return SB_IMatchText(s, slen, p, plen, locale); - } } /* diff --git a/src/backend/utils/adt/like_support.c b/src/backend/utils/adt/like_support.c index 8fdc677371f..999f23f86d5 100644 --- a/src/backend/utils/adt/like_support.c +++ b/src/backend/utils/adt/like_support.c @@ -1495,13 +1495,8 @@ pattern_char_isalpha(char c, bool is_multibyte, { if (locale->ctype_is_c) return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); - else if (is_multibyte && IS_HIGHBIT_SET(c)) - return true; - else if (locale->provider != COLLPROVIDER_LIBC) - return IS_HIGHBIT_SET(c) || - (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); else - return isalpha_l((unsigned char) c, locale->info.lt); + return char_is_cased(c, locale); } diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 7d92f580a57..f26ce0e6cc7 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -100,31 +100,6 @@ extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context); extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context); extern char *get_collation_actual_version_libc(const char *collcollate); -extern size_t strlower_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); - -extern size_t strlower_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strfold_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); - -extern size_t strlower_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); - /* GUC settings */ char *locale_messages; char *locale_monetary; @@ -1236,6 +1211,9 @@ create_pg_locale(Oid collid, MemoryContext context) Assert((result->collate_is_c && result->collate == NULL) || (!result->collate_is_c && result->collate != NULL)); + Assert((result->ctype_is_c && result->ctype == NULL) || + (!result->ctype_is_c && result->ctype != NULL)); + datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion, &isnull); if (!isnull) @@ -1398,77 +1376,31 @@ size_t pg_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale) { - if (locale->provider == COLLPROVIDER_BUILTIN) - return strlower_builtin(dst, dstsize, src, srclen, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - return strlower_icu(dst, dstsize, src, srclen, locale); -#endif - else if (locale->provider == COLLPROVIDER_LIBC) - return strlower_libc(dst, dstsize, src, srclen, locale); - else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return 0; /* keep compiler quiet */ + return locale->ctype->strlower(dst, dstsize, src, srclen, locale); } size_t pg_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale) { - if (locale->provider == COLLPROVIDER_BUILTIN) - return strtitle_builtin(dst, dstsize, src, srclen, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - return strtitle_icu(dst, dstsize, src, srclen, locale); -#endif - else if (locale->provider == COLLPROVIDER_LIBC) - return strtitle_libc(dst, dstsize, src, srclen, locale); - else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return 0; /* keep compiler quiet */ + return locale->ctype->strtitle(dst, dstsize, src, srclen, locale); } size_t pg_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale) { - if (locale->provider == COLLPROVIDER_BUILTIN) - return strupper_builtin(dst, dstsize, src, srclen, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - return strupper_icu(dst, dstsize, src, srclen, locale); -#endif - else if (locale->provider == COLLPROVIDER_LIBC) - return strupper_libc(dst, dstsize, src, srclen, locale); - else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return 0; /* keep compiler quiet */ + return locale->ctype->strupper(dst, dstsize, src, srclen, locale); } size_t pg_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale) { - if (locale->provider == COLLPROVIDER_BUILTIN) - return strfold_builtin(dst, dstsize, src, srclen, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - return strfold_icu(dst, dstsize, src, srclen, locale); -#endif - /* for libc, just use strlower */ - else if (locale->provider == COLLPROVIDER_LIBC) - return strlower_libc(dst, dstsize, src, srclen, locale); + if (locale->ctype->strfold) + return locale->ctype->strfold(dst, dstsize, src, srclen, locale); else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return 0; /* keep compiler quiet */ + return locale->ctype->strlower(dst, dstsize, src, srclen, locale); } /* @@ -1605,6 +1537,41 @@ pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, return locale->collate->strnxfrm_prefix(dest, destsize, src, srclen, locale); } +/* + * char_is_cased() + * + * Fuzzy test of whether the given char is case-varying or not. The argument + * is a single byte, so in a multibyte encoding, just assume any non-ASCII + * char is case-varying. + */ +bool +char_is_cased(char ch, pg_locale_t locale) +{ + return locale->ctype->char_is_cased(ch, locale); +} + +/* + * char_tolower_enabled() + * + * Does the provider support char_tolower()? + */ +bool +char_tolower_enabled(pg_locale_t locale) +{ + return (locale->ctype->char_tolower != NULL); +} + +/* + * char_tolower() + * + * Convert char (single-byte encoding) to lowercase. + */ +char +char_tolower(unsigned char ch, pg_locale_t locale) +{ + return locale->ctype->char_tolower(ch, locale); +} + /* * Return required encoding ID for the given locale, or -1 if any encoding is * valid for the locale. diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c index 33ad20bbf07..23504be383a 100644 --- a/src/backend/utils/adt/pg_locale_builtin.c +++ b/src/backend/utils/adt/pg_locale_builtin.c @@ -25,15 +25,6 @@ extern pg_locale_t create_pg_locale_builtin(Oid collid, MemoryContext context); extern char *get_collation_actual_version_builtin(const char *collcollate); -extern size_t strlower_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); - struct WordBoundaryState { @@ -76,7 +67,7 @@ initcap_wbnext(void *state) return wbstate->len; } -size_t +static size_t strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -84,7 +75,7 @@ strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, locale->info.builtin.casemap_full); } -size_t +static size_t strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -101,7 +92,7 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, initcap_wbnext, &wbstate); } -size_t +static size_t strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -109,7 +100,7 @@ strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, locale->info.builtin.casemap_full); } -size_t +static size_t strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -117,6 +108,98 @@ strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, locale->info.builtin.casemap_full); } +static bool +wc_isdigit_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isdigit(wc, !locale->info.builtin.casemap_full); +} + +static bool +wc_isalpha_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isalpha(wc); +} + +static bool +wc_isalnum_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isalnum(wc, !locale->info.builtin.casemap_full); +} + +static bool +wc_isupper_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isupper(wc); +} + +static bool +wc_islower_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_islower(wc); +} + +static bool +wc_isgraph_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isgraph(wc); +} + +static bool +wc_isprint_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isprint(wc); +} + +static bool +wc_ispunct_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_ispunct(wc, !locale->info.builtin.casemap_full); +} + +static bool +wc_isspace_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isspace(wc); +} + +static bool +char_is_cased_builtin(char ch, pg_locale_t locale) +{ + return IS_HIGHBIT_SET(ch) || + (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); +} + +static pg_wchar +wc_toupper_builtin(pg_wchar wc, pg_locale_t locale) +{ + return unicode_uppercase_simple(wc); +} + +static pg_wchar +wc_tolower_builtin(pg_wchar wc, pg_locale_t locale) +{ + return unicode_lowercase_simple(wc); +} + +static const struct ctype_methods ctype_methods_builtin = { + .strlower = strlower_builtin, + .strtitle = strtitle_builtin, + .strupper = strupper_builtin, + .strfold = strfold_builtin, + .wc_isdigit = wc_isdigit_builtin, + .wc_isalpha = wc_isalpha_builtin, + .wc_isalnum = wc_isalnum_builtin, + .wc_isupper = wc_isupper_builtin, + .wc_islower = wc_islower_builtin, + .wc_isgraph = wc_isgraph_builtin, + .wc_isprint = wc_isprint_builtin, + .wc_ispunct = wc_ispunct_builtin, + .wc_isspace = wc_isspace_builtin, + .char_is_cased = char_is_cased_builtin, + .wc_tolower = wc_tolower_builtin, + .wc_toupper = wc_toupper_builtin, +}; + pg_locale_t create_pg_locale_builtin(Oid collid, MemoryContext context) { @@ -160,6 +243,8 @@ create_pg_locale_builtin(Oid collid, MemoryContext context) result->deterministic = true; result->collate_is_c = true; result->ctype_is_c = (strcmp(locstr, "C") == 0); + if (!result->ctype_is_c) + result->ctype = &ctype_methods_builtin; return result; } diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c index b0c73f2e43d..1cb24fadea2 100644 --- a/src/backend/utils/adt/pg_locale_icu.c +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -48,19 +48,19 @@ #define TEXTBUFLEN 1024 extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context); -extern size_t strlower_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strfold_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); #ifdef USE_ICU extern UCollator *pg_ucol_open(const char *loc_str); +static size_t strlower_icu(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); +static size_t strtitle_icu(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); +static size_t strupper_icu(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); +static size_t strfold_icu(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); static int strncoll_icu(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale); @@ -124,6 +124,25 @@ static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity, const char *locale, UErrorCode *pErrorCode); +static bool +char_is_cased_icu(char ch, pg_locale_t locale) +{ + return IS_HIGHBIT_SET(ch) || + (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); +} + +static pg_wchar +toupper_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_toupper(wc); +} + +static pg_wchar +tolower_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_tolower(wc); +} + static const struct collate_methods collate_methods_icu = { .strncoll = strncoll_icu, .strnxfrm = strnxfrm_icu, @@ -142,6 +161,78 @@ static const struct collate_methods collate_methods_icu_utf8 = { .strxfrm_is_safe = true, }; +static bool +wc_isdigit_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isdigit(wc); +} + +static bool +wc_isalpha_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isalpha(wc); +} + +static bool +wc_isalnum_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isalnum(wc); +} + +static bool +wc_isupper_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isupper(wc); +} + +static bool +wc_islower_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_islower(wc); +} + +static bool +wc_isgraph_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isgraph(wc); +} + +static bool +wc_isprint_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isprint(wc); +} + +static bool +wc_ispunct_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_ispunct(wc); +} + +static bool +wc_isspace_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isspace(wc); +} + +static const struct ctype_methods ctype_methods_icu = { + .strlower = strlower_icu, + .strtitle = strtitle_icu, + .strupper = strupper_icu, + .strfold = strfold_icu, + .wc_isdigit = wc_isdigit_icu, + .wc_isalpha = wc_isalpha_icu, + .wc_isalnum = wc_isalnum_icu, + .wc_isupper = wc_isupper_icu, + .wc_islower = wc_islower_icu, + .wc_isgraph = wc_isgraph_icu, + .wc_isprint = wc_isprint_icu, + .wc_ispunct = wc_ispunct_icu, + .wc_isspace = wc_isspace_icu, + .char_is_cased = char_is_cased_icu, + .wc_toupper = toupper_icu, + .wc_tolower = tolower_icu, +}; #endif pg_locale_t @@ -212,6 +303,7 @@ create_pg_locale_icu(Oid collid, MemoryContext context) result->collate = &collate_methods_icu_utf8; else result->collate = &collate_methods_icu; + result->ctype = &ctype_methods_icu; return result; #else @@ -385,7 +477,7 @@ make_icu_collator(const char *iculocstr, const char *icurules) } } -size_t +static size_t strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -405,7 +497,7 @@ strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, return result_len; } -size_t +static size_t strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -425,7 +517,7 @@ strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, return result_len; } -size_t +static size_t strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -445,7 +537,7 @@ strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, return result_len; } -size_t +static size_t strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c index 8f9a8637897..1144c6ff304 100644 --- a/src/backend/utils/adt/pg_locale_libc.c +++ b/src/backend/utils/adt/pg_locale_libc.c @@ -43,13 +43,6 @@ extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context); -extern size_t strlower_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); - static int strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale); @@ -86,6 +79,239 @@ static size_t strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale); +static bool +wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isdigit_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isalpha_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isalpha_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isalnum_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isalnum_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isupper_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isupper_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_islower_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return islower_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isgraph_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isgraph_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isprint_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isprint_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_ispunct_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return ispunct_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isspace_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswdigit_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswalpha_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswalnum_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswupper_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswlower_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswgraph_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswprint_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswpunct_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswspace_l((wint_t) wc, locale->info.lt); +} + +static char +char_tolower_libc(unsigned char ch, pg_locale_t locale) +{ + Assert(pg_database_encoding_max_length() == 1); + return tolower_l(ch, locale->info.lt); +} + +static bool +char_is_cased_libc(char ch, pg_locale_t locale) +{ + bool is_multibyte = pg_database_encoding_max_length() > 1; + + if (is_multibyte && IS_HIGHBIT_SET(ch)) + return true; + else + return isalpha_l((unsigned char) ch, locale->info.lt); +} + +static pg_wchar +toupper_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + Assert(GetDatabaseEncoding() != PG_UTF8); + + if (wc <= (pg_wchar) UCHAR_MAX) + return toupper_l((unsigned char) wc, locale->info.lt); + else + return wc; +} + +static pg_wchar +toupper_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + Assert(GetDatabaseEncoding() == PG_UTF8); + + if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF) + return towupper_l((wint_t) wc, locale->info.lt); + else + return wc; +} + +static pg_wchar +tolower_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + Assert(GetDatabaseEncoding() != PG_UTF8); + + if (wc <= (pg_wchar) UCHAR_MAX) + return tolower_l((unsigned char) wc, locale->info.lt); + else + return wc; +} + +static pg_wchar +tolower_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + Assert(GetDatabaseEncoding() == PG_UTF8); + + if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF) + return towlower_l((wint_t) wc, locale->info.lt); + else + return wc; +} + +static const struct ctype_methods ctype_methods_libc_sb = { + .strlower = strlower_libc_sb, + .strtitle = strtitle_libc_sb, + .strupper = strupper_libc_sb, + .wc_isdigit = wc_isdigit_libc_sb, + .wc_isalpha = wc_isalpha_libc_sb, + .wc_isalnum = wc_isalnum_libc_sb, + .wc_isupper = wc_isupper_libc_sb, + .wc_islower = wc_islower_libc_sb, + .wc_isgraph = wc_isgraph_libc_sb, + .wc_isprint = wc_isprint_libc_sb, + .wc_ispunct = wc_ispunct_libc_sb, + .wc_isspace = wc_isspace_libc_sb, + .char_is_cased = char_is_cased_libc, + .char_tolower = char_tolower_libc, + .wc_toupper = toupper_libc_sb, + .wc_tolower = tolower_libc_sb, + .max_chr = UCHAR_MAX, +}; + +/* + * Non-UTF8 multibyte encodings use multibyte semantics for case mapping, but + * single-byte semantics for pattern matching. + */ +static const struct ctype_methods ctype_methods_libc_other_mb = { + .strlower = strlower_libc_mb, + .strtitle = strtitle_libc_mb, + .strupper = strupper_libc_mb, + .wc_isdigit = wc_isdigit_libc_sb, + .wc_isalpha = wc_isalpha_libc_sb, + .wc_isalnum = wc_isalnum_libc_sb, + .wc_isupper = wc_isupper_libc_sb, + .wc_islower = wc_islower_libc_sb, + .wc_isgraph = wc_isgraph_libc_sb, + .wc_isprint = wc_isprint_libc_sb, + .wc_ispunct = wc_ispunct_libc_sb, + .wc_isspace = wc_isspace_libc_sb, + .char_is_cased = char_is_cased_libc, + .char_tolower = char_tolower_libc, + .wc_toupper = toupper_libc_sb, + .wc_tolower = tolower_libc_sb, + .max_chr = UCHAR_MAX, +}; + +static const struct ctype_methods ctype_methods_libc_utf8 = { + .strlower = strlower_libc_mb, + .strtitle = strtitle_libc_mb, + .strupper = strupper_libc_mb, + .wc_isdigit = wc_isdigit_libc_mb, + .wc_isalpha = wc_isalpha_libc_mb, + .wc_isalnum = wc_isalnum_libc_mb, + .wc_isupper = wc_isupper_libc_mb, + .wc_islower = wc_islower_libc_mb, + .wc_isgraph = wc_isgraph_libc_mb, + .wc_isprint = wc_isprint_libc_mb, + .wc_ispunct = wc_ispunct_libc_mb, + .wc_isspace = wc_isspace_libc_mb, + .char_is_cased = char_is_cased_libc, + .char_tolower = char_tolower_libc, + .wc_toupper = toupper_libc_mb, + .wc_tolower = tolower_libc_mb, +}; + static const struct collate_methods collate_methods_libc = { .strncoll = strncoll_libc, .strnxfrm = strnxfrm_libc, @@ -120,36 +346,6 @@ static const struct collate_methods collate_methods_libc_win32_utf8 = { }; #endif -size_t -strlower_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale) -{ - if (pg_database_encoding_max_length() > 1) - return strlower_libc_mb(dst, dstsize, src, srclen, locale); - else - return strlower_libc_sb(dst, dstsize, src, srclen, locale); -} - -size_t -strtitle_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale) -{ - if (pg_database_encoding_max_length() > 1) - return strtitle_libc_mb(dst, dstsize, src, srclen, locale); - else - return strtitle_libc_sb(dst, dstsize, src, srclen, locale); -} - -size_t -strupper_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale) -{ - if (pg_database_encoding_max_length() > 1) - return strupper_libc_mb(dst, dstsize, src, srclen, locale); - else - return strupper_libc_sb(dst, dstsize, src, srclen, locale); -} - static size_t strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) @@ -482,6 +678,15 @@ create_pg_locale_libc(Oid collid, MemoryContext context) #endif result->collate = &collate_methods_libc; } + if (!result->ctype_is_c) + { + if (GetDatabaseEncoding() == PG_UTF8) + result->ctype = &ctype_methods_libc_utf8; + else if (pg_database_encoding_max_length() > 1) + result->ctype = &ctype_methods_libc_other_mb; + else + result->ctype = &ctype_methods_libc_sb; + } return result; } diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 0d5f0513ceb..cd2c812ae26 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -12,6 +12,8 @@ #ifndef _PG_LOCALE_ #define _PG_LOCALE_ +#include "mb/pg_wchar.h" + #ifdef USE_ICU #include <unicode/ucol.h> #endif @@ -77,6 +79,52 @@ struct collate_methods bool strxfrm_is_safe; }; +struct ctype_methods +{ + /* case mapping: LOWER()/INITCAP()/UPPER() */ + size_t (*strlower) (char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); + size_t (*strtitle) (char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); + size_t (*strupper) (char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); + size_t (*strfold) (char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); + + /* required */ + bool (*wc_isdigit) (pg_wchar wc, pg_locale_t locale); + bool (*wc_isalpha) (pg_wchar wc, pg_locale_t locale); + bool (*wc_isalnum) (pg_wchar wc, pg_locale_t locale); + bool (*wc_isupper) (pg_wchar wc, pg_locale_t locale); + bool (*wc_islower) (pg_wchar wc, pg_locale_t locale); + bool (*wc_isgraph) (pg_wchar wc, pg_locale_t locale); + bool (*wc_isprint) (pg_wchar wc, pg_locale_t locale); + bool (*wc_ispunct) (pg_wchar wc, pg_locale_t locale); + bool (*wc_isspace) (pg_wchar wc, pg_locale_t locale); + pg_wchar (*wc_toupper) (pg_wchar wc, pg_locale_t locale); + pg_wchar (*wc_tolower) (pg_wchar wc, pg_locale_t locale); + + /* required */ + bool (*char_is_cased) (char ch, pg_locale_t locale); + + /* + * Optional. If defined, will only be called for single-byte encodings. If + * not defined, or if the encoding is multibyte, will fall back to + * pg_strlower(). + */ + char (*char_tolower) (unsigned char ch, pg_locale_t locale); + + /* + * For regex and pattern matching efficiency, the maximum char value + * supported by the above methods. If zero, limit is set by regex code. + */ + pg_wchar max_chr; +}; + /* * We use a discriminated union to hold either a locale_t or an ICU collator. * pg_locale_t is occasionally checked for truth, so make it a pointer. @@ -102,6 +150,7 @@ struct pg_locale_struct bool is_default; const struct collate_methods *collate; /* NULL if collate_is_c */ + const struct ctype_methods *ctype; /* NULL if ctype_is_c */ union { @@ -125,6 +174,9 @@ extern void init_database_collation(void); extern pg_locale_t pg_newlocale_from_collation(Oid collid); extern char *get_collation_actual_version(char collprovider, const char *collcollate); +extern bool char_is_cased(char ch, pg_locale_t locale); +extern bool char_tolower_enabled(pg_locale_t locale); +extern char char_tolower(unsigned char ch, pg_locale_t locale); extern size_t pg_strlower(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 9a3bee93dec..594c98509c0 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1829,7 +1829,6 @@ PGTargetServerType PGTernaryBool PGTransactionStatusType PGVerbosity -PG_Locale_Strategy PG_Lock_Status PG_init_t PGcancel -- 2.34.1
From 25e77e96f99412a4bf1a6d363bb5dbe8f914fbbf Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Mon, 7 Oct 2024 12:51:27 -0700 Subject: [PATCH v15 2/4] Remove provider field from pg_locale_t. The behavior of pg_locale_t is entirely specified by methods, so a separate provider field is no longer necessary. --- src/backend/utils/adt/pg_locale_builtin.c | 1 - src/backend/utils/adt/pg_locale_icu.c | 11 ----------- src/backend/utils/adt/pg_locale_libc.c | 6 ------ src/include/utils/pg_locale.h | 1 - 4 files changed, 19 deletions(-) diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c index 23504be383a..bba98284e25 100644 --- a/src/backend/utils/adt/pg_locale_builtin.c +++ b/src/backend/utils/adt/pg_locale_builtin.c @@ -239,7 +239,6 @@ create_pg_locale_builtin(Oid collid, MemoryContext context) result->info.builtin.locale = MemoryContextStrdup(context, locstr); result->info.builtin.casemap_full = (strcmp(locstr, "PG_UNICODE_FAST") == 0); - result->provider = COLLPROVIDER_BUILTIN; result->deterministic = true; result->collate_is_c = true; result->ctype_is_c = (strcmp(locstr, "C") == 0); diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c index 1cb24fadea2..f0068749471 100644 --- a/src/backend/utils/adt/pg_locale_icu.c +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -295,7 +295,6 @@ create_pg_locale_icu(Oid collid, MemoryContext context) result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); result->info.icu.locale = MemoryContextStrdup(context, iculocstr); result->info.icu.ucol = collator; - result->provider = COLLPROVIDER_ICU; result->deterministic = deterministic; result->collate_is_c = false; result->ctype_is_c = false; @@ -572,8 +571,6 @@ strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2 int result; UErrorCode status; - Assert(locale->provider == COLLPROVIDER_ICU); - Assert(GetDatabaseEncoding() == PG_UTF8); status = U_ZERO_ERROR; @@ -601,8 +598,6 @@ strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, size_t uchar_bsize; Size result_bsize; - Assert(locale->provider == COLLPROVIDER_ICU); - init_icu_converter(); ulen = uchar_length(icu_converter, src, srclen); @@ -647,8 +642,6 @@ strnxfrm_prefix_icu_utf8(char *dest, size_t destsize, uint32_t state[2]; UErrorCode status; - Assert(locale->provider == COLLPROVIDER_ICU); - Assert(GetDatabaseEncoding() == PG_UTF8); uiter_setUTF8(&iter, src, srclen); @@ -847,8 +840,6 @@ strncoll_icu(const char *arg1, ssize_t len1, *uchar2; int result; - Assert(locale->provider == COLLPROVIDER_ICU); - /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */ #ifdef HAVE_UCOL_STRCOLLUTF8 Assert(GetDatabaseEncoding() != PG_UTF8); @@ -897,8 +888,6 @@ strnxfrm_prefix_icu(char *dest, size_t destsize, size_t uchar_bsize; Size result_bsize; - Assert(locale->provider == COLLPROVIDER_ICU); - /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */ Assert(GetDatabaseEncoding() != PG_UTF8); diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c index 1144c6ff304..1582f8cdd2a 100644 --- a/src/backend/utils/adt/pg_locale_libc.c +++ b/src/backend/utils/adt/pg_locale_libc.c @@ -662,7 +662,6 @@ create_pg_locale_libc(Oid collid, MemoryContext context) loc = make_libc_collator(collate, ctype); result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); - result->provider = COLLPROVIDER_LIBC; result->deterministic = true; result->collate_is_c = (strcmp(collate, "C") == 0) || (strcmp(collate, "POSIX") == 0); @@ -782,8 +781,6 @@ strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, const char *arg2n; int result; - Assert(locale->provider == COLLPROVIDER_LIBC); - if (bufsize1 + bufsize2 > TEXTBUFLEN) buf = palloc(bufsize1 + bufsize2); @@ -838,8 +835,6 @@ strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, size_t bufsize = srclen + 1; size_t result; - Assert(locale->provider == COLLPROVIDER_LIBC); - if (srclen == -1) return strxfrm_l(dest, src, destsize, locale->info.lt); @@ -948,7 +943,6 @@ strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2, int r; int result; - Assert(locale->provider == COLLPROVIDER_LIBC); Assert(GetDatabaseEncoding() == PG_UTF8); if (len1 == -1) diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index cd2c812ae26..f081091d200 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -143,7 +143,6 @@ struct ctype_methods */ struct pg_locale_struct { - char provider; bool deterministic; bool collate_is_c; bool ctype_is_c; -- 2.34.1
From babe04624ec0143794329950547ff9a4dda91990 Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Mon, 7 Oct 2024 13:36:44 -0700 Subject: [PATCH v15 3/4] Make provider data in pg_locale_t an opaque pointer. --- src/backend/utils/adt/pg_locale_builtin.c | 55 +++++-- src/backend/utils/adt/pg_locale_icu.c | 40 ++++-- src/backend/utils/adt/pg_locale_libc.c | 167 +++++++++++++++------- src/include/utils/pg_locale.h | 17 +-- 4 files changed, 197 insertions(+), 82 deletions(-) diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c index bba98284e25..f0462aa96aa 100644 --- a/src/backend/utils/adt/pg_locale_builtin.c +++ b/src/backend/utils/adt/pg_locale_builtin.c @@ -26,6 +26,12 @@ extern pg_locale_t create_pg_locale_builtin(Oid collid, MemoryContext context); extern char *get_collation_actual_version_builtin(const char *collcollate); +struct builtin_provider +{ + const char *locale; + bool casemap_full; +}; + struct WordBoundaryState { const char *str; @@ -71,14 +77,19 @@ static size_t strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { + struct builtin_provider *builtin; + + builtin = (struct builtin_provider *) locale->provider_data; + return unicode_strlower(dest, destsize, src, srclen, - locale->info.builtin.casemap_full); + builtin->casemap_full); } static size_t strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { + struct builtin_provider *builtin; struct WordBoundaryState wbstate = { .str = src, .len = srclen, @@ -87,8 +98,10 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, .prev_alnum = false, }; + builtin = (struct builtin_provider *) locale->provider_data; + return unicode_strtitle(dest, destsize, src, srclen, - locale->info.builtin.casemap_full, + builtin->casemap_full, initcap_wbnext, &wbstate); } @@ -96,22 +109,34 @@ static size_t strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { + struct builtin_provider *builtin; + + builtin = (struct builtin_provider *) locale->provider_data; + return unicode_strupper(dest, destsize, src, srclen, - locale->info.builtin.casemap_full); + builtin->casemap_full); } static size_t strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { + struct builtin_provider *builtin; + + builtin = (struct builtin_provider *) locale->provider_data; + return unicode_strfold(dest, destsize, src, srclen, - locale->info.builtin.casemap_full); + builtin->casemap_full); } static bool wc_isdigit_builtin(pg_wchar wc, pg_locale_t locale) { - return pg_u_isdigit(wc, !locale->info.builtin.casemap_full); + struct builtin_provider *builtin; + + builtin = (struct builtin_provider *) locale->provider_data; + + return pg_u_isdigit(wc, !builtin->casemap_full); } static bool @@ -123,7 +148,11 @@ wc_isalpha_builtin(pg_wchar wc, pg_locale_t locale) static bool wc_isalnum_builtin(pg_wchar wc, pg_locale_t locale) { - return pg_u_isalnum(wc, !locale->info.builtin.casemap_full); + struct builtin_provider *builtin; + + builtin = (struct builtin_provider *) locale->provider_data; + + return pg_u_isalnum(wc, !builtin->casemap_full); } static bool @@ -153,7 +182,11 @@ wc_isprint_builtin(pg_wchar wc, pg_locale_t locale) static bool wc_ispunct_builtin(pg_wchar wc, pg_locale_t locale) { - return pg_u_ispunct(wc, !locale->info.builtin.casemap_full); + struct builtin_provider *builtin; + + builtin = (struct builtin_provider *) locale->provider_data; + + return pg_u_ispunct(wc, !builtin->casemap_full); } static bool @@ -204,6 +237,7 @@ pg_locale_t create_pg_locale_builtin(Oid collid, MemoryContext context) { const char *locstr; + struct builtin_provider *builtin; pg_locale_t result; if (collid == DEFAULT_COLLATION_OID) @@ -237,8 +271,11 @@ create_pg_locale_builtin(Oid collid, MemoryContext context) result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); - result->info.builtin.locale = MemoryContextStrdup(context, locstr); - result->info.builtin.casemap_full = (strcmp(locstr, "PG_UNICODE_FAST") == 0); + builtin = MemoryContextAllocZero(context, sizeof(struct builtin_provider)); + builtin->locale = MemoryContextStrdup(context, locstr); + builtin->casemap_full = (strcmp(locstr, "PG_UNICODE_FAST") == 0); + result->provider_data = (void *) builtin; + result->deterministic = true; result->collate_is_c = true; result->ctype_is_c = (strcmp(locstr, "C") == 0); diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c index f0068749471..19c51504f85 100644 --- a/src/backend/utils/adt/pg_locale_icu.c +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -51,6 +51,12 @@ extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context); #ifdef USE_ICU +struct icu_provider +{ + const char *locale; + UCollator *ucol; +}; + extern UCollator *pg_ucol_open(const char *loc_str); static size_t strlower_icu(char *dst, size_t dstsize, const char *src, @@ -242,6 +248,7 @@ create_pg_locale_icu(Oid collid, MemoryContext context) bool deterministic; const char *iculocstr; const char *icurules = NULL; + struct icu_provider *icu; UCollator *collator; pg_locale_t result; @@ -293,8 +300,12 @@ create_pg_locale_icu(Oid collid, MemoryContext context) collator = make_icu_collator(iculocstr, icurules); result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); - result->info.icu.locale = MemoryContextStrdup(context, iculocstr); - result->info.icu.ucol = collator; + + icu = MemoryContextAllocZero(context, sizeof(struct icu_provider)); + icu->locale = MemoryContextStrdup(context, iculocstr); + icu->ucol = collator; + result->provider_data = (void *) icu; + result->deterministic = deterministic; result->collate_is_c = false; result->ctype_is_c = false; @@ -570,11 +581,12 @@ strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2 { int result; UErrorCode status; + struct icu_provider *icu = (struct icu_provider *) locale->provider_data; Assert(GetDatabaseEncoding() == PG_UTF8); status = U_ZERO_ERROR; - result = ucol_strcollUTF8(locale->info.icu.ucol, + result = ucol_strcollUTF8(icu->ucol, arg1, len1, arg2, len2, &status); @@ -598,6 +610,8 @@ strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, size_t uchar_bsize; Size result_bsize; + struct icu_provider *icu = (struct icu_provider *) locale->provider_data; + init_icu_converter(); ulen = uchar_length(icu_converter, src, srclen); @@ -611,7 +625,7 @@ strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen); - result_bsize = ucol_getSortKey(locale->info.icu.ucol, + result_bsize = ucol_getSortKey(icu->ucol, uchar, ulen, (uint8_t *) dest, destsize); @@ -642,12 +656,14 @@ strnxfrm_prefix_icu_utf8(char *dest, size_t destsize, uint32_t state[2]; UErrorCode status; + struct icu_provider *icu = (struct icu_provider *) locale->provider_data; + Assert(GetDatabaseEncoding() == PG_UTF8); uiter_setUTF8(&iter, src, srclen); state[0] = state[1] = 0; /* won't need that again */ status = U_ZERO_ERROR; - result = ucol_nextSortKeyPart(locale->info.icu.ucol, + result = ucol_nextSortKeyPart(icu->ucol, &iter, state, (uint8_t *) dest, @@ -754,11 +770,13 @@ icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale, UErrorCode status; int32_t len_dest; + struct icu_provider *icu = (struct icu_provider *) mylocale->provider_data; + len_dest = len_source; /* try first with same length */ *buff_dest = palloc(len_dest * sizeof(**buff_dest)); status = U_ZERO_ERROR; len_dest = func(*buff_dest, len_dest, buff_source, len_source, - mylocale->info.icu.locale, &status); + icu->locale, &status); if (status == U_BUFFER_OVERFLOW_ERROR) { /* try again with adjusted length */ @@ -766,7 +784,7 @@ icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale, *buff_dest = palloc(len_dest * sizeof(**buff_dest)); status = U_ZERO_ERROR; len_dest = func(*buff_dest, len_dest, buff_source, len_source, - mylocale->info.icu.locale, &status); + icu->locale, &status); } if (U_FAILURE(status)) ereport(ERROR, @@ -840,6 +858,8 @@ strncoll_icu(const char *arg1, ssize_t len1, *uchar2; int result; + struct icu_provider *icu = (struct icu_provider *) locale->provider_data; + /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */ #ifdef HAVE_UCOL_STRCOLLUTF8 Assert(GetDatabaseEncoding() != PG_UTF8); @@ -862,7 +882,7 @@ strncoll_icu(const char *arg1, ssize_t len1, ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1); ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2); - result = ucol_strcoll(locale->info.icu.ucol, + result = ucol_strcoll(icu->ucol, uchar1, ulen1, uchar2, ulen2); @@ -888,6 +908,8 @@ strnxfrm_prefix_icu(char *dest, size_t destsize, size_t uchar_bsize; Size result_bsize; + struct icu_provider *icu = (struct icu_provider *) locale->provider_data; + /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */ Assert(GetDatabaseEncoding() != PG_UTF8); @@ -907,7 +929,7 @@ strnxfrm_prefix_icu(char *dest, size_t destsize, uiter_setString(&iter, uchar, ulen); state[0] = state[1] = 0; /* won't need that again */ status = U_ZERO_ERROR; - result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol, + result_bsize = ucol_nextSortKeyPart(icu->ucol, &iter, state, (uint8_t *) dest, diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c index 1582f8cdd2a..d357962ebdf 100644 --- a/src/backend/utils/adt/pg_locale_libc.c +++ b/src/backend/utils/adt/pg_locale_libc.c @@ -1,3 +1,4 @@ + /*----------------------------------------------------------------------- * * PostgreSQL locale utilities for libc @@ -41,6 +42,11 @@ */ #define TEXTBUFLEN 1024 +struct libc_provider +{ + locale_t lt; +}; + extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context); static int strncoll_libc(const char *arg1, ssize_t len1, @@ -82,116 +88,154 @@ static size_t strupper_libc_mb(char *dest, size_t destsize, static bool wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale) { - return isdigit_l((unsigned char) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return isdigit_l((unsigned char) wc, libc->lt); } static bool wc_isalpha_libc_sb(pg_wchar wc, pg_locale_t locale) { - return isalpha_l((unsigned char) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return isalpha_l((unsigned char) wc, libc->lt); } static bool wc_isalnum_libc_sb(pg_wchar wc, pg_locale_t locale) { - return isalnum_l((unsigned char) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return isalnum_l((unsigned char) wc, libc->lt); } static bool wc_isupper_libc_sb(pg_wchar wc, pg_locale_t locale) { - return isupper_l((unsigned char) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return isupper_l((unsigned char) wc, libc->lt); } static bool wc_islower_libc_sb(pg_wchar wc, pg_locale_t locale) { - return islower_l((unsigned char) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return islower_l((unsigned char) wc, libc->lt); } static bool wc_isgraph_libc_sb(pg_wchar wc, pg_locale_t locale) { - return isgraph_l((unsigned char) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return isgraph_l((unsigned char) wc, libc->lt); } static bool wc_isprint_libc_sb(pg_wchar wc, pg_locale_t locale) { - return isprint_l((unsigned char) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return isprint_l((unsigned char) wc, libc->lt); } static bool wc_ispunct_libc_sb(pg_wchar wc, pg_locale_t locale) { - return ispunct_l((unsigned char) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return ispunct_l((unsigned char) wc, libc->lt); } static bool wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale) { - return isspace_l((unsigned char) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return isspace_l((unsigned char) wc, libc->lt); } static bool wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale) { - return iswdigit_l((wint_t) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return iswdigit_l((wint_t) wc, libc->lt); } static bool wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale) { - return iswalpha_l((wint_t) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return iswalpha_l((wint_t) wc, libc->lt); } static bool wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale) { - return iswalnum_l((wint_t) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return iswalnum_l((wint_t) wc, libc->lt); } static bool wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale) { - return iswupper_l((wint_t) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return iswupper_l((wint_t) wc, libc->lt); } static bool wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale) { - return iswlower_l((wint_t) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return iswlower_l((wint_t) wc, libc->lt); } static bool wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale) { - return iswgraph_l((wint_t) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return iswgraph_l((wint_t) wc, libc->lt); } static bool wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale) { - return iswprint_l((wint_t) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return iswprint_l((wint_t) wc, libc->lt); } static bool wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale) { - return iswpunct_l((wint_t) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return iswpunct_l((wint_t) wc, libc->lt); } static bool wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale) { - return iswspace_l((wint_t) wc, locale->info.lt); + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + + return iswspace_l((wint_t) wc, libc->lt); } static char char_tolower_libc(unsigned char ch, pg_locale_t locale) { + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + Assert(pg_database_encoding_max_length() == 1); - return tolower_l(ch, locale->info.lt); + return tolower_l(ch, libc->lt); } static bool @@ -199,19 +243,23 @@ char_is_cased_libc(char ch, pg_locale_t locale) { bool is_multibyte = pg_database_encoding_max_length() > 1; + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + if (is_multibyte && IS_HIGHBIT_SET(ch)) return true; else - return isalpha_l((unsigned char) ch, locale->info.lt); + return isalpha_l((unsigned char) ch, libc->lt); } static pg_wchar toupper_libc_sb(pg_wchar wc, pg_locale_t locale) { + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + Assert(GetDatabaseEncoding() != PG_UTF8); if (wc <= (pg_wchar) UCHAR_MAX) - return toupper_l((unsigned char) wc, locale->info.lt); + return toupper_l((unsigned char) wc, libc->lt); else return wc; } @@ -219,10 +267,12 @@ toupper_libc_sb(pg_wchar wc, pg_locale_t locale) static pg_wchar toupper_libc_mb(pg_wchar wc, pg_locale_t locale) { + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + Assert(GetDatabaseEncoding() == PG_UTF8); if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF) - return towupper_l((wint_t) wc, locale->info.lt); + return towupper_l((wint_t) wc, libc->lt); else return wc; } @@ -230,10 +280,12 @@ toupper_libc_mb(pg_wchar wc, pg_locale_t locale) static pg_wchar tolower_libc_sb(pg_wchar wc, pg_locale_t locale) { + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + Assert(GetDatabaseEncoding() != PG_UTF8); if (wc <= (pg_wchar) UCHAR_MAX) - return tolower_l((unsigned char) wc, locale->info.lt); + return tolower_l((unsigned char) wc, libc->lt); else return wc; } @@ -241,10 +293,12 @@ tolower_libc_sb(pg_wchar wc, pg_locale_t locale) static pg_wchar tolower_libc_mb(pg_wchar wc, pg_locale_t locale) { + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + Assert(GetDatabaseEncoding() == PG_UTF8); if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF) - return towlower_l((wint_t) wc, locale->info.lt); + return towlower_l((wint_t) wc, libc->lt); else return wc; } @@ -355,7 +409,7 @@ strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, if (srclen + 1 <= destsize) { - locale_t loc = locale->info.lt; + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; char *p; if (srclen + 1 > destsize) @@ -376,7 +430,7 @@ strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, if (locale->is_default) *p = pg_tolower((unsigned char) *p); else - *p = tolower_l((unsigned char) *p, loc); + *p = tolower_l((unsigned char) *p, libc->lt); } } @@ -387,7 +441,8 @@ static size_t strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { - locale_t loc = locale->info.lt; + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + size_t result_size; wchar_t *workspace; char *result; @@ -409,7 +464,7 @@ strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, char2wchar(workspace, srclen + 1, src, srclen, locale); for (curr_char = 0; workspace[curr_char] != 0; curr_char++) - workspace[curr_char] = towlower_l(workspace[curr_char], loc); + workspace[curr_char] = towlower_l(workspace[curr_char], libc->lt); /* * Make result large enough; case change might change number of bytes @@ -440,7 +495,7 @@ strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, if (srclen + 1 <= destsize) { - locale_t loc = locale->info.lt; + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; int wasalnum = false; char *p; @@ -466,11 +521,11 @@ strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, else { if (wasalnum) - *p = tolower_l((unsigned char) *p, loc); + *p = tolower_l((unsigned char) *p, libc->lt); else - *p = toupper_l((unsigned char) *p, loc); + *p = toupper_l((unsigned char) *p, libc->lt); } - wasalnum = isalnum_l((unsigned char) *p, loc); + wasalnum = isalnum_l((unsigned char) *p, libc->lt); } } @@ -481,7 +536,8 @@ static size_t strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { - locale_t loc = locale->info.lt; + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + int wasalnum = false; size_t result_size; wchar_t *workspace; @@ -506,10 +562,10 @@ strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, for (curr_char = 0; workspace[curr_char] != 0; curr_char++) { if (wasalnum) - workspace[curr_char] = towlower_l(workspace[curr_char], loc); + workspace[curr_char] = towlower_l(workspace[curr_char], libc->lt); else - workspace[curr_char] = towupper_l(workspace[curr_char], loc); - wasalnum = iswalnum_l(workspace[curr_char], loc); + workspace[curr_char] = towupper_l(workspace[curr_char], libc->lt); + wasalnum = iswalnum_l(workspace[curr_char], libc->lt); } /* @@ -541,7 +597,7 @@ strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, if (srclen + 1 <= destsize) { - locale_t loc = locale->info.lt; + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; char *p; memcpy(dest, src, srclen); @@ -559,7 +615,7 @@ strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, if (locale->is_default) *p = pg_toupper((unsigned char) *p); else - *p = toupper_l((unsigned char) *p, loc); + *p = toupper_l((unsigned char) *p, libc->lt); } } @@ -570,7 +626,8 @@ static size_t strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { - locale_t loc = locale->info.lt; + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + size_t result_size; wchar_t *workspace; char *result; @@ -592,7 +649,7 @@ strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, char2wchar(workspace, srclen + 1, src, srclen, locale); for (curr_char = 0; workspace[curr_char] != 0; curr_char++) - workspace[curr_char] = towupper_l(workspace[curr_char], loc); + workspace[curr_char] = towupper_l(workspace[curr_char], libc->lt); /* * Make result large enough; case change might change number of bytes @@ -620,6 +677,7 @@ create_pg_locale_libc(Oid collid, MemoryContext context) const char *collate; const char *ctype; locale_t loc; + struct libc_provider *libc; pg_locale_t result; if (collid == DEFAULT_COLLATION_OID) @@ -658,16 +716,19 @@ create_pg_locale_libc(Oid collid, MemoryContext context) ReleaseSysCache(tp); } - loc = make_libc_collator(collate, ctype); result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); + + libc = MemoryContextAllocZero(context, sizeof(struct libc_provider)); + libc->lt = loc; + result->provider_data = (void *) libc; + result->deterministic = true; result->collate_is_c = (strcmp(collate, "C") == 0) || (strcmp(collate, "POSIX") == 0); result->ctype_is_c = (strcmp(ctype, "C") == 0) || (strcmp(ctype, "POSIX") == 0); - result->info.lt = loc; if (!result->collate_is_c) { #ifdef WIN32 @@ -781,6 +842,8 @@ strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, const char *arg2n; int result; + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + if (bufsize1 + bufsize2 > TEXTBUFLEN) buf = palloc(bufsize1 + bufsize2); @@ -811,7 +874,7 @@ strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, arg2n = buf2; } - result = strcoll_l(arg1n, arg2n, locale->info.lt); + result = strcoll_l(arg1n, arg2n, libc->lt); if (buf != sbuf) pfree(buf); @@ -835,8 +898,10 @@ strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, size_t bufsize = srclen + 1; size_t result; + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + if (srclen == -1) - return strxfrm_l(dest, src, destsize, locale->info.lt); + return strxfrm_l(dest, src, destsize, libc->lt); if (bufsize > TEXTBUFLEN) buf = palloc(bufsize); @@ -845,7 +910,7 @@ strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, memcpy(buf, src, srclen); buf[srclen] = '\0'; - result = strxfrm_l(dest, buf, destsize, locale->info.lt); + result = strxfrm_l(dest, buf, destsize, libc->lt); if (buf != sbuf) pfree(buf); @@ -943,6 +1008,8 @@ strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2, int r; int result; + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + Assert(GetDatabaseEncoding() == PG_UTF8); if (len1 == -1) @@ -987,7 +1054,7 @@ strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2, ((LPWSTR) a2p)[r] = 0; errno = 0; - result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt); + result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, libc->lt); if (result == 2147483647) /* _NLSCMPERROR; missing from mingw headers */ ereport(ERROR, (errmsg("could not compare Unicode strings: %m"))); @@ -1116,8 +1183,10 @@ wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale) } else { + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + /* Use wcstombs_l for nondefault locales */ - result = wcstombs_l(to, from, tolen, locale->info.lt); + result = wcstombs_l(to, from, tolen, libc->lt); } return result; @@ -1176,8 +1245,10 @@ char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, } else { + struct libc_provider *libc = (struct libc_provider *) locale->provider_data; + /* Use mbstowcs_l for nondefault locales */ - result = mbstowcs_l(to, str, tolen, locale->info.lt); + result = mbstowcs_l(to, str, tolen, libc->lt); } pfree(str); diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index f081091d200..ccf10cd617b 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -151,22 +151,7 @@ struct pg_locale_struct const struct collate_methods *collate; /* NULL if collate_is_c */ const struct ctype_methods *ctype; /* NULL if ctype_is_c */ - union - { - struct - { - const char *locale; - bool casemap_full; - } builtin; - locale_t lt; -#ifdef USE_ICU - struct - { - const char *locale; - UCollator *ucol; - } icu; -#endif - } info; + void *provider_data; }; extern void init_database_collation(void); -- 2.34.1
From e32a88cc236a84cbc80f00c1713f2e851be1d109 Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Wed, 9 Oct 2024 10:00:58 -0700 Subject: [PATCH v15 4/4] Don't include ICU headers in pg_locale.h. --- src/backend/commands/collationcmds.c | 4 ++++ src/backend/utils/adt/formatting.c | 4 ---- src/backend/utils/adt/pg_locale.c | 4 ++++ src/backend/utils/adt/pg_locale_icu.c | 1 + src/backend/utils/adt/varlena.c | 4 ++++ src/include/utils/pg_locale.h | 4 ---- 6 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c index 8acbfbbeda0..a57fe93c387 100644 --- a/src/backend/commands/collationcmds.c +++ b/src/backend/commands/collationcmds.c @@ -14,6 +14,10 @@ */ #include "postgres.h" +#ifdef USE_ICU +#include <unicode/ucol.h> +#endif + #include "access/htup_details.h" #include "access/table.h" #include "access/xact.h" diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 5bd1e01f7e4..b3d5e0436ee 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -70,10 +70,6 @@ #include <limits.h> #include <wctype.h> -#ifdef USE_ICU -#include <unicode/ustring.h> -#endif - #include "catalog/pg_collation.h" #include "catalog/pg_type.h" #include "common/int.h" diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index f26ce0e6cc7..894f141dbb1 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -54,6 +54,10 @@ #include <time.h> +#ifdef USE_ICU +#include <unicode/ucol.h> +#endif + #include "access/htup_details.h" #include "catalog/pg_collation.h" #include "catalog/pg_database.h" diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c index 19c51504f85..2ff7b960abc 100644 --- a/src/backend/utils/adt/pg_locale_icu.c +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -13,6 +13,7 @@ #ifdef USE_ICU #include <unicode/ucnv.h> +#include <unicode/ucol.h> #include <unicode/ustring.h> /* diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 34796f2e27c..c57262e1888 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -17,6 +17,10 @@ #include <ctype.h> #include <limits.h> +#ifdef USE_ICU +#include <unicode/uchar.h> +#endif + #include "access/detoast.h" #include "access/toast_compression.h" #include "catalog/pg_collation.h" diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index ccf10cd617b..4c941d02d76 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -14,10 +14,6 @@ #include "mb/pg_wchar.h" -#ifdef USE_ICU -#include <unicode/ucol.h> -#endif - /* use for libc locale names */ #define LOCALE_NAME_BUFLEN 128 -- 2.34.1