(moving to -hackers) On Fri, Mar 18, 2022 at 03:40:51PM +0800, Julien Rouhaud wrote: > On Fri, Mar 18, 2022 at 02:36:48PM +0800, Julien Rouhaud wrote: > > On Fri, Mar 18, 2022 at 06:15:47PM +1300, Thomas Munro wrote: > > > > > > No idea what's happening here but one observation is that that animal > > > is running an older distro that shipped with ICU 5.0. Commit b8f9a2a6 > > > may hold a clue... > > > > Right. I'm setting up a similar podman environment, hopefully more info > > soon. > > And indeed b8f9a2a6 is the problem. We would need some form of > icu_set_collation_attributes() on the frontend side if we want to detect such > a > problem on older ICU version at the expected moment rather than when > bootstrapping the info. A similar check is also needed in createdb().
I'm attaching a patch that fixes both issues for me with ICU 50. Note that there's already a test that would have failed for CREATE DATABASE if initdb tests didn't fail first, so no new test needed. I ended up copy/pasting icu_set_collation_attributes() in initdb.c. There shouldn't be new attributes added in old ICU versions, and there are enough differences to make it work in the frontend that it didn't seems worth to have a single function.
>From 52ae34ed5413f3274bc6c9a96847ec762f6e388d Mon Sep 17 00:00:00 2001 From: Julien Rouhaud <julien.rouh...@free.fr> Date: Fri, 18 Mar 2022 16:20:01 +0800 Subject: [PATCH v1] Fix global icu collations for ICU < 54 Multiple call sites didn't check for collation attributes validity, which has to be done explicitly on ICU < 54. --- src/backend/commands/dbcommands.c | 9 +-- src/backend/utils/adt/pg_locale.c | 24 +++++++ src/bin/initdb/initdb.c | 109 +++++++++++++++++++++++++++++- src/include/utils/pg_locale.h | 1 + 4 files changed, 134 insertions(+), 9 deletions(-) diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 2e1af642e4..104e45b66d 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -460,14 +460,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) if (dblocprovider == COLLPROVIDER_ICU) { #ifdef USE_ICU - UErrorCode status; - - status = U_ZERO_ERROR; - ucol_open(dbiculocale, &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("could not open collator for locale \"%s\": %s", - dbiculocale, u_errorName(status)))); + check_icu_locale(dbiculocale); #else ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 4019255f8e..bc0a038168 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1854,6 +1854,9 @@ icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes) * function's result is the number of bytes generated (not counting nul). * * The result string is nul-terminated. + * + * If you modify this function, modify accordingly the similar frontend + * function in initdb.c */ int32_t icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar) @@ -1884,6 +1887,27 @@ icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar) return len_result; } +/* + * Check if the given locale ID is valid, and ereport(ERROR) if it isn't. + */ +void +check_icu_locale(const char *icu_locale) +{ + UCollator *collator; + UErrorCode status; + + status = U_ZERO_ERROR; + collator = ucol_open(icu_locale, &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("could not open collator for locale \"%s\": %s", + icu_locale, u_errorName(status)))); + + if (U_ICU_VERSION_MAJOR_NUM < 54) + icu_set_collation_attributes(collator, icu_locale); + ucol_close(collator); +} + /* * Parse collation attributes and apply them to the open collator. This takes * a string like "und@colStrength=primary;colCaseLevel=yes" and parses and diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index cbcd55288f..ee6243560b 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -282,6 +282,10 @@ static int locale_date_order(const char *locale); static void check_locale_name(int category, const char *locale, char **canonname); static bool check_locale_encoding(const char *locale, int encoding); +#ifdef USE_ICU +static void icu_set_collation_attributes(UCollator *collator, + const char *loc); +#endif static void setlocales(void); static void usage(const char *progname); void setup_pgdata(void); @@ -2145,6 +2149,105 @@ check_locale_encoding(const char *locale, int user_enc) return true; } +#ifdef USE_ICU +/* + * Frontend version of the same function in pg_locale.c. + * + * If you modify this function, modify accordingly the similar backend function + * in pg_locale.c + */ +static void +icu_set_collation_attributes(UCollator *collator, const char *loc) +{ + char *str = pg_strdup(loc); + + str = strchr(str, '@'); + if (!str) + return; + str++; + + for (char *token = strtok(str, ";"); token; token = strtok(NULL, ";")) + { + char *e = strchr(token, '='); + + if (e) + { + char *name; + char *value; + UColAttribute uattr; + UColAttributeValue uvalue; + UErrorCode status; + + status = U_ZERO_ERROR; + + *e = '\0'; + name = token; + value = e + 1; + + /* + * See attribute name and value lists in ICU i18n/coll.cpp + */ + if (pg_strcasecmp(name, "colstrength") == 0) + uattr = UCOL_STRENGTH; + else if (pg_strcasecmp(name, "colbackwards") == 0) + uattr = UCOL_FRENCH_COLLATION; + else if (pg_strcasecmp(name, "colcaselevel") == 0) + uattr = UCOL_CASE_LEVEL; + else if (pg_strcasecmp(name, "colcasefirst") == 0) + uattr = UCOL_CASE_FIRST; + else if (pg_strcasecmp(name, "colalternate") == 0) + uattr = UCOL_ALTERNATE_HANDLING; + else if (pg_strcasecmp(name, "colnormalization") == 0) + uattr = UCOL_NORMALIZATION_MODE; + else if (pg_strcasecmp(name, "colnumeric") == 0) + uattr = UCOL_NUMERIC_COLLATION; + else + /* ignore if unknown */ + continue; + + if (pg_strcasecmp(value, "primary") == 0) + uvalue = UCOL_PRIMARY; + else if (pg_strcasecmp(value, "secondary") == 0) + uvalue = UCOL_SECONDARY; + else if (pg_strcasecmp(value, "tertiary") == 0) + uvalue = UCOL_TERTIARY; + else if (pg_strcasecmp(value, "quaternary") == 0) + uvalue = UCOL_QUATERNARY; + else if (pg_strcasecmp(value, "identical") == 0) + uvalue = UCOL_IDENTICAL; + else if (pg_strcasecmp(value, "no") == 0) + uvalue = UCOL_OFF; + else if (pg_strcasecmp(value, "yes") == 0) + uvalue = UCOL_ON; + else if (pg_strcasecmp(value, "shifted") == 0) + uvalue = UCOL_SHIFTED; + else if (pg_strcasecmp(value, "non-ignorable") == 0) + uvalue = UCOL_NON_IGNORABLE; + else if (pg_strcasecmp(value, "lower") == 0) + uvalue = UCOL_LOWER_FIRST; + else if (pg_strcasecmp(value, "upper") == 0) + uvalue = UCOL_UPPER_FIRST; + else + status = U_ILLEGAL_ARGUMENT_ERROR; + + if (status == U_ZERO_ERROR) + ucol_setAttribute(collator, uattr, uvalue, &status); + + /* + * Pretend the error came from ucol_open(), for consistent error + * message across ICU versions. + */ + if (U_FAILURE(status)) + { + pg_log_error("could not open collator for locale \"%s\": %s", + loc, u_errorName(status)); + exit(1); + } + } + } +} +#endif + /* * set up the locale variables * @@ -2209,16 +2312,20 @@ setlocales(void) */ #ifdef USE_ICU { + UCollator *collator; UErrorCode status; status = U_ZERO_ERROR; - ucol_open(icu_locale, &status); + collator = ucol_open(icu_locale, &status); if (U_FAILURE(status)) { pg_log_error("could not open collator for locale \"%s\": %s", icu_locale, u_errorName(status)); exit(1); } + if (U_ICU_VERSION_MAJOR_NUM < 54) + icu_set_collation_attributes(collator, icu_locale); + ucol_close(collator); } #else pg_log_error("ICU is not supported in this build"); diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 9b158f24a0..297d14d918 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -115,6 +115,7 @@ extern char *get_collation_actual_version(char collprovider, const char *collcol #ifdef USE_ICU extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes); extern int32_t icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar); +extern void check_icu_locale(const char *icu_locale); #endif /* These functions convert from/to libc's wchar_t, *not* pg_wchar_t */ -- 2.33.1