(moving to -hackers)

On Fri, Mar 18, 2022 at 03:40:51PM +0800, Julien Rouhaud wrote:
> On Fri, Mar 18, 2022 at 02:36:48PM +0800, Julien Rouhaud wrote:
> > On Fri, Mar 18, 2022 at 06:15:47PM +1300, Thomas Munro wrote:
> > >
> > > No idea what's happening here but one observation is that that animal
> > > is running an older distro that shipped with ICU 5.0.  Commit b8f9a2a6
> > > may hold a clue...
> >
> > Right.  I'm setting up a similar podman environment, hopefully more info 
> > soon.
> 
> And indeed b8f9a2a6 is the problem.  We would need some form of
> icu_set_collation_attributes() on the frontend side if we want to detect such 
> a
> problem on older ICU version at the expected moment rather than when
> bootstrapping the info.  A similar check is also needed in createdb().

I'm attaching a patch that fixes both issues for me with ICU 50.  Note that
there's already a test that would have failed for CREATE DATABASE if initdb
tests didn't fail first, so no new test needed.

I ended up copy/pasting icu_set_collation_attributes() in initdb.c.  There
shouldn't be new attributes added in old ICU versions, and there are enough
differences to make it work in the frontend that it didn't seems worth to have
a single function.
>From 52ae34ed5413f3274bc6c9a96847ec762f6e388d Mon Sep 17 00:00:00 2001
From: Julien Rouhaud <julien.rouh...@free.fr>
Date: Fri, 18 Mar 2022 16:20:01 +0800
Subject: [PATCH v1] Fix global icu collations for ICU < 54

Multiple call sites didn't check for collation attributes validity, which has
to be done explicitly on ICU < 54.
---
 src/backend/commands/dbcommands.c |   9 +--
 src/backend/utils/adt/pg_locale.c |  24 +++++++
 src/bin/initdb/initdb.c           | 109 +++++++++++++++++++++++++++++-
 src/include/utils/pg_locale.h     |   1 +
 4 files changed, 134 insertions(+), 9 deletions(-)

diff --git a/src/backend/commands/dbcommands.c 
b/src/backend/commands/dbcommands.c
index 2e1af642e4..104e45b66d 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -460,14 +460,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
        if (dblocprovider == COLLPROVIDER_ICU)
        {
 #ifdef USE_ICU
-               UErrorCode  status;
-
-               status = U_ZERO_ERROR;
-               ucol_open(dbiculocale, &status);
-               if (U_FAILURE(status))
-                       ereport(ERROR,
-                                       (errmsg("could not open collator for 
locale \"%s\": %s",
-                                                       dbiculocale, 
u_errorName(status))));
+               check_icu_locale(dbiculocale);
 #else
                ereport(ERROR,
                                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
diff --git a/src/backend/utils/adt/pg_locale.c 
b/src/backend/utils/adt/pg_locale.c
index 4019255f8e..bc0a038168 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1854,6 +1854,9 @@ icu_to_uchar(UChar **buff_uchar, const char *buff, size_t 
nbytes)
  * function's result is the number of bytes generated (not counting nul).
  *
  * The result string is nul-terminated.
+ *
+ * If you modify this function, modify accordingly the similar frontend
+ * function in initdb.c
  */
 int32_t
 icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar)
@@ -1884,6 +1887,27 @@ icu_from_uchar(char **result, const UChar *buff_uchar, 
int32_t len_uchar)
        return len_result;
 }
 
+/*
+ * Check if the given locale ID is valid, and ereport(ERROR) if it isn't.
+ */
+void
+check_icu_locale(const char *icu_locale)
+{
+       UCollator  *collator;
+       UErrorCode  status;
+
+       status = U_ZERO_ERROR;
+       collator = ucol_open(icu_locale, &status);
+       if (U_FAILURE(status))
+               ereport(ERROR,
+                               (errmsg("could not open collator for locale 
\"%s\": %s",
+                                               icu_locale, 
u_errorName(status))));
+
+       if (U_ICU_VERSION_MAJOR_NUM < 54)
+               icu_set_collation_attributes(collator, icu_locale);
+       ucol_close(collator);
+}
+
 /*
  * Parse collation attributes and apply them to the open collator.  This takes
  * a string like "und@colStrength=primary;colCaseLevel=yes" and parses and
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index cbcd55288f..ee6243560b 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -282,6 +282,10 @@ static int locale_date_order(const char *locale);
 static void check_locale_name(int category, const char *locale,
                                                          char **canonname);
 static bool check_locale_encoding(const char *locale, int encoding);
+#ifdef USE_ICU
+static void icu_set_collation_attributes(UCollator *collator,
+                                                                               
 const char *loc);
+#endif
 static void setlocales(void);
 static void usage(const char *progname);
 void           setup_pgdata(void);
@@ -2145,6 +2149,105 @@ check_locale_encoding(const char *locale, int user_enc)
        return true;
 }
 
+#ifdef USE_ICU
+/*
+ * Frontend version of the same function in pg_locale.c.
+ *
+ * If you modify this function, modify accordingly the similar backend function
+ * in pg_locale.c
+ */
+static void
+icu_set_collation_attributes(UCollator *collator, const char *loc)
+{
+       char       *str = pg_strdup(loc);
+
+       str = strchr(str, '@');
+       if (!str)
+               return;
+       str++;
+
+       for (char *token = strtok(str, ";"); token; token = strtok(NULL, ";"))
+       {
+               char       *e = strchr(token, '=');
+
+               if (e)
+               {
+                       char       *name;
+                       char       *value;
+                       UColAttribute uattr;
+                       UColAttributeValue uvalue;
+                       UErrorCode      status;
+
+                       status = U_ZERO_ERROR;
+
+                       *e = '\0';
+                       name = token;
+                       value = e + 1;
+
+                       /*
+                        * See attribute name and value lists in ICU 
i18n/coll.cpp
+                        */
+                       if (pg_strcasecmp(name, "colstrength") == 0)
+                               uattr = UCOL_STRENGTH;
+                       else if (pg_strcasecmp(name, "colbackwards") == 0)
+                               uattr = UCOL_FRENCH_COLLATION;
+                       else if (pg_strcasecmp(name, "colcaselevel") == 0)
+                               uattr = UCOL_CASE_LEVEL;
+                       else if (pg_strcasecmp(name, "colcasefirst") == 0)
+                               uattr = UCOL_CASE_FIRST;
+                       else if (pg_strcasecmp(name, "colalternate") == 0)
+                               uattr = UCOL_ALTERNATE_HANDLING;
+                       else if (pg_strcasecmp(name, "colnormalization") == 0)
+                               uattr = UCOL_NORMALIZATION_MODE;
+                       else if (pg_strcasecmp(name, "colnumeric") == 0)
+                               uattr = UCOL_NUMERIC_COLLATION;
+                       else
+                               /* ignore if unknown */
+                               continue;
+
+                       if (pg_strcasecmp(value, "primary") == 0)
+                               uvalue = UCOL_PRIMARY;
+                       else if (pg_strcasecmp(value, "secondary") == 0)
+                               uvalue = UCOL_SECONDARY;
+                       else if (pg_strcasecmp(value, "tertiary") == 0)
+                               uvalue = UCOL_TERTIARY;
+                       else if (pg_strcasecmp(value, "quaternary") == 0)
+                               uvalue = UCOL_QUATERNARY;
+                       else if (pg_strcasecmp(value, "identical") == 0)
+                               uvalue = UCOL_IDENTICAL;
+                       else if (pg_strcasecmp(value, "no") == 0)
+                               uvalue = UCOL_OFF;
+                       else if (pg_strcasecmp(value, "yes") == 0)
+                               uvalue = UCOL_ON;
+                       else if (pg_strcasecmp(value, "shifted") == 0)
+                               uvalue = UCOL_SHIFTED;
+                       else if (pg_strcasecmp(value, "non-ignorable") == 0)
+                               uvalue = UCOL_NON_IGNORABLE;
+                       else if (pg_strcasecmp(value, "lower") == 0)
+                               uvalue = UCOL_LOWER_FIRST;
+                       else if (pg_strcasecmp(value, "upper") == 0)
+                               uvalue = UCOL_UPPER_FIRST;
+                       else
+                               status = U_ILLEGAL_ARGUMENT_ERROR;
+
+                       if (status == U_ZERO_ERROR)
+                               ucol_setAttribute(collator, uattr, uvalue, 
&status);
+
+                       /*
+                        * Pretend the error came from ucol_open(), for 
consistent error
+                        * message across ICU versions.
+                        */
+                       if (U_FAILURE(status))
+                       {
+                               pg_log_error("could not open collator for 
locale \"%s\": %s",
+                                                        loc, 
u_errorName(status));
+                               exit(1);
+                       }
+               }
+       }
+}
+#endif
+
 /*
  * set up the locale variables
  *
@@ -2209,16 +2312,20 @@ setlocales(void)
                 */
 #ifdef USE_ICU
                {
+                       UCollator  *collator;
                        UErrorCode      status;
 
                        status = U_ZERO_ERROR;
-                       ucol_open(icu_locale, &status);
+                       collator = ucol_open(icu_locale, &status);
                        if (U_FAILURE(status))
                        {
                                pg_log_error("could not open collator for 
locale \"%s\": %s",
                                                         icu_locale, 
u_errorName(status));
                                exit(1);
                        }
+                       if (U_ICU_VERSION_MAJOR_NUM < 54)
+                               icu_set_collation_attributes(collator, 
icu_locale);
+                       ucol_close(collator);
                }
 #else
                pg_log_error("ICU is not supported in this build");
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 9b158f24a0..297d14d918 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -115,6 +115,7 @@ extern char *get_collation_actual_version(char 
collprovider, const char *collcol
 #ifdef USE_ICU
 extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t 
nbytes);
 extern int32_t icu_from_uchar(char **result, const UChar *buff_uchar, int32_t 
len_uchar);
+extern void check_icu_locale(const char *icu_locale);
 #endif
 
 /* These functions convert from/to libc's wchar_t, *not* pg_wchar_t */
-- 
2.33.1

Reply via email to