On Tue, 2023-06-20 at 12:16 -0400, Tom Lane wrote:
> Jeff Davis <pg...@j-davis.com> writes:
> > Status on collation loose ends:
> 
> This all sounds good to me.

Patches attached.

0001 also removes the code to get a default locale when ICU is being
used, because that was a part of the same commit that changed the
default provider to be ICU and I don't see a lot of value in keeping
just that part.

I'm planning to commit something similar to the attached patches
tomorrow (Wednesday) unless I get more input.

Regards,
        Jeff Davis

From 1aaac6e154d9a1e4f728d732fd41ee263db6903e Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Tue, 20 Jun 2023 12:03:26 -0700
Subject: [PATCH v1 1/2] initdb: change default --locale-provider back to libc.

Reverts 27b62377b4.

Discussion: https://postgr.es/m/eff031036baa07f325de29215371a4c9e69d61f3.ca...@j-davis.com
Discussion: https://postgr.es/m/3353947.1682092...@sss.pgh.pa.us
---
 doc/src/sgml/ref/initdb.sgml                  | 42 +++++++------------
 src/bin/initdb/initdb.c                       | 23 +---------
 src/bin/initdb/t/001_initdb.pl                |  5 +++
 src/bin/pg_dump/t/002_pg_dump.pl              |  2 +-
 src/bin/scripts/t/020_createdb.pl             |  2 +-
 src/test/icu/t/010_database.pl                |  2 +-
 .../regress/expected/collate.icu.utf8.out     |  4 +-
 src/test/regress/sql/collate.icu.utf8.sql     |  4 +-
 8 files changed, 29 insertions(+), 55 deletions(-)

diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml
index f850dc404d..22f1011781 100644
--- a/doc/src/sgml/ref/initdb.sgml
+++ b/doc/src/sgml/ref/initdb.sgml
@@ -93,24 +93,10 @@ PostgreSQL documentation
   </para>
 
   <para>
-   By default, <command>initdb</command> uses the ICU library to provide
-   locale services if the server was built with ICU support; otherwise it uses
-   the <literal>libc</literal> locale provider (see <xref
-   linkend="locale-providers"/>). To choose the specific ICU locale ID to
-   apply, use the option <option>--icu-locale</option>.  Note that for
-   implementation reasons and to support legacy code,
-   <command>initdb</command> will still select and initialize libc locale
-   settings when the ICU locale provider is used.
-  </para>
-
-  <para>
-   Alternatively, <command>initdb</command> can use the locale provider
-   <literal>libc</literal>. To select this option, specify
-   <literal>--locale-provider=libc</literal>, or build the server without ICU
-   support. The <literal>libc</literal> locale provider takes the locale
-   settings from the environment, and determines the encoding from the locale
-   settings.  This is almost always sufficient, unless there are special
-   requirements.
+   By default, <command>initdb</command> uses the locale provider
+   <literal>libc</literal> (see <xref linkend="locale-providers"/>). The
+   <literal>libc</literal> locale provider takes the locale settings from the
+   environment, and determines the encoding from the locale settings.
   </para>
 
   <para>
@@ -122,6 +108,16 @@ PostgreSQL documentation
    this should be used with care.
   </para>
 
+  <para>
+   Alternatively, <command>initdb</command> can use the ICU library to provide
+   locale services by specifying <literal>--locale-provider=icu</literal>. The
+   server must be built with ICU support. To choose the specific ICU locale ID
+   to apply, use the option <option>--icu-locale</option>.  Note that for
+   implementation reasons and to support legacy code,
+   <command>initdb</command> will still select and initialize libc locale
+   settings when the ICU locale provider is used.
+  </para>
+
   <para>
    When <command>initdb</command> runs, it will print out the locale settings
    it has chosen.  If you have complex requirements or specified multiple
@@ -251,11 +247,6 @@ PostgreSQL documentation
         Specifies the ICU locale when the ICU provider is used. Locale support
         is described in <xref linkend="locale"/>.
        </para>
-       <para>
-        If this option is not specified, the locale is inherited from the
-        environment in which <command>initdb</command> runs. The environment's
-        locale is matched to a similar ICU locale name, if possible.
-       </para>
       </listitem>
      </varlistentry>
 
@@ -330,9 +321,8 @@ PostgreSQL documentation
         This option sets the locale provider for databases created in the new
         cluster.  It can be overridden in the <command>CREATE
         DATABASE</command> command when new databases are subsequently
-        created.  The default is <literal>icu</literal> if the server was
-        built with ICU support; otherwise the default is
-        <literal>libc</literal> (see <xref linkend="locale-providers"/>).
+        created.  The default is <literal>libc</literal> (see <xref
+        linkend="locale-providers"/>).
        </para>
       </listitem>
      </varlistentry>
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 71a3d26c37..fa3af0d75c 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -143,11 +143,7 @@ static char *lc_monetary = NULL;
 static char *lc_numeric = NULL;
 static char *lc_time = NULL;
 static char *lc_messages = NULL;
-#ifdef USE_ICU
-static char locale_provider = COLLPROVIDER_ICU;
-#else
 static char locale_provider = COLLPROVIDER_LIBC;
-#endif
 static char *icu_locale = NULL;
 static char *icu_rules = NULL;
 static const char *default_text_search_config = NULL;
@@ -2357,19 +2353,6 @@ icu_validate_locale(const char *loc_str)
 #endif
 }
 
-/*
- * Determine the default ICU locale
- */
-static char *
-default_icu_locale(void)
-{
-#ifdef USE_ICU
-	return pg_strdup(uloc_getDefault());
-#else
-	pg_fatal("ICU is not supported in this build");
-#endif
-}
-
 /*
  * set up the locale variables
  *
@@ -2429,10 +2412,7 @@ setlocales(void)
 
 		/* acquire default locale from the environment, if not specified */
 		if (icu_locale == NULL)
-		{
-			icu_locale = default_icu_locale();
-			printf(_("Using default ICU locale \"%s\".\n"), icu_locale);
-		}
+			pg_fatal("ICU locale must be specified");
 
 		/* canonicalize to a language tag */
 		langtag = icu_language_tag(icu_locale);
@@ -3273,7 +3253,6 @@ main(int argc, char *argv[])
 				break;
 			case 8:
 				locale = "C";
-				locale_provider = COLLPROVIDER_LIBC;
 				break;
 			case 9:
 				pwfilename = pg_strdup(optarg);
diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl
index 8b4acb7148..2d7469d2fc 100644
--- a/src/bin/initdb/t/001_initdb.pl
+++ b/src/bin/initdb/t/001_initdb.pl
@@ -103,6 +103,11 @@ SKIP:
 
 if ($ENV{with_icu} eq 'yes')
 {
+	command_fails_like(
+		[ 'initdb', '--no-sync', '--locale-provider=icu', "$tempdir/data2" ],
+		qr/initdb: error: ICU locale must be specified/,
+		'locale provider ICU requires --icu-locale');
+
 	command_ok(
 		[
 			'initdb', '--no-sync',
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index 15852188c4..63bb4689d4 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -1965,7 +1965,7 @@ my %tests = (
 		create_sql =>
 		  "CREATE DATABASE dump_test2 LOCALE = 'C' TEMPLATE = template0;",
 		regexp => qr/^
-			\QCREATE DATABASE dump_test2 \E.*\QLOCALE = 'C'\E
+			\QCREATE DATABASE dump_test2 \E.*\QLOCALE = 'C';\E
 			/xm,
 		like => { pg_dumpall_dbprivs => 1, },
 	},
diff --git a/src/bin/scripts/t/020_createdb.pl b/src/bin/scripts/t/020_createdb.pl
index 57383561a0..40291924e5 100644
--- a/src/bin/scripts/t/020_createdb.pl
+++ b/src/bin/scripts/t/020_createdb.pl
@@ -13,7 +13,7 @@ program_version_ok('createdb');
 program_options_handling_ok('createdb');
 
 my $node = PostgreSQL::Test::Cluster->new('main');
-$node->init(extra => ['--locale-provider=libc']);
+$node->init;
 $node->start;
 
 $node->issues_sql_like(
diff --git a/src/test/icu/t/010_database.pl b/src/test/icu/t/010_database.pl
index cbe5467f3c..3beee2ff96 100644
--- a/src/test/icu/t/010_database.pl
+++ b/src/test/icu/t/010_database.pl
@@ -12,7 +12,7 @@ if ($ENV{with_icu} ne 'yes')
 }
 
 my $node1 = PostgreSQL::Test::Cluster->new('node1');
-$node1->init(extra => ['--locale-provider=libc']);
+$node1->init;
 $node1->start;
 
 $node1->safe_psql('postgres',
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
index dc96e590f7..b7fbee447f 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -1023,7 +1023,7 @@ SET client_min_messages TO WARNING;
 do $$
 BEGIN
   EXECUTE 'CREATE COLLATION test0 (provider = icu, locale = ' ||
-          quote_literal((SELECT daticulocale FROM pg_database WHERE datname = current_database())) || ');';
+          quote_literal((SELECT CASE WHEN datlocprovider='i' THEN daticulocale ELSE datcollate END FROM pg_database WHERE datname = current_database())) || ');';
 END
 $$;
 CREATE COLLATION test0 FROM "C"; -- fail, duplicate name
@@ -1031,7 +1031,7 @@ ERROR:  collation "test0" already exists
 do $$
 BEGIN
   EXECUTE 'CREATE COLLATION test1 (provider = icu, locale = ' ||
-          quote_literal((SELECT daticulocale FROM pg_database WHERE datname = current_database())) || ');';
+          quote_literal((SELECT CASE WHEN datlocprovider='i' THEN daticulocale ELSE datcollate END FROM pg_database WHERE datname = current_database())) || ');';
 END
 $$;
 RESET client_min_messages;
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
index a8001b4b8e..079d7ae39d 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -362,14 +362,14 @@ SET client_min_messages TO WARNING;
 do $$
 BEGIN
   EXECUTE 'CREATE COLLATION test0 (provider = icu, locale = ' ||
-          quote_literal((SELECT daticulocale FROM pg_database WHERE datname = current_database())) || ');';
+          quote_literal((SELECT CASE WHEN datlocprovider='i' THEN daticulocale ELSE datcollate END FROM pg_database WHERE datname = current_database())) || ');';
 END
 $$;
 CREATE COLLATION test0 FROM "C"; -- fail, duplicate name
 do $$
 BEGIN
   EXECUTE 'CREATE COLLATION test1 (provider = icu, locale = ' ||
-          quote_literal((SELECT daticulocale FROM pg_database WHERE datname = current_database())) || ');';
+          quote_literal((SELECT CASE WHEN datlocprovider='i' THEN daticulocale ELSE datcollate END FROM pg_database WHERE datname = current_database())) || ');';
 END
 $$;
 
-- 
2.34.1

From e339123ae9c41a1161d516173e67c543d5542a60 Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Thu, 15 Jun 2023 15:18:50 -0700
Subject: [PATCH v1 2/2] ICU: do not convert locale 'C' to 'en-US-u-va-posix'.

Older versions of ICU canonicalize "C" to "en-US-u-va-posix"; but
starting in ICU version 64, the "C" locale is considered
obsolete. Postgres commit ea1db8ae70 introduced code to always
canonicalize "C" to "en-US-u-va-posix" for consistency and
convenience, but it was deemed too confusing.

This commit removes that code, so that "C" is treated like other ICU
locale names: canonicalization is attempted, and if it fails, the
behavior is controlled by icu_validation_level.

A similar change was previously committed as f7faa9976c, then reverted
due to an ICU-version-dependent test failure. This commit un-reverts
it, omitting the test because we now expect the behavior to depend on
the version of ICU being used.

Discussion: https://postgr.es/m/3a200aca-4672-4b37-fc91-5d198a323503%40eisentraut.org
Discussion: https://postgr.es/m/f83f089ee1e9acd5dbbbf3353294d24e1f196e95.ca...@j-davis.com
Discussion: https://postgr.es/m/37520ec1ae9591f83132f82dbd625f3fc2d69c16.ca...@j-davis.com
---
 src/backend/utils/adt/pg_locale.c             | 27 ++++---------------
 src/bin/initdb/initdb.c                       | 25 ++++-------------
 .../regress/expected/collate.icu.utf8.out     |  2 ++
 src/test/regress/sql/collate.icu.utf8.sql     |  2 ++
 4 files changed, 14 insertions(+), 42 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 31e3b16ae0..69a2fb3376 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -2783,26 +2783,10 @@ char *
 icu_language_tag(const char *loc_str, int elevel)
 {
 #ifdef USE_ICU
-	UErrorCode	status;
-	char		lang[ULOC_LANG_CAPACITY];
-	char	   *langtag;
-	size_t		buflen = 32;	/* arbitrary starting buffer size */
-	const bool	strict = true;
-
-	status = U_ZERO_ERROR;
-	uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
-	if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
-	{
-		if (elevel > 0)
-			ereport(elevel,
-					(errmsg("could not get language from locale \"%s\": %s",
-							loc_str, u_errorName(status))));
-		return NULL;
-	}
-
-	/* C/POSIX locales aren't handled by uloc_getLanguageTag() */
-	if (strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0)
-		return pstrdup("en-US-u-va-posix");
+	UErrorCode	 status;
+	char		*langtag;
+	size_t		 buflen = 32;	/* arbitrary starting buffer size */
+	const bool	 strict = true;
 
 	/*
 	 * A BCP47 language tag doesn't have a clearly-defined upper limit (cf.
@@ -2884,8 +2868,7 @@ icu_validate_locale(const char *loc_str)
 
 	/* check for special language name */
 	if (strcmp(lang, "") == 0 ||
-		strcmp(lang, "root") == 0 || strcmp(lang, "und") == 0 ||
-		strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0)
+		strcmp(lang, "root") == 0 || strcmp(lang, "und") == 0)
 		found = true;
 
 	/* search for matching language within ICU */
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index fa3af0d75c..a5ce1f66d5 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -2243,24 +2243,10 @@ static char *
 icu_language_tag(const char *loc_str)
 {
 #ifdef USE_ICU
-	UErrorCode	status;
-	char		lang[ULOC_LANG_CAPACITY];
-	char	   *langtag;
-	size_t		buflen = 32;	/* arbitrary starting buffer size */
-	const bool	strict = true;
-
-	status = U_ZERO_ERROR;
-	uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
-	if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
-	{
-		pg_fatal("could not get language from locale \"%s\": %s",
-				 loc_str, u_errorName(status));
-		return NULL;
-	}
-
-	/* C/POSIX locales aren't handled by uloc_getLanguageTag() */
-	if (strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0)
-		return pstrdup("en-US-u-va-posix");
+	UErrorCode	 status;
+	char		*langtag;
+	size_t		 buflen = 32;	/* arbitrary starting buffer size */
+	const bool	 strict = true;
 
 	/*
 	 * A BCP47 language tag doesn't have a clearly-defined upper limit (cf.
@@ -2326,8 +2312,7 @@ icu_validate_locale(const char *loc_str)
 
 	/* check for special language name */
 	if (strcmp(lang, "") == 0 ||
-		strcmp(lang, "root") == 0 || strcmp(lang, "und") == 0 ||
-		strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0)
+		strcmp(lang, "root") == 0 || strcmp(lang, "und") == 0)
 		found = true;
 
 	/* search for matching language within ICU */
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
index b7fbee447f..78a9cb38fa 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -1020,6 +1020,7 @@ CREATE ROLE regress_test_role;
 CREATE SCHEMA test_schema;
 -- We need to do this this way to cope with varying names for encodings:
 SET client_min_messages TO WARNING;
+SET icu_validation_level = disabled;
 do $$
 BEGIN
   EXECUTE 'CREATE COLLATION test0 (provider = icu, locale = ' ||
@@ -1034,6 +1035,7 @@ BEGIN
           quote_literal((SELECT CASE WHEN datlocprovider='i' THEN daticulocale ELSE datcollate END FROM pg_database WHERE datname = current_database())) || ');';
 END
 $$;
+RESET icu_validation_level;
 RESET client_min_messages;
 CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, needs "locale"
 ERROR:  parameter "locale" must be specified
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
index 079d7ae39d..3db9e25913 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -358,6 +358,7 @@ CREATE SCHEMA test_schema;
 
 -- We need to do this this way to cope with varying names for encodings:
 SET client_min_messages TO WARNING;
+SET icu_validation_level = disabled;
 
 do $$
 BEGIN
@@ -373,6 +374,7 @@ BEGIN
 END
 $$;
 
+RESET icu_validation_level;
 RESET client_min_messages;
 
 CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, needs "locale"
-- 
2.34.1

Reply via email to