On Tue, 2023-06-20 at 12:16 -0400, Tom Lane wrote: > Jeff Davis <pg...@j-davis.com> writes: > > Status on collation loose ends: > > This all sounds good to me.
Patches attached. 0001 also removes the code to get a default locale when ICU is being used, because that was a part of the same commit that changed the default provider to be ICU and I don't see a lot of value in keeping just that part. I'm planning to commit something similar to the attached patches tomorrow (Wednesday) unless I get more input. Regards, Jeff Davis
From 1aaac6e154d9a1e4f728d732fd41ee263db6903e Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Tue, 20 Jun 2023 12:03:26 -0700 Subject: [PATCH v1 1/2] initdb: change default --locale-provider back to libc. Reverts 27b62377b4. Discussion: https://postgr.es/m/eff031036baa07f325de29215371a4c9e69d61f3.ca...@j-davis.com Discussion: https://postgr.es/m/3353947.1682092...@sss.pgh.pa.us --- doc/src/sgml/ref/initdb.sgml | 42 +++++++------------ src/bin/initdb/initdb.c | 23 +--------- src/bin/initdb/t/001_initdb.pl | 5 +++ src/bin/pg_dump/t/002_pg_dump.pl | 2 +- src/bin/scripts/t/020_createdb.pl | 2 +- src/test/icu/t/010_database.pl | 2 +- .../regress/expected/collate.icu.utf8.out | 4 +- src/test/regress/sql/collate.icu.utf8.sql | 4 +- 8 files changed, 29 insertions(+), 55 deletions(-) diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml index f850dc404d..22f1011781 100644 --- a/doc/src/sgml/ref/initdb.sgml +++ b/doc/src/sgml/ref/initdb.sgml @@ -93,24 +93,10 @@ PostgreSQL documentation </para> <para> - By default, <command>initdb</command> uses the ICU library to provide - locale services if the server was built with ICU support; otherwise it uses - the <literal>libc</literal> locale provider (see <xref - linkend="locale-providers"/>). To choose the specific ICU locale ID to - apply, use the option <option>--icu-locale</option>. Note that for - implementation reasons and to support legacy code, - <command>initdb</command> will still select and initialize libc locale - settings when the ICU locale provider is used. - </para> - - <para> - Alternatively, <command>initdb</command> can use the locale provider - <literal>libc</literal>. To select this option, specify - <literal>--locale-provider=libc</literal>, or build the server without ICU - support. The <literal>libc</literal> locale provider takes the locale - settings from the environment, and determines the encoding from the locale - settings. This is almost always sufficient, unless there are special - requirements. + By default, <command>initdb</command> uses the locale provider + <literal>libc</literal> (see <xref linkend="locale-providers"/>). The + <literal>libc</literal> locale provider takes the locale settings from the + environment, and determines the encoding from the locale settings. </para> <para> @@ -122,6 +108,16 @@ PostgreSQL documentation this should be used with care. </para> + <para> + Alternatively, <command>initdb</command> can use the ICU library to provide + locale services by specifying <literal>--locale-provider=icu</literal>. The + server must be built with ICU support. To choose the specific ICU locale ID + to apply, use the option <option>--icu-locale</option>. Note that for + implementation reasons and to support legacy code, + <command>initdb</command> will still select and initialize libc locale + settings when the ICU locale provider is used. + </para> + <para> When <command>initdb</command> runs, it will print out the locale settings it has chosen. If you have complex requirements or specified multiple @@ -251,11 +247,6 @@ PostgreSQL documentation Specifies the ICU locale when the ICU provider is used. Locale support is described in <xref linkend="locale"/>. </para> - <para> - If this option is not specified, the locale is inherited from the - environment in which <command>initdb</command> runs. The environment's - locale is matched to a similar ICU locale name, if possible. - </para> </listitem> </varlistentry> @@ -330,9 +321,8 @@ PostgreSQL documentation This option sets the locale provider for databases created in the new cluster. It can be overridden in the <command>CREATE DATABASE</command> command when new databases are subsequently - created. The default is <literal>icu</literal> if the server was - built with ICU support; otherwise the default is - <literal>libc</literal> (see <xref linkend="locale-providers"/>). + created. The default is <literal>libc</literal> (see <xref + linkend="locale-providers"/>). </para> </listitem> </varlistentry> diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 71a3d26c37..fa3af0d75c 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -143,11 +143,7 @@ static char *lc_monetary = NULL; static char *lc_numeric = NULL; static char *lc_time = NULL; static char *lc_messages = NULL; -#ifdef USE_ICU -static char locale_provider = COLLPROVIDER_ICU; -#else static char locale_provider = COLLPROVIDER_LIBC; -#endif static char *icu_locale = NULL; static char *icu_rules = NULL; static const char *default_text_search_config = NULL; @@ -2357,19 +2353,6 @@ icu_validate_locale(const char *loc_str) #endif } -/* - * Determine the default ICU locale - */ -static char * -default_icu_locale(void) -{ -#ifdef USE_ICU - return pg_strdup(uloc_getDefault()); -#else - pg_fatal("ICU is not supported in this build"); -#endif -} - /* * set up the locale variables * @@ -2429,10 +2412,7 @@ setlocales(void) /* acquire default locale from the environment, if not specified */ if (icu_locale == NULL) - { - icu_locale = default_icu_locale(); - printf(_("Using default ICU locale \"%s\".\n"), icu_locale); - } + pg_fatal("ICU locale must be specified"); /* canonicalize to a language tag */ langtag = icu_language_tag(icu_locale); @@ -3273,7 +3253,6 @@ main(int argc, char *argv[]) break; case 8: locale = "C"; - locale_provider = COLLPROVIDER_LIBC; break; case 9: pwfilename = pg_strdup(optarg); diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl index 8b4acb7148..2d7469d2fc 100644 --- a/src/bin/initdb/t/001_initdb.pl +++ b/src/bin/initdb/t/001_initdb.pl @@ -103,6 +103,11 @@ SKIP: if ($ENV{with_icu} eq 'yes') { + command_fails_like( + [ 'initdb', '--no-sync', '--locale-provider=icu', "$tempdir/data2" ], + qr/initdb: error: ICU locale must be specified/, + 'locale provider ICU requires --icu-locale'); + command_ok( [ 'initdb', '--no-sync', diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl index 15852188c4..63bb4689d4 100644 --- a/src/bin/pg_dump/t/002_pg_dump.pl +++ b/src/bin/pg_dump/t/002_pg_dump.pl @@ -1965,7 +1965,7 @@ my %tests = ( create_sql => "CREATE DATABASE dump_test2 LOCALE = 'C' TEMPLATE = template0;", regexp => qr/^ - \QCREATE DATABASE dump_test2 \E.*\QLOCALE = 'C'\E + \QCREATE DATABASE dump_test2 \E.*\QLOCALE = 'C';\E /xm, like => { pg_dumpall_dbprivs => 1, }, }, diff --git a/src/bin/scripts/t/020_createdb.pl b/src/bin/scripts/t/020_createdb.pl index 57383561a0..40291924e5 100644 --- a/src/bin/scripts/t/020_createdb.pl +++ b/src/bin/scripts/t/020_createdb.pl @@ -13,7 +13,7 @@ program_version_ok('createdb'); program_options_handling_ok('createdb'); my $node = PostgreSQL::Test::Cluster->new('main'); -$node->init(extra => ['--locale-provider=libc']); +$node->init; $node->start; $node->issues_sql_like( diff --git a/src/test/icu/t/010_database.pl b/src/test/icu/t/010_database.pl index cbe5467f3c..3beee2ff96 100644 --- a/src/test/icu/t/010_database.pl +++ b/src/test/icu/t/010_database.pl @@ -12,7 +12,7 @@ if ($ENV{with_icu} ne 'yes') } my $node1 = PostgreSQL::Test::Cluster->new('node1'); -$node1->init(extra => ['--locale-provider=libc']); +$node1->init; $node1->start; $node1->safe_psql('postgres', diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out index dc96e590f7..b7fbee447f 100644 --- a/src/test/regress/expected/collate.icu.utf8.out +++ b/src/test/regress/expected/collate.icu.utf8.out @@ -1023,7 +1023,7 @@ SET client_min_messages TO WARNING; do $$ BEGIN EXECUTE 'CREATE COLLATION test0 (provider = icu, locale = ' || - quote_literal((SELECT daticulocale FROM pg_database WHERE datname = current_database())) || ');'; + quote_literal((SELECT CASE WHEN datlocprovider='i' THEN daticulocale ELSE datcollate END FROM pg_database WHERE datname = current_database())) || ');'; END $$; CREATE COLLATION test0 FROM "C"; -- fail, duplicate name @@ -1031,7 +1031,7 @@ ERROR: collation "test0" already exists do $$ BEGIN EXECUTE 'CREATE COLLATION test1 (provider = icu, locale = ' || - quote_literal((SELECT daticulocale FROM pg_database WHERE datname = current_database())) || ');'; + quote_literal((SELECT CASE WHEN datlocprovider='i' THEN daticulocale ELSE datcollate END FROM pg_database WHERE datname = current_database())) || ');'; END $$; RESET client_min_messages; diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql index a8001b4b8e..079d7ae39d 100644 --- a/src/test/regress/sql/collate.icu.utf8.sql +++ b/src/test/regress/sql/collate.icu.utf8.sql @@ -362,14 +362,14 @@ SET client_min_messages TO WARNING; do $$ BEGIN EXECUTE 'CREATE COLLATION test0 (provider = icu, locale = ' || - quote_literal((SELECT daticulocale FROM pg_database WHERE datname = current_database())) || ');'; + quote_literal((SELECT CASE WHEN datlocprovider='i' THEN daticulocale ELSE datcollate END FROM pg_database WHERE datname = current_database())) || ');'; END $$; CREATE COLLATION test0 FROM "C"; -- fail, duplicate name do $$ BEGIN EXECUTE 'CREATE COLLATION test1 (provider = icu, locale = ' || - quote_literal((SELECT daticulocale FROM pg_database WHERE datname = current_database())) || ');'; + quote_literal((SELECT CASE WHEN datlocprovider='i' THEN daticulocale ELSE datcollate END FROM pg_database WHERE datname = current_database())) || ');'; END $$; -- 2.34.1
From e339123ae9c41a1161d516173e67c543d5542a60 Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Thu, 15 Jun 2023 15:18:50 -0700 Subject: [PATCH v1 2/2] ICU: do not convert locale 'C' to 'en-US-u-va-posix'. Older versions of ICU canonicalize "C" to "en-US-u-va-posix"; but starting in ICU version 64, the "C" locale is considered obsolete. Postgres commit ea1db8ae70 introduced code to always canonicalize "C" to "en-US-u-va-posix" for consistency and convenience, but it was deemed too confusing. This commit removes that code, so that "C" is treated like other ICU locale names: canonicalization is attempted, and if it fails, the behavior is controlled by icu_validation_level. A similar change was previously committed as f7faa9976c, then reverted due to an ICU-version-dependent test failure. This commit un-reverts it, omitting the test because we now expect the behavior to depend on the version of ICU being used. Discussion: https://postgr.es/m/3a200aca-4672-4b37-fc91-5d198a323503%40eisentraut.org Discussion: https://postgr.es/m/f83f089ee1e9acd5dbbbf3353294d24e1f196e95.ca...@j-davis.com Discussion: https://postgr.es/m/37520ec1ae9591f83132f82dbd625f3fc2d69c16.ca...@j-davis.com --- src/backend/utils/adt/pg_locale.c | 27 ++++--------------- src/bin/initdb/initdb.c | 25 ++++------------- .../regress/expected/collate.icu.utf8.out | 2 ++ src/test/regress/sql/collate.icu.utf8.sql | 2 ++ 4 files changed, 14 insertions(+), 42 deletions(-) diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 31e3b16ae0..69a2fb3376 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -2783,26 +2783,10 @@ char * icu_language_tag(const char *loc_str, int elevel) { #ifdef USE_ICU - UErrorCode status; - char lang[ULOC_LANG_CAPACITY]; - char *langtag; - size_t buflen = 32; /* arbitrary starting buffer size */ - const bool strict = true; - - status = U_ZERO_ERROR; - uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status); - if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) - { - if (elevel > 0) - ereport(elevel, - (errmsg("could not get language from locale \"%s\": %s", - loc_str, u_errorName(status)))); - return NULL; - } - - /* C/POSIX locales aren't handled by uloc_getLanguageTag() */ - if (strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0) - return pstrdup("en-US-u-va-posix"); + UErrorCode status; + char *langtag; + size_t buflen = 32; /* arbitrary starting buffer size */ + const bool strict = true; /* * A BCP47 language tag doesn't have a clearly-defined upper limit (cf. @@ -2884,8 +2868,7 @@ icu_validate_locale(const char *loc_str) /* check for special language name */ if (strcmp(lang, "") == 0 || - strcmp(lang, "root") == 0 || strcmp(lang, "und") == 0 || - strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0) + strcmp(lang, "root") == 0 || strcmp(lang, "und") == 0) found = true; /* search for matching language within ICU */ diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index fa3af0d75c..a5ce1f66d5 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -2243,24 +2243,10 @@ static char * icu_language_tag(const char *loc_str) { #ifdef USE_ICU - UErrorCode status; - char lang[ULOC_LANG_CAPACITY]; - char *langtag; - size_t buflen = 32; /* arbitrary starting buffer size */ - const bool strict = true; - - status = U_ZERO_ERROR; - uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status); - if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) - { - pg_fatal("could not get language from locale \"%s\": %s", - loc_str, u_errorName(status)); - return NULL; - } - - /* C/POSIX locales aren't handled by uloc_getLanguageTag() */ - if (strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0) - return pstrdup("en-US-u-va-posix"); + UErrorCode status; + char *langtag; + size_t buflen = 32; /* arbitrary starting buffer size */ + const bool strict = true; /* * A BCP47 language tag doesn't have a clearly-defined upper limit (cf. @@ -2326,8 +2312,7 @@ icu_validate_locale(const char *loc_str) /* check for special language name */ if (strcmp(lang, "") == 0 || - strcmp(lang, "root") == 0 || strcmp(lang, "und") == 0 || - strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0) + strcmp(lang, "root") == 0 || strcmp(lang, "und") == 0) found = true; /* search for matching language within ICU */ diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out index b7fbee447f..78a9cb38fa 100644 --- a/src/test/regress/expected/collate.icu.utf8.out +++ b/src/test/regress/expected/collate.icu.utf8.out @@ -1020,6 +1020,7 @@ CREATE ROLE regress_test_role; CREATE SCHEMA test_schema; -- We need to do this this way to cope with varying names for encodings: SET client_min_messages TO WARNING; +SET icu_validation_level = disabled; do $$ BEGIN EXECUTE 'CREATE COLLATION test0 (provider = icu, locale = ' || @@ -1034,6 +1035,7 @@ BEGIN quote_literal((SELECT CASE WHEN datlocprovider='i' THEN daticulocale ELSE datcollate END FROM pg_database WHERE datname = current_database())) || ');'; END $$; +RESET icu_validation_level; RESET client_min_messages; CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, needs "locale" ERROR: parameter "locale" must be specified diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql index 079d7ae39d..3db9e25913 100644 --- a/src/test/regress/sql/collate.icu.utf8.sql +++ b/src/test/regress/sql/collate.icu.utf8.sql @@ -358,6 +358,7 @@ CREATE SCHEMA test_schema; -- We need to do this this way to cope with varying names for encodings: SET client_min_messages TO WARNING; +SET icu_validation_level = disabled; do $$ BEGIN @@ -373,6 +374,7 @@ BEGIN END $$; +RESET icu_validation_level; RESET client_min_messages; CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, needs "locale" -- 2.34.1