$SUBJECT (a new INITDB_LOCALE_PROVIDER environment variable for initdb)
makes it easier to test other providers, especially in the regression
tests.

For this to be useful, it should avoid throwing an error for plain
"initdb" (without locale flags specified), which means we need defaults
for the builtin locale and the ICU locale. I chose "C.UTF-8" and "und",
respectively (we could have environment variables for those too, but
that would raise some questions when --locale is also specified).

Another benefit is that this would make it easier to change the initdb
default, which is being discussed here:

https://www.postgresql.org/message-id/9b259f4c532943e428e9665122f37c099bab250e.ca...@j-davis.com

One annoyance is that the tests don't pass when
INITDB_LOCALE_PROVIDER=icu. That's because a lot of tests use either
--locale=C or --no-locale, and ICU doesn't have a way to interpret that.
We could force the provider to be builtin in that case, I suppose.

Another annoyance is that, if INITDB_LOCALE_PROVIDER=builtin, and
LC_CTYPE is not UTF-8-compatible, then we need to force LC_CTYPE=C.
That affects fewer things than it would with the libc provider, but it
still affects some things.

Regards,
        Jeff Davis

From e5876ac466d5158d3aafa1cf92dc54ff45a6b996 Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Thu, 17 Jul 2025 13:07:50 -0700
Subject: [PATCH v1] initdb: new environment variable INITDB_LOCALE_PROVIDER

Controls the locale provider used by initdb.

Also specifies defaults for both the builtin provider and ICU, so that
plain initdb (without locale arguments specified) will succeed for any
provider. For the builtin provider's UTF-8 based locales, if LC_CTYPE
is not compatible with UTF-8, forces LC_CTYPE=C to avoid such an
error.
---
 src/backend/commands/dbcommands.c             |  2 +-
 src/bin/initdb/initdb.c                       | 71 +++++++++++++++----
 src/bin/initdb/t/001_initdb.pl                | 11 +--
 src/bin/scripts/t/020_createdb.pl             | 69 ++++++++++--------
 .../modules/test_escape/t/001_test_escape.pl  |  2 +-
 5 files changed, 104 insertions(+), 51 deletions(-)

diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 502a45163c8..92a396b8406 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -1052,7 +1052,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 		dbctype = src_ctype;
 	if (dblocprovider == '\0')
 		dblocprovider = src_locprovider;
-	if (dblocale == NULL)
+	if (dblocale == NULL && dblocprovider == src_locprovider)
 		dblocale = src_locale;
 	if (dbicurules == NULL)
 		dbicurules = src_icurules;
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 62bbd08d9f6..60e5c9d4a31 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -82,6 +82,9 @@
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
 
+#define DEFAULT_LOCALE_PROVIDER		COLLPROVIDER_LIBC
+#define DEFAULT_BUILTIN_LOCALE		"C.UTF-8"
+#define DEFAULT_ICU_LOCALE			"und"
 
 /* Ideally this would be in a .h file, but it hardly seems worth the trouble */
 extern const char *select_default_timezone(const char *share_path);
@@ -144,7 +147,7 @@ static char *lc_monetary = NULL;
 static char *lc_numeric = NULL;
 static char *lc_time = NULL;
 static char *lc_messages = NULL;
-static char locale_provider = COLLPROVIDER_LIBC;
+static char locale_provider = '\0';
 static bool builtin_locale_specified = false;
 static char *datlocale = NULL;
 static bool icu_locale_specified = false;
@@ -2468,12 +2471,11 @@ setlocales(void)
 	lc_messages = canonname;
 #endif
 
-	if (locale_provider != COLLPROVIDER_LIBC && datlocale == NULL)
-		pg_fatal("locale must be specified if provider is %s",
-				 collprovider_name(locale_provider));
-
 	if (locale_provider == COLLPROVIDER_BUILTIN)
 	{
+		if (!datlocale)
+			datlocale = DEFAULT_BUILTIN_LOCALE;
+
 		if (strcmp(datlocale, "C") == 0)
 			canonname = "C";
 		else if (strcmp(datlocale, "C.UTF-8") == 0 ||
@@ -2491,11 +2493,13 @@ setlocales(void)
 	{
 		char	   *langtag;
 
+		if (!datlocale)
+			datlocale = DEFAULT_ICU_LOCALE;
+
 		/* canonicalize to a language tag */
 		langtag = icu_language_tag(datlocale);
 		printf(_("Using language tag \"%s\" for ICU locale \"%s\".\n"),
 			   langtag, datlocale);
-		pg_free(datlocale);
 		datlocale = langtag;
 
 		icu_validate_locale(datlocale);
@@ -2686,6 +2690,30 @@ setup_locale_encoding(void)
 {
 	setlocales();
 
+	/*
+	 * For the builtin provider (other than the "C" locale), default encoding
+	 * to UTF-8. If lc_ctype is not compatible with UTF-8, also force lc_ctype
+	 * to "C". On windows, all locales are compatible with UTF-8.
+	 */
+	if (!encoding && locale_provider == COLLPROVIDER_BUILTIN &&
+		strcmp(datlocale, "C") != 0)
+	{
+#ifndef WIN32
+		int			ctype_enc = pg_get_encoding_from_locale(lc_ctype, false);
+
+		if (!(ctype_enc == PG_UTF8 ||
+			  ctype_enc == PG_SQL_ASCII))
+		{
+			pg_log_warning("setting LC_CTYPE to \"C\"");
+			pg_log_warning_detail("Encoding of LC_CTYPE locale \"%s\" does not match encoding required by builtin locale \"%s\".",
+								  lc_ctype, datlocale);
+			pg_log_warning_hint("Specify a UTF-8 compatible locale with --lc-ctype, or choose a different locale provider.");
+			lc_ctype = "C";
+		}
+#endif
+		encoding = "UTF-8";
+	}
+
 	if (locale_provider == COLLPROVIDER_LIBC &&
 		strcmp(lc_ctype, lc_collate) == 0 &&
 		strcmp(lc_ctype, lc_time) == 0 &&
@@ -2721,10 +2749,11 @@ setup_locale_encoding(void)
 		ctype_enc = pg_get_encoding_from_locale(lc_ctype, true);
 
 		/*
-		 * If ctype_enc=SQL_ASCII, it's compatible with any encoding. ICU does
-		 * not support SQL_ASCII, so select UTF-8 instead.
+		 * If ctype_enc=SQL_ASCII, it's compatible with any encoding. Neither
+		 * ICU nor the builtin provider support SQL_ASCII, so select UTF-8
+		 * instead.
 		 */
-		if (locale_provider == COLLPROVIDER_ICU && ctype_enc == PG_SQL_ASCII)
+		if (locale_provider != COLLPROVIDER_LIBC && ctype_enc == PG_SQL_ASCII)
 			ctype_enc = PG_UTF8;
 
 		if (ctype_enc == -1)
@@ -2773,11 +2802,10 @@ setup_locale_encoding(void)
 		!check_locale_encoding(lc_collate, encodingid))
 		exit(1);				/* check_locale_encoding printed the error */
 
-	if (locale_provider == COLLPROVIDER_BUILTIN)
+	if (locale_provider == COLLPROVIDER_BUILTIN &&
+		strcmp(datlocale, "C") != 0)
 	{
-		if ((strcmp(datlocale, "C.UTF-8") == 0 ||
-			 strcmp(datlocale, "PG_UNICODE_FAST") == 0) &&
-			encodingid != PG_UTF8)
+		if (encodingid != PG_UTF8)
 			pg_fatal("builtin provider locale \"%s\" requires encoding \"%s\"",
 					 datlocale, "UTF-8");
 	}
@@ -3402,7 +3430,6 @@ main(int argc, char *argv[])
 		}
 	}
 
-
 	/*
 	 * Non-option argument specifies data directory as long as it wasn't
 	 * already specified with -D / --pgdata
@@ -3421,6 +3448,22 @@ main(int argc, char *argv[])
 		exit(1);
 	}
 
+	if (locale_provider == '\0')
+	{
+		char	   *provider_name = getenv("INITDB_LOCALE_PROVIDER");
+
+		if (!provider_name)
+			locale_provider = DEFAULT_LOCALE_PROVIDER;
+		else if (strcmp(provider_name, "builtin") == 0)
+			locale_provider = COLLPROVIDER_BUILTIN;
+		else if (strcmp(provider_name, "icu") == 0)
+			locale_provider = COLLPROVIDER_ICU;
+		else if (strcmp(provider_name, "libc") == 0)
+			locale_provider = COLLPROVIDER_LIBC;
+		else
+			pg_fatal("unrecognized locale provider: %s", provider_name);
+	}
+
 	if (builtin_locale_specified && locale_provider != COLLPROVIDER_BUILTIN)
 		pg_fatal("%s cannot be specified unless locale provider \"%s\" is chosen",
 				 "--builtin-locale", "builtin");
diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl
index b7ef7ed8d06..ba3211a4aa6 100644
--- a/src/bin/initdb/t/001_initdb.pl
+++ b/src/bin/initdb/t/001_initdb.pl
@@ -113,14 +113,13 @@ SKIP:
 
 if ($ENV{with_icu} eq 'yes')
 {
-	command_fails_like(
+	command_ok(
 		[
 			'initdb', '--no-sync',
 			'--locale-provider' => 'icu',
 			"$tempdir/data2"
 		],
-		qr/initdb: error: locale must be specified if provider is icu/,
-		'locale provider ICU requires --icu-locale');
+		'locale provider ICU default locale');
 
 	command_ok(
 		[
@@ -200,13 +199,15 @@ else
 		'locale provider ICU fails since no ICU support');
 }
 
-command_fails(
+command_like(
 	[
 		'initdb', '--no-sync',
+		'--auth' => 'trust',
 		'--locale-provider' => 'builtin',
 		"$tempdir/data6"
 	],
-	'locale provider builtin fails without --locale');
+	qr/^\s+default collation:\s+C.UTF-8\n/ms,
+	'locale provider builtin defaults to C.UTF-8');
 
 command_ok(
 	[
diff --git a/src/bin/scripts/t/020_createdb.pl b/src/bin/scripts/t/020_createdb.pl
index a8293390ede..6003d213e89 100644
--- a/src/bin/scripts/t/020_createdb.pl
+++ b/src/bin/scripts/t/020_createdb.pl
@@ -16,6 +16,9 @@ my $node = PostgreSQL::Test::Cluster->new('main');
 $node->init;
 $node->start;
 
+my $datlocprovider = $node->safe_psql('postgres',
+	"SELECT datlocprovider FROM pg_database WHERE datname='template1'");
+
 $node->issues_sql_like(
 	[ 'createdb', 'foobar1' ],
 	qr/statement: CREATE DATABASE foobar1/,
@@ -33,19 +36,6 @@ $node->issues_sql_like(
 
 if ($ENV{with_icu} eq 'yes')
 {
-	# This fails because template0 uses libc provider and has no ICU
-	# locale set.  It would succeed if template0 used the icu
-	# provider.  XXX Maybe split into multiple tests?
-	$node->command_fails(
-		[
-			'createdb',
-			'--template' => 'template0',
-			'--encoding' => 'UTF8',
-			'--locale-provider' => 'icu',
-			'foobar4',
-		],
-		'create database with ICU fails without ICU locale specified');
-
 	$node->issues_sql_like(
 		[
 			'createdb',
@@ -130,14 +120,18 @@ else
 		'create database with ICU fails since no ICU support');
 }
 
-$node->command_fails(
-	[
-		'createdb',
-		'--template' => 'template0',
-		'--locale-provider' => 'builtin',
-		'tbuiltin1',
-	],
-	'create database with provider "builtin" fails without --locale');
+if ($datlocprovider eq 'c')
+{
+	$node->command_fails(
+		[
+			'createdb',
+			'--template' => 'template0',
+			'--encoding' => 'UTF8',
+			'--locale-provider' => 'builtin',
+			'foobar4',
+		],
+		'create database with builtin provider fails without locale specified');
+}
 
 $node->command_ok(
 	[
@@ -219,15 +213,30 @@ $node->command_fails(
 	],
 	'create database with provider "builtin" and ICU_RULES=""');
 
-$node->command_fails(
-	[
-		'createdb',
-		'--template' => 'template1',
-		'--locale-provider' => 'builtin',
-		'--locale' => 'C',
-		'tbuiltin9',
-	],
-	'create database with provider "builtin" not matching template');
+if ($datlocprovider eq 'b')
+{
+	$node->command_fails(
+		[
+			'createdb',
+			'--template' => 'template1',
+			'--locale-provider' => 'libc',
+			'--locale' => 'C',
+			'tbuiltin9',
+		],
+		'create database with provider "libc" not matching template');
+}
+else
+{
+	$node->command_fails(
+		[
+			'createdb',
+			'--template' => 'template1',
+			'--locale-provider' => 'builtin',
+			'--locale' => 'C',
+			'tbuiltin9',
+		],
+		'create database with provider "builtin" not matching template');
+}
 
 $node->command_fails([ 'createdb', 'foobar1' ],
 	'fails if database already exists');
diff --git a/src/test/modules/test_escape/t/001_test_escape.pl b/src/test/modules/test_escape/t/001_test_escape.pl
index 0d5aec3ed74..b29f093db28 100644
--- a/src/test/modules/test_escape/t/001_test_escape.pl
+++ b/src/test/modules/test_escape/t/001_test_escape.pl
@@ -12,7 +12,7 @@ $node->init();
 $node->start();
 
 $node->safe_psql('postgres',
-	q(CREATE DATABASE db_sql_ascii ENCODING "sql_ascii" TEMPLATE template0;));
+	q(CREATE DATABASE db_sql_ascii LOCALE_PROVIDER "builtin" LOCALE "C" ENCODING "sql_ascii" TEMPLATE template0;));
 
 my $cmd =
   [ 'test_escape', '--conninfo', $node->connstr . " dbname=db_sql_ascii" ];
-- 
2.43.0

Reply via email to