On Wed, Jun 8, 2022 at 12:23 PM Peter Geoghegan <p...@bowt.ie> wrote:
> ISTM that there are two mostly-distinct questions here:
>
> 1. How do we link to multiple versions of ICU at the same time, in a
> way that is going to work smoothly on mainstream platforms?
>
> 2. What semantics around collations do we want for Postgres once we
> gain the ability to use multiple versions of ICU at the same time? For
> example, do we want to generalize the definition of a collation, so
> that it's associated with one particular ICU version and collation for
> the purposes of on-disk compatibility, but isn't necessarily tied to
> the same ICU version in other contexts, such as on a dump and restore?

Yeah.  Well, I couldn't resist doing some (very!) experimental hacking.
See attached.  The idea of putting a raw library name in the collation's
locale string is just a straw-man, and I already found a major problem
with it: I also need to get my hands on u_strToLower and friends for
formatting.c, but those functions are in a different library that needs
to be dlopen'd separately, so we need *two* names.  That's not done in
the attached patch, but at least this demonstrates some of the mechanics
of a dlopen()-based solution that can do the collating part.  Of course,
there are all kinds of problems apparent (security of loading arbitrary
libraries, API stability, interaction with the "default" ICU that our
binary is linked against, creation of the initial set of collations in
initdb, naming, upgrades, ...).

Demo:

$ sudo apt-get install libicu63 libicu67

postgres=# create schema icu63;
CREATE SCHEMA
postgres=# create schema icu67;
CREATE SCHEMA
postgres=# create collation icu63."en-US-x-icu" (provider = icu,
locale = 'libicui18n.so.63:en-US');
CREATE COLLATION
postgres=# create collation icu67."en-US-x-icu" (provider = icu,
locale = 'libicui18n.so.67:en-US');
CREATE COLLATION
postgres=# select collname, collnamespace::regnamespace,
colliculocale, collversion
             from pg_collation
            where collname = 'en-US-x-icu';
  collname   | collnamespace |     colliculocale      | collversion
-------------+---------------+------------------------+-------------
 en-US-x-icu | pg_catalog    | en-US                  | 153.14
 en-US-x-icu | icu63         | libicui18n.so.63:en-US | 153.88
 en-US-x-icu | icu67         | libicui18n.so.67:en-US | 153.14
(3 rows)
postgres=# select relname from pg_class order by relname collate
icu63."en-US-x-icu" limit 2;
          relname
---------------------------
 _pg_foreign_data_wrappers
 _pg_foreign_servers
(2 rows)
From 5622f25172881e021d0f436add8a785f9e3445e5 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Wed, 8 Jun 2022 17:43:53 +1200
Subject: [PATCH] WIP: allow multiple ICU libraries

XXX This is highly experimental code
---
 src/backend/access/hash/hashfunc.c |  16 +--
 src/backend/utils/adt/pg_locale.c  | 209 +++++++++++++++++++++++++++--
 src/backend/utils/adt/varchar.c    |  16 +--
 src/backend/utils/adt/varlena.c    |  47 +++----
 src/include/utils/pg_locale.h      |  47 +++++++
 5 files changed, 284 insertions(+), 51 deletions(-)

diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c
index b57ed946c4..c1847149de 100644
--- a/src/backend/access/hash/hashfunc.c
+++ b/src/backend/access/hash/hashfunc.c
@@ -298,11 +298,11 @@ hashtext(PG_FUNCTION_ARGS)
 
 			ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key));
 
-			bsize = ucol_getSortKey(mylocale->info.icu.ucol,
-									uchar, ulen, NULL, 0);
+			bsize = mylocale->info.icu.funcs->getSortKey(mylocale->info.icu.ucol,
+														 uchar, ulen, NULL, 0);
 			buf = palloc(bsize);
-			ucol_getSortKey(mylocale->info.icu.ucol,
-							uchar, ulen, buf, bsize);
+			mylocale->info.icu.funcs->getSortKey(mylocale->info.icu.ucol,
+												 uchar, ulen, buf, bsize);
 
 			result = hash_any(buf, bsize);
 
@@ -355,11 +355,11 @@ hashtextextended(PG_FUNCTION_ARGS)
 
 			ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key));
 
-			bsize = ucol_getSortKey(mylocale->info.icu.ucol,
-									uchar, ulen, NULL, 0);
+			bsize = mylocale->info.icu.funcs->getSortKey(mylocale->info.icu.ucol,
+														 uchar, ulen, NULL, 0);
 			buf = palloc(bsize);
-			ucol_getSortKey(mylocale->info.icu.ucol,
-							uchar, ulen, buf, bsize);
+			mylocale->info.icu.funcs->getSortKey(mylocale->info.icu.ucol,
+												 uchar, ulen, buf, bsize);
 
 			result = hash_any_extended(buf, bsize, PG_GETARG_INT64(1));
 
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index a0490a7522..3a8951fe46 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -78,6 +78,10 @@
 #include <shlwapi.h>
 #endif
 
+#ifdef HAVE_DLOPEN
+#include <dlfcn.h>
+#endif
+
 #define		MAX_L10N_DATA		80
 
 
@@ -1435,29 +1439,204 @@ lc_ctype_is_c(Oid collation)
 	return (lookup_collation_cache(collation, true))->ctype_is_c;
 }
 
+#ifdef USE_ICU
+
 struct pg_locale_struct default_locale;
 
+/* Linked list of ICU libraries we have loaded. */
+static pg_icu_library *icu_library_list = NULL;
+
+static void
+free_icu_library(pg_icu_library *l)
+{
+	if (l->handle != NULL)		/* only set for dlopen'd libraries */
+		dlclose(l->handle);
+	if (l->name != NULL)		/* likewise only set for dlopen'd ones */
+		pfree(l->name);
+	pfree(l);					/* finally, the entry itself */
+}
+
+static void *
+get_icu_function(void *handle, const char *function, int version)
+{
+	char		symbol[80];		/* holds e.g. "ucol_open_67" */
+
+	/* Build "<function>_<version>" and look it up; NULL if not found. */
+	snprintf(symbol, sizeof(symbol), "%s_%d", function, version);
+	return dlsym(handle, symbol);
+}
+
+static int
+get_icu_library_version(const char *name, void *handle)
+{
+	int			major = 54;		/* oldest major version we probe for */
+
+	/*
+	 * Probe upwards for a versioned "ucol_open", stopping at the version we
+	 * were compiled against: newer libraries may have incompatible APIs.
+	 */
+	while (major <= U_ICU_VERSION_MAJOR_NUM)
+	{
+		if (get_icu_function(handle, "ucol_open", major) != NULL)
+			return major;
+		major++;
+	}
+	return -1;					/* too new for us to dare use, or not ICU */
+}
+
+/*
+ * Given a library name, return the object we need to call its functions.
+ * NULL means the ICU library we are linked with.  Entries are cached in a
+ * list; an error is raised if a named library can't be loaded or used.
+ */
+static pg_icu_library *
+get_icu_library(const char *name)
+{
+	pg_icu_library *l;
+
+	/* Try to find it in our list of existing libraries. */
+	for (l = icu_library_list; l; l = l->next)
+	{
+		if (name == NULL && l->name == NULL)
+			return l;
+		if (name != NULL && l->name != NULL && strcmp(name, l->name) == 0)
+			return l;
+	}
+
+	/* Make a new entry. */
+	l = MemoryContextAllocZero(TopMemoryContext, sizeof(*l));
+	if (name != NULL)
+	{
+#ifdef HAVE_DLOPEN
+		void	   *h;
+		int			major;
+
+		l->name = MemoryContextStrdup(TopMemoryContext, name);
+		l->handle = h = dlopen(name, RTLD_NOW | RTLD_GLOBAL);
+		if (h == NULL)
+		{
+			/* dlopen() reports failures via dlerror(), not errno. */
+			const char *load_error = dlerror();
+
+			free_icu_library(l);
+			ereport(ERROR,
+					(errmsg("could not load library \"%s\": %s",
+							name, load_error)));
+		}
+
+		/* Figure out which major version this is. */
+		major = get_icu_library_version(name, h);
+		if (major < 0)
+		{
+			free_icu_library(l);
+			ereport(ERROR,
+					(errmsg("could not find compatible ICU version in library \"%s\"",
+							name)));
+		}
+		l->version = major;
+
+		/* Look up all the functions we need. */
+		l->open = get_icu_function(h, "ucol_open", major);
+		l->close = get_icu_function(h, "ucol_close", major);
+		l->getVersion = get_icu_function(h, "ucol_getVersion", major);
+		l->versionToString = get_icu_function(h, "u_versionToString", major);
+		l->strcoll = get_icu_function(h, "ucol_strcoll", major);
+		l->strcollUTF8 = get_icu_function(h, "ucol_strcollUTF8", major);
+		l->getSortKey = get_icu_function(h, "ucol_getSortKey", major);
+		l->nextSortKeyPart = get_icu_function(h, "ucol_nextSortKeyPart", major);
+		l->errorName = get_icu_function(h, "u_errorName", major);
+		if (!l->open || !l->close || !l->getVersion ||
+			!l->versionToString || !l->strcoll || !l->strcollUTF8 ||
+			!l->getSortKey || !l->nextSortKeyPart || !l->errorName)
+		{
+			free_icu_library(l);
+			ereport(ERROR,
+					(errmsg("could not find expected symbols in library \"%s\"",
+							name)));
+		}
+#else
+		pfree(l);
+		ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("loading extra ICU libraries is not supported in this build")));
+#endif
+	}
+	else
+	{
+		/*
+		 * Use the library we were compiled and linked with.  Its version must
+		 * be recorded too, because callers test l->version.
+		 */
+		l->version = U_ICU_VERSION_MAJOR_NUM;
+		l->open = ucol_open;
+		l->close = ucol_close;
+		l->getVersion = ucol_getVersion;
+		l->versionToString = u_versionToString;
+		l->strcoll = ucol_strcoll;
+		l->strcollUTF8 = ucol_strcollUTF8;
+		l->getSortKey = ucol_getSortKey;
+		l->nextSortKeyPart = ucol_nextSortKeyPart;
+		l->errorName = u_errorName;
+	}
+	l->next = icu_library_list;
+	icu_library_list = l;
+
+	return l;
+}
+
+/*
+ * Look up the library for a "[library:]locale" string; *rest gets the locale.
+ */
+static pg_icu_library *
+get_icu_library_for_collation(const char *collcollate, const char **rest)
+{
+	char		prefix[MAXPGPATH];
+	const char *separator = strchr(collcollate, ':');
+	size_t		prefix_len;
+
+	/* If it's a traditional value without a prefix, use default library. */
+	if (separator == NULL)
+	{
+		*rest = collcollate;
+		return get_icu_library(NULL);
+	}
+
+	/* The prefix comes from the user, so make sure it fits our buffer. */
+	prefix_len = separator - collcollate;
+	if (prefix_len >= sizeof(prefix))
+		ereport(ERROR, (errmsg("ICU library name is too long")));
+	memcpy(prefix, collcollate, prefix_len);
+	prefix[prefix_len] = '\0';
+	*rest = separator + 1;
+	return get_icu_library(prefix);
+}
+
+#endif
+
 void
 make_icu_collator(const char *iculocstr,
 				  struct pg_locale_struct *resultp)
 {
 #ifdef USE_ICU
+	pg_icu_library *l;
 	UCollator  *collator;
 	UErrorCode	status;
 
+	l = get_icu_library_for_collation(iculocstr, &iculocstr);
 	status = U_ZERO_ERROR;
-	collator = ucol_open(iculocstr, &status);
+	collator = l->open(iculocstr, &status);
 	if (U_FAILURE(status))
 		ereport(ERROR,
 				(errmsg("could not open collator for locale \"%s\": %s",
-						iculocstr, u_errorName(status))));
+						iculocstr, l->errorName(status))));
 
-	if (U_ICU_VERSION_MAJOR_NUM < 54)
+	if (l->version < 54)
 		icu_set_collation_attributes(collator, iculocstr);
 
 	/* We will leak this string if the caller errors later :-( */
 	resultp->info.icu.locale = MemoryContextStrdup(TopMemoryContext, iculocstr);
 	resultp->info.icu.ucol = collator;
+	resultp->info.icu.funcs = l;
 #else							/* not USE_ICU */
 	/* could get here if a collation was created by a build with ICU */
 	ereport(ERROR,
@@ -1688,21 +1867,23 @@ get_collation_actual_version(char collprovider, const char *collcollate)
 #ifdef USE_ICU
 	if (collprovider == COLLPROVIDER_ICU)
 	{
+		pg_icu_library *l;
 		UCollator  *collator;
 		UErrorCode	status;
 		UVersionInfo versioninfo;
 		char		buf[U_MAX_VERSION_STRING_LENGTH];
 
+		l = get_icu_library_for_collation(collcollate, &collcollate);
 		status = U_ZERO_ERROR;
-		collator = ucol_open(collcollate, &status);
+		collator = l->open(collcollate, &status);
 		if (U_FAILURE(status))
 			ereport(ERROR,
 					(errmsg("could not open collator for locale \"%s\": %s",
-							collcollate, u_errorName(status))));
-		ucol_getVersion(collator, versioninfo);
-		ucol_close(collator);
+							collcollate, l->errorName(status))));
+		l->getVersion(collator, versioninfo);
+		l->close(collator);
 
-		u_versionToString(versioninfo, buf);
+		l->versionToString(versioninfo, buf);
 		collversion = pstrdup(buf);
 	}
 	else
@@ -1770,6 +1951,8 @@ get_collation_actual_version(char collprovider, const char *collcollate)
 
 
 #ifdef USE_ICU
+
+
 /*
  * Converter object for converting between ICU's UChar strings and C strings
  * in database encoding.  Since the database encoding doesn't change, we only
@@ -1991,19 +2174,21 @@ void
 check_icu_locale(const char *icu_locale)
 {
 #ifdef USE_ICU
+	pg_icu_library *l;
 	UCollator  *collator;
 	UErrorCode	status;
 
+	l = get_icu_library_for_collation(icu_locale, &icu_locale);
 	status = U_ZERO_ERROR;
-	collator = ucol_open(icu_locale, &status);
+	collator = l->open(icu_locale, &status);
 	if (U_FAILURE(status))
 		ereport(ERROR,
 				(errmsg("could not open collator for locale \"%s\": %s",
-						icu_locale, u_errorName(status))));
+						icu_locale, l->errorName(status))));
 
-	if (U_ICU_VERSION_MAJOR_NUM < 54)
+	if (l->version < 54)
 		icu_set_collation_attributes(collator, icu_locale);
-	ucol_close(collator);
+	l->close(collator);
 #else
 	ereport(ERROR,
 			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
diff --git a/src/backend/utils/adt/varchar.c b/src/backend/utils/adt/varchar.c
index bbeb0a2653..4815275ff5 100644
--- a/src/backend/utils/adt/varchar.c
+++ b/src/backend/utils/adt/varchar.c
@@ -1025,11 +1025,11 @@ hashbpchar(PG_FUNCTION_ARGS)
 
 			ulen = icu_to_uchar(&uchar, keydata, keylen);
 
-			bsize = ucol_getSortKey(mylocale->info.icu.ucol,
-									uchar, ulen, NULL, 0);
+			bsize = mylocale->info.icu.funcs->getSortKey(mylocale->info.icu.ucol,
+														 uchar, ulen, NULL, 0);
 			buf = palloc(bsize);
-			ucol_getSortKey(mylocale->info.icu.ucol,
-							uchar, ulen, buf, bsize);
+			mylocale->info.icu.funcs->getSortKey(mylocale->info.icu.ucol,
+												 uchar, ulen, buf, bsize);
 
 			result = hash_any(buf, bsize);
 
@@ -1086,11 +1086,11 @@ hashbpcharextended(PG_FUNCTION_ARGS)
 
 			ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key));
 
-			bsize = ucol_getSortKey(mylocale->info.icu.ucol,
-									uchar, ulen, NULL, 0);
+			bsize = mylocale->info.icu.funcs->getSortKey(mylocale->info.icu.ucol,
+														 uchar, ulen, NULL, 0);
 			buf = palloc(bsize);
-			ucol_getSortKey(mylocale->info.icu.ucol,
-							uchar, ulen, buf, bsize);
+			mylocale->info.icu.funcs->getSortKey(mylocale->info.icu.ucol,
+												 uchar, ulen, buf, bsize);
 
 			result = hash_any_extended(buf, bsize, PG_GETARG_INT64(1));
 
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 919138eaf3..f933ec0de0 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -1666,10 +1666,10 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
 					UErrorCode	status;
 
 					status = U_ZERO_ERROR;
-					result = ucol_strcollUTF8(mylocale->info.icu.ucol,
-											  arg1, len1,
-											  arg2, len2,
-											  &status);
+					result = mylocale->info.icu.funcs->strcollUTF8(mylocale->info.icu.ucol,
+																   arg1, len1,
+																   arg2, len2,
+																   &status);
 					if (U_FAILURE(status))
 						ereport(ERROR,
 								(errmsg("collation failed: %s", u_errorName(status))));
@@ -1685,9 +1685,9 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
 					ulen1 = icu_to_uchar(&uchar1, arg1, len1);
 					ulen2 = icu_to_uchar(&uchar2, arg2, len2);
 
-					result = ucol_strcoll(mylocale->info.icu.ucol,
-										  uchar1, ulen1,
-										  uchar2, ulen2);
+					result = mylocale->info.icu.funcs->strcoll(mylocale->info.icu.ucol,
+															   uchar1, ulen1,
+															   uchar2, ulen2);
 
 					pfree(uchar1);
 					pfree(uchar2);
@@ -2389,10 +2389,10 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
 				UErrorCode	status;
 
 				status = U_ZERO_ERROR;
-				result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
-										  a1p, len1,
-										  a2p, len2,
-										  &status);
+				result = sss->locale->info.icu.funcs->strcollUTF8(sss->locale->info.icu.ucol,
+																  a1p, len1,
+																  a2p, len2,
+																  &status);
 				if (U_FAILURE(status))
 					ereport(ERROR,
 							(errmsg("collation failed: %s", u_errorName(status))));
@@ -2408,9 +2408,9 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
 				ulen1 = icu_to_uchar(&uchar1, a1p, len1);
 				ulen2 = icu_to_uchar(&uchar2, a2p, len2);
 
-				result = ucol_strcoll(sss->locale->info.icu.ucol,
-									  uchar1, ulen1,
-									  uchar2, ulen2);
+				result = sss->locale->info.icu.funcs->strcoll(sss->locale->info.icu.ucol,
+															  uchar1, ulen1,
+															  uchar2, ulen2);
 
 				pfree(uchar1);
 				pfree(uchar2);
@@ -2574,21 +2574,22 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
 					uiter_setUTF8(&iter, sss->buf1, len);
 					state[0] = state[1] = 0;	/* won't need that again */
 					status = U_ZERO_ERROR;
-					bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
-												 &iter,
-												 state,
-												 (uint8_t *) sss->buf2,
-												 Min(sizeof(Datum), sss->buflen2),
-												 &status);
+					bsize =
+						sss->locale->info.icu.funcs->nextSortKeyPart(sss->locale->info.icu.ucol,
+																	 &iter,
+																	 state,
+																	 (uint8_t *) sss->buf2,
+																	 Min(sizeof(Datum), sss->buflen2),
+																	 &status);
 					if (U_FAILURE(status))
 						ereport(ERROR,
 								(errmsg("sort key generation failed: %s",
 										u_errorName(status))));
 				}
 				else
-					bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
-											uchar, ulen,
-											(uint8_t *) sss->buf2, sss->buflen2);
+					bsize = sss->locale->info.icu.funcs->getSortKey(sss->locale->info.icu.ucol,
+																	uchar, ulen,
+																	(uint8_t *) sss->buf2, sss->buflen2);
 			}
 			else
 #endif
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index e7385faef8..da533fcb91 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -74,6 +74,52 @@ extern struct lconv *PGLC_localeconv(void);
 
 extern void cache_locale_time(void);
 
+#ifdef USE_ICU
+
+/*
+ * An ICU library that we're either linked against or have loaded at runtime.
+ * The function pointers are the library's ucol_* / u_* entry points that we
+ * call (open is ucol_open, errorName is u_errorName, and so on).
+ */
+typedef struct pg_icu_library
+{
+	char	   *name;			/* if loaded with dlopen() */
+	void	   *handle;			/* if loaded with dlopen() */
+	int			version;		/* major version of ICU */
+	UCollator *(*open)(const char *loc, UErrorCode *status);
+	void (*close)(UCollator *coll);
+	void (*getVersion)(const UCollator *coll, UVersionInfo info);
+	void (*versionToString)(const UVersionInfo versionArray,
+							char *versionString);
+	UCollationResult (*strcoll)(const UCollator *coll,
+								const UChar *source,
+								int32_t sourceLength,
+								const UChar *target,
+								int32_t targetLength);
+	UCollationResult (*strcollUTF8)(const UCollator *coll,
+									const char *source,
+									int32_t sourceLength,
+									const char *target,
+									int32_t targetLength,
+									UErrorCode *status);
+	int32_t (*getSortKey)(const UCollator *coll,
+						  const UChar *source,
+						  int32_t sourceLength,
+						  uint8_t *result,
+						  int32_t resultLength);
+	int32_t (*nextSortKeyPart)(const UCollator *coll,
+							   UCharIterator *iter,
+							   uint32_t state[2],
+							   uint8_t *dest,
+							   int32_t count,
+							   UErrorCode *status);
+	const char *(*errorName)(UErrorCode code);
+
+	/* XXX unfinished sketch left here: UVersionInfo (*versioninfo) */
+	struct pg_icu_library *next;	/* next entry in the list of libraries */
+} pg_icu_library;
+
+#endif
 
 /*
  * We define our own wrapper around locale_t so we can keep the same
@@ -95,6 +141,7 @@ struct pg_locale_struct
 		{
 			const char *locale;
 			UCollator  *ucol;
+			pg_icu_library *funcs;
 		}			icu;
 #endif
 		int			dummy;		/* in case we have neither LOCALE_T nor ICU */
-- 
2.30.2

Reply via email to