We currently have

    static PG_Locale_Strategy pg_regex_strategy;
    static pg_locale_t pg_regex_locale;
    static Oid  pg_regex_collation;

but after the recent improvements to pg_locale_t handling, we don't need all three anymore. All the information we need is contained in pg_locale_t, so we just need to keep that one. This allows us to structure the locale-using regular expression code more like other locale-using code, mainly by provider, avoiding another layer that is specific to the regular expression code. The first patch implements that.

The second patch removes a call to pg_set_regex_collation() that I think is unnecessary.

The third patch adds a pg_unset_regex_collation() call that undoes what pg_set_regex_collation() does. I mainly used this to verify the second patch, but it may also be useful on its own; I'm not sure.

(I don't have any plans to get rid of the remaining global variable. That would certainly be nice from an intellectual point of view, but fiddling this into the regular expression code looks quite messy. In any case, it's probably easier with one variable instead of three, if someone wants to try.)
From 1799abec05ae3d49a7a57333acd1d377e26d0fe9 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <pe...@eisentraut.org>
Date: Tue, 15 Oct 2024 08:01:41 +0200
Subject: [PATCH 1/3] Remove pg_regex_collation and pg_regex_strategy

We don't need three global variables to describe the locale strategy
for regular expressions.  We only need to keep pg_regex_locale.  This
works because pg_locale_t now contains all the required information
(such as a ctype_is_c field).  This allows us to structure the
locale-using regular expression code more like other locale-using
code, mainly by provider, avoiding another layer that is specific
to the regular expression code.
---
 src/backend/regex/regc_pg_locale.c | 430 +++++++++++++----------------
 1 file changed, 185 insertions(+), 245 deletions(-)

diff --git a/src/backend/regex/regc_pg_locale.c 
b/src/backend/regex/regc_pg_locale.c
index b75784b6ce5..4691e796385 100644
--- a/src/backend/regex/regc_pg_locale.c
+++ b/src/backend/regex/regc_pg_locale.c
@@ -63,18 +63,7 @@
  * NB: the coding here assumes pg_wchar is an unsigned type.
  */
 
-typedef enum
-{
-       PG_REGEX_STRATEGY_C,            /* C locale (encoding independent) */
-       PG_REGEX_STRATEGY_BUILTIN,      /* built-in Unicode semantics */
-       PG_REGEX_STRATEGY_LIBC_WIDE,    /* Use locale_t <wctype.h> functions */
-       PG_REGEX_STRATEGY_LIBC_1BYTE,   /* Use locale_t <ctype.h> functions */
-       PG_REGEX_STRATEGY_ICU,          /* Use ICU uchar.h functions */
-} PG_Locale_Strategy;
-
-static PG_Locale_Strategy pg_regex_strategy;
 static pg_locale_t pg_regex_locale;
-static Oid     pg_regex_collation;
 
 /*
  * Hard-wired character properties for C locale
@@ -232,7 +221,6 @@ void
 pg_set_regex_collation(Oid collation)
 {
        pg_locale_t locale = 0;
-       PG_Locale_Strategy strategy;
 
        if (!OidIsValid(collation))
        {
@@ -253,8 +241,9 @@ pg_set_regex_collation(Oid collation)
                 * catalog access is available, so we can't call
                 * pg_newlocale_from_collation().
                 */
-               strategy = PG_REGEX_STRATEGY_C;
-               collation = C_COLLATION_OID;
+               static struct pg_locale_struct dummy_locale = {.ctype_is_c = 
true};
+
+               locale = &dummy_locale;
        }
        else
        {
@@ -264,121 +253,80 @@ pg_set_regex_collation(Oid collation)
                        ereport(ERROR,
                                        (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                                         errmsg("nondeterministic collations 
are not supported for regular expressions")));
-
-               if (locale->ctype_is_c)
-               {
-                       /*
-                        * C/POSIX collations use this path regardless of 
database
-                        * encoding
-                        */
-                       strategy = PG_REGEX_STRATEGY_C;
-                       locale = 0;
-                       collation = C_COLLATION_OID;
-               }
-               else if (locale->provider == COLLPROVIDER_BUILTIN)
-               {
-                       Assert(GetDatabaseEncoding() == PG_UTF8);
-                       strategy = PG_REGEX_STRATEGY_BUILTIN;
-               }
-#ifdef USE_ICU
-               else if (locale->provider == COLLPROVIDER_ICU)
-               {
-                       strategy = PG_REGEX_STRATEGY_ICU;
-               }
-#endif
-               else
-               {
-                       Assert(locale->provider == COLLPROVIDER_LIBC);
-                       if (GetDatabaseEncoding() == PG_UTF8)
-                               strategy = PG_REGEX_STRATEGY_LIBC_WIDE;
-                       else
-                               strategy = PG_REGEX_STRATEGY_LIBC_1BYTE;
-               }
        }
 
-       pg_regex_strategy = strategy;
        pg_regex_locale = locale;
-       pg_regex_collation = collation;
 }
 
 static int
 pg_wc_isdigit(pg_wchar c)
 {
-       switch (pg_regex_strategy)
+       if (pg_regex_locale->ctype_is_c)
+               return (c <= (pg_wchar) 127 && (pg_char_properties[c] & 
PG_ISDIGIT));
+       else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+               return pg_u_isdigit(c, true);
+       else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
        {
-               case PG_REGEX_STRATEGY_C:
-                       return (c <= (pg_wchar) 127 &&
-                                       (pg_char_properties[c] & PG_ISDIGIT));
-               case PG_REGEX_STRATEGY_BUILTIN:
-                       return pg_u_isdigit(c, true);
-               case PG_REGEX_STRATEGY_LIBC_WIDE:
-                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-                               return iswdigit_l((wint_t) c, 
pg_regex_locale->info.lt);
-                       /* FALL THRU */
-               case PG_REGEX_STRATEGY_LIBC_1BYTE:
+               if (GetDatabaseEncoding() == PG_UTF8 &&
+                       (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+                       return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
+               else
                        return (c <= (pg_wchar) UCHAR_MAX &&
                                        isdigit_l((unsigned char) c, 
pg_regex_locale->info.lt));
-                       break;
-               case PG_REGEX_STRATEGY_ICU:
+       }
 #ifdef USE_ICU
-                       return u_isdigit(c);
+       else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+               return u_isdigit(c);
 #endif
-                       break;
-       }
+
        return 0;                                       /* can't get here, but 
keep compiler quiet */
 }
 
 static int
 pg_wc_isalpha(pg_wchar c)
 {
-       switch (pg_regex_strategy)
+       if (pg_regex_locale->ctype_is_c)
+               return (c <= (pg_wchar) 127 && (pg_char_properties[c] & 
PG_ISALPHA));
+       else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+               return pg_u_isalpha(c);
+       else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
        {
-               case PG_REGEX_STRATEGY_C:
-                       return (c <= (pg_wchar) 127 &&
-                                       (pg_char_properties[c] & PG_ISALPHA));
-               case PG_REGEX_STRATEGY_BUILTIN:
-                       return pg_u_isalpha(c);
-               case PG_REGEX_STRATEGY_LIBC_WIDE:
-                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-                               return iswalpha_l((wint_t) c, 
pg_regex_locale->info.lt);
-                       /* FALL THRU */
-               case PG_REGEX_STRATEGY_LIBC_1BYTE:
+               if (GetDatabaseEncoding() == PG_UTF8 &&
+                       (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+                       return iswalpha_l((wint_t) c, pg_regex_locale->info.lt);
+               else
                        return (c <= (pg_wchar) UCHAR_MAX &&
                                        isalpha_l((unsigned char) c, 
pg_regex_locale->info.lt));
-                       break;
-               case PG_REGEX_STRATEGY_ICU:
+       }
 #ifdef USE_ICU
-                       return u_isalpha(c);
+       else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+               return u_isalpha(c);
 #endif
-                       break;
-       }
+
        return 0;                                       /* can't get here, but 
keep compiler quiet */
 }
 
 static int
 pg_wc_isalnum(pg_wchar c)
 {
-       switch (pg_regex_strategy)
+       if (pg_regex_locale->ctype_is_c)
+               return (c <= (pg_wchar) 127 && (pg_char_properties[c] & 
PG_ISALNUM));
+       else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+               return pg_u_isalnum(c, true);
+       else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
        {
-               case PG_REGEX_STRATEGY_C:
-                       return (c <= (pg_wchar) 127 &&
-                                       (pg_char_properties[c] & PG_ISALNUM));
-               case PG_REGEX_STRATEGY_BUILTIN:
-                       return pg_u_isalnum(c, true);
-               case PG_REGEX_STRATEGY_LIBC_WIDE:
-                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-                               return iswalnum_l((wint_t) c, 
pg_regex_locale->info.lt);
-                       /* FALL THRU */
-               case PG_REGEX_STRATEGY_LIBC_1BYTE:
+               if (GetDatabaseEncoding() == PG_UTF8 &&
+                       (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+                       return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
+               else
                        return (c <= (pg_wchar) UCHAR_MAX &&
                                        isalnum_l((unsigned char) c, 
pg_regex_locale->info.lt));
-                       break;
-               case PG_REGEX_STRATEGY_ICU:
+       }
 #ifdef USE_ICU
-                       return u_isalnum(c);
+       else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+               return u_isalnum(c);
 #endif
-                       break;
-       }
+
        return 0;                                       /* can't get here, but 
keep compiler quiet */
 }
 
@@ -394,218 +342,206 @@ pg_wc_isword(pg_wchar c)
 static int
 pg_wc_isupper(pg_wchar c)
 {
-       switch (pg_regex_strategy)
+       if (pg_regex_locale->ctype_is_c)
+               return (c <= (pg_wchar) 127 && (pg_char_properties[c] & 
PG_ISUPPER));
+       else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+               return pg_u_isupper(c);
+       else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
        {
-               case PG_REGEX_STRATEGY_C:
-                       return (c <= (pg_wchar) 127 &&
-                                       (pg_char_properties[c] & PG_ISUPPER));
-               case PG_REGEX_STRATEGY_BUILTIN:
-                       return pg_u_isupper(c);
-               case PG_REGEX_STRATEGY_LIBC_WIDE:
-                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-                               return iswupper_l((wint_t) c, 
pg_regex_locale->info.lt);
-                       /* FALL THRU */
-               case PG_REGEX_STRATEGY_LIBC_1BYTE:
+               if (GetDatabaseEncoding() == PG_UTF8 &&
+                       (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+                       return iswupper_l((wint_t) c, pg_regex_locale->info.lt);
+               else
                        return (c <= (pg_wchar) UCHAR_MAX &&
                                        isupper_l((unsigned char) c, 
pg_regex_locale->info.lt));
-                       break;
-               case PG_REGEX_STRATEGY_ICU:
+       }
 #ifdef USE_ICU
-                       return u_isupper(c);
+       else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+               return u_isupper(c);
 #endif
-                       break;
-       }
+
        return 0;                                       /* can't get here, but 
keep compiler quiet */
 }
 
 static int
 pg_wc_islower(pg_wchar c)
 {
-       switch (pg_regex_strategy)
+       if (pg_regex_locale->ctype_is_c)
+               return (c <= (pg_wchar) 127 && (pg_char_properties[c] & 
PG_ISLOWER));
+       else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+               return pg_u_islower(c);
+       else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
        {
-               case PG_REGEX_STRATEGY_C:
-                       return (c <= (pg_wchar) 127 &&
-                                       (pg_char_properties[c] & PG_ISLOWER));
-               case PG_REGEX_STRATEGY_BUILTIN:
-                       return pg_u_islower(c);
-               case PG_REGEX_STRATEGY_LIBC_WIDE:
-                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-                               return iswlower_l((wint_t) c, 
pg_regex_locale->info.lt);
-                       /* FALL THRU */
-               case PG_REGEX_STRATEGY_LIBC_1BYTE:
+               if (GetDatabaseEncoding() == PG_UTF8 &&
+                       (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+                       return iswlower_l((wint_t) c, pg_regex_locale->info.lt);
+               else
                        return (c <= (pg_wchar) UCHAR_MAX &&
                                        islower_l((unsigned char) c, 
pg_regex_locale->info.lt));
-                       break;
-               case PG_REGEX_STRATEGY_ICU:
+       }
 #ifdef USE_ICU
-                       return u_islower(c);
+       else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+               return u_islower(c);
 #endif
-                       break;
-       }
+
        return 0;                                       /* can't get here, but 
keep compiler quiet */
 }
 
 static int
 pg_wc_isgraph(pg_wchar c)
 {
-       switch (pg_regex_strategy)
+       if (pg_regex_locale->ctype_is_c)
+               return (c <= (pg_wchar) 127 && (pg_char_properties[c] & 
PG_ISGRAPH));
+       else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+               return pg_u_isgraph(c);
+       else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
        {
-               case PG_REGEX_STRATEGY_C:
-                       return (c <= (pg_wchar) 127 &&
-                                       (pg_char_properties[c] & PG_ISGRAPH));
-               case PG_REGEX_STRATEGY_BUILTIN:
-                       return pg_u_isgraph(c);
-               case PG_REGEX_STRATEGY_LIBC_WIDE:
-                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-                               return iswgraph_l((wint_t) c, 
pg_regex_locale->info.lt);
-                       /* FALL THRU */
-               case PG_REGEX_STRATEGY_LIBC_1BYTE:
+               if (GetDatabaseEncoding() == PG_UTF8 &&
+                       (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+                       return iswgraph_l((wint_t) c, pg_regex_locale->info.lt);
+               else
                        return (c <= (pg_wchar) UCHAR_MAX &&
                                        isgraph_l((unsigned char) c, 
pg_regex_locale->info.lt));
-                       break;
-               case PG_REGEX_STRATEGY_ICU:
+       }
 #ifdef USE_ICU
-                       return u_isgraph(c);
+       else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+               return u_isgraph(c);
 #endif
-                       break;
-       }
+
        return 0;                                       /* can't get here, but 
keep compiler quiet */
 }
 
 static int
 pg_wc_isprint(pg_wchar c)
 {
-       switch (pg_regex_strategy)
+       if (pg_regex_locale->ctype_is_c)
+               return (c <= (pg_wchar) 127 && (pg_char_properties[c] & 
PG_ISPRINT));
+       else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+               return pg_u_isprint(c);
+       else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
        {
-               case PG_REGEX_STRATEGY_C:
-                       return (c <= (pg_wchar) 127 &&
-                                       (pg_char_properties[c] & PG_ISPRINT));
-               case PG_REGEX_STRATEGY_BUILTIN:
-                       return pg_u_isprint(c);
-               case PG_REGEX_STRATEGY_LIBC_WIDE:
-                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-                               return iswprint_l((wint_t) c, 
pg_regex_locale->info.lt);
-                       /* FALL THRU */
-               case PG_REGEX_STRATEGY_LIBC_1BYTE:
+               if (GetDatabaseEncoding() == PG_UTF8 &&
+                       (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+                       return iswprint_l((wint_t) c, pg_regex_locale->info.lt);
+               else
                        return (c <= (pg_wchar) UCHAR_MAX &&
                                        isprint_l((unsigned char) c, 
pg_regex_locale->info.lt));
-                       break;
-               case PG_REGEX_STRATEGY_ICU:
+       }
 #ifdef USE_ICU
-                       return u_isprint(c);
+       else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+               return u_isprint(c);
 #endif
-                       break;
-       }
+
        return 0;                                       /* can't get here, but 
keep compiler quiet */
 }
 
 static int
 pg_wc_ispunct(pg_wchar c)
 {
-       switch (pg_regex_strategy)
+       if (pg_regex_locale->ctype_is_c)
+               return (c <= (pg_wchar) 127 && (pg_char_properties[c] & 
PG_ISPUNCT));
+       else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+               return pg_u_ispunct(c, true);
+       else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
        {
-               case PG_REGEX_STRATEGY_C:
-                       return (c <= (pg_wchar) 127 &&
-                                       (pg_char_properties[c] & PG_ISPUNCT));
-               case PG_REGEX_STRATEGY_BUILTIN:
-                       return pg_u_ispunct(c, true);
-               case PG_REGEX_STRATEGY_LIBC_WIDE:
-                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-                               return iswpunct_l((wint_t) c, 
pg_regex_locale->info.lt);
-                       /* FALL THRU */
-               case PG_REGEX_STRATEGY_LIBC_1BYTE:
+               if (GetDatabaseEncoding() == PG_UTF8 &&
+                       (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+                       return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
+               else
                        return (c <= (pg_wchar) UCHAR_MAX &&
                                        ispunct_l((unsigned char) c, 
pg_regex_locale->info.lt));
-                       break;
-               case PG_REGEX_STRATEGY_ICU:
+       }
 #ifdef USE_ICU
-                       return u_ispunct(c);
+       else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+               return u_ispunct(c);
 #endif
-                       break;
-       }
+
        return 0;                                       /* can't get here, but 
keep compiler quiet */
 }
 
 static int
 pg_wc_isspace(pg_wchar c)
 {
-       switch (pg_regex_strategy)
+       if (pg_regex_locale->ctype_is_c)
+               return (c <= (pg_wchar) 127 && (pg_char_properties[c] & 
PG_ISSPACE));
+       else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+               return pg_u_isspace(c);
+       else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
        {
-               case PG_REGEX_STRATEGY_C:
-                       return (c <= (pg_wchar) 127 &&
-                                       (pg_char_properties[c] & PG_ISSPACE));
-               case PG_REGEX_STRATEGY_BUILTIN:
-                       return pg_u_isspace(c);
-               case PG_REGEX_STRATEGY_LIBC_WIDE:
-                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-                               return iswspace_l((wint_t) c, 
pg_regex_locale->info.lt);
-                       /* FALL THRU */
-               case PG_REGEX_STRATEGY_LIBC_1BYTE:
+               if (GetDatabaseEncoding() == PG_UTF8 &&
+                       (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+                       return iswspace_l((wint_t) c, pg_regex_locale->info.lt);
+               else
                        return (c <= (pg_wchar) UCHAR_MAX &&
                                        isspace_l((unsigned char) c, 
pg_regex_locale->info.lt));
-                       break;
-               case PG_REGEX_STRATEGY_ICU:
+       }
 #ifdef USE_ICU
-                       return u_isspace(c);
+       else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+               return u_isspace(c);
 #endif
-                       break;
-       }
+
        return 0;                                       /* can't get here, but 
keep compiler quiet */
 }
 
 static pg_wchar
 pg_wc_toupper(pg_wchar c)
 {
-       switch (pg_regex_strategy)
+       if (pg_regex_locale->ctype_is_c)
        {
-               case PG_REGEX_STRATEGY_C:
-                       if (c <= (pg_wchar) 127)
-                               return pg_ascii_toupper((unsigned char) c);
-                       return c;
-               case PG_REGEX_STRATEGY_BUILTIN:
-                       return unicode_uppercase_simple(c);
-               case PG_REGEX_STRATEGY_LIBC_WIDE:
-                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-                               return towupper_l((wint_t) c, 
pg_regex_locale->info.lt);
-                       /* FALL THRU */
-               case PG_REGEX_STRATEGY_LIBC_1BYTE:
+               if (c <= (pg_wchar) 127)
+                       return pg_ascii_toupper((unsigned char) c);
+               return c;
+       }
+       else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+               return unicode_uppercase_simple(c);
+       else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
+       {
+               if (GetDatabaseEncoding() == PG_UTF8 &&
+                       (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+                       return towupper_l((wint_t) c, pg_regex_locale->info.lt);
+               else
+               {
                        if (c <= (pg_wchar) UCHAR_MAX)
                                return toupper_l((unsigned char) c, 
pg_regex_locale->info.lt);
                        return c;
-               case PG_REGEX_STRATEGY_ICU:
+               }
+       }
 #ifdef USE_ICU
-                       return u_toupper(c);
+       else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+               return u_toupper(c);
 #endif
-                       break;
-       }
+
        return 0;                                       /* can't get here, but 
keep compiler quiet */
 }
 
 static pg_wchar
 pg_wc_tolower(pg_wchar c)
 {
-       switch (pg_regex_strategy)
+       if (pg_regex_locale->ctype_is_c)
        {
-               case PG_REGEX_STRATEGY_C:
-                       if (c <= (pg_wchar) 127)
-                               return pg_ascii_tolower((unsigned char) c);
-                       return c;
-               case PG_REGEX_STRATEGY_BUILTIN:
-                       return unicode_lowercase_simple(c);
-               case PG_REGEX_STRATEGY_LIBC_WIDE:
-                       if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-                               return towlower_l((wint_t) c, 
pg_regex_locale->info.lt);
-                       /* FALL THRU */
-               case PG_REGEX_STRATEGY_LIBC_1BYTE:
+               if (c <= (pg_wchar) 127)
+                       return pg_ascii_tolower((unsigned char) c);
+               return c;
+       }
+       else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+               return unicode_lowercase_simple(c);
+       else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
+       {
+               if (GetDatabaseEncoding() == PG_UTF8 &&
+                       (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+                       return towlower_l((wint_t) c, pg_regex_locale->info.lt);
+               else
+               {
                        if (c <= (pg_wchar) UCHAR_MAX)
                                return tolower_l((unsigned char) c, 
pg_regex_locale->info.lt);
                        return c;
-               case PG_REGEX_STRATEGY_ICU:
+               }
+       }
 #ifdef USE_ICU
-                       return u_tolower(c);
+       else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+               return u_tolower(c);
 #endif
-                       break;
-       }
+
        return 0;                                       /* can't get here, but 
keep compiler quiet */
 }
 
@@ -628,7 +564,7 @@ typedef int (*pg_wc_probefunc) (pg_wchar c);
 typedef struct pg_ctype_cache
 {
        pg_wc_probefunc probefunc;      /* pg_wc_isalpha or a sibling */
-       Oid                     collation;              /* collation this entry 
is for */
+       pg_locale_t locale;                     /* locale this entry is for */
        struct cvec cv;                         /* cache entry contents */
        struct pg_ctype_cache *next;    /* chain link */
 } pg_ctype_cache;
@@ -697,7 +633,7 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int 
cclasscode)
        for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
        {
                if (pcc->probefunc == probefunc &&
-                       pcc->collation == pg_regex_collation)
+                       pcc->locale == pg_regex_locale)
                        return &pcc->cv;
        }
 
@@ -708,7 +644,7 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int 
cclasscode)
        if (pcc == NULL)
                return NULL;
        pcc->probefunc = probefunc;
-       pcc->collation = pg_regex_collation;
+       pcc->locale = pg_regex_locale;
        pcc->cv.nchrs = 0;
        pcc->cv.chrspace = 128;
        pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr));
@@ -732,37 +668,41 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int 
cclasscode)
         * would always be true for production values of MAX_SIMPLE_CHR, but 
it's
         * useful to allow it to be small for testing purposes.)
         */
-       switch (pg_regex_strategy)
+       if (pg_regex_locale->ctype_is_c)
        {
-               case PG_REGEX_STRATEGY_C:
 #if MAX_SIMPLE_CHR >= 127
-                       max_chr = (pg_wchar) 127;
-                       pcc->cv.cclasscode = -1;
+               max_chr = (pg_wchar) 127;
+               pcc->cv.cclasscode = -1;
 #else
-                       max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+               max_chr = (pg_wchar) MAX_SIMPLE_CHR;
 #endif
-                       break;
-               case PG_REGEX_STRATEGY_BUILTIN:
-                       max_chr = (pg_wchar) MAX_SIMPLE_CHR;
-                       break;
-               case PG_REGEX_STRATEGY_LIBC_WIDE:
+       }
+       else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+       {
+               max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+       }
+       else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
+       {
+               if (GetDatabaseEncoding() == PG_UTF8)
                        max_chr = (pg_wchar) MAX_SIMPLE_CHR;
-                       break;
-               case PG_REGEX_STRATEGY_LIBC_1BYTE:
+               else
+               {
 #if MAX_SIMPLE_CHR >= UCHAR_MAX
                        max_chr = (pg_wchar) UCHAR_MAX;
                        pcc->cv.cclasscode = -1;
 #else
                        max_chr = (pg_wchar) MAX_SIMPLE_CHR;
 #endif
-                       break;
-               case PG_REGEX_STRATEGY_ICU:
-                       max_chr = (pg_wchar) MAX_SIMPLE_CHR;
-                       break;
-               default:
-                       Assert(false);
-                       max_chr = 0;            /* can't get here, but keep 
compiler quiet */
-                       break;
+               }
+       }
+       else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+       {
+               max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+       }
+       else
+       {
+               Assert(false);
+               max_chr = 0;                    /* can't get here, but keep 
compiler quiet */
        }
 
        /*

base-commit: 7cdfeee320e72162b62dddddee638e713c2b8680
-- 
2.47.0

From dc4b2e9b8b89feb3a687e7a8906c5e496f53706f Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <pe...@eisentraut.org>
Date: Tue, 15 Oct 2024 08:01:41 +0200
Subject: [PATCH 2/3] Remove unneeded pg_set_regex_collation() call

The call in pg_regprefix() was apparently never necessary, because
this code doesn't actually execute a regular expression; it only
examines it, which doesn't invoke any locale-using functionality.
---
 src/backend/regex/regprefix.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/backend/regex/regprefix.c b/src/backend/regex/regprefix.c
index 47a8cebe075..6ba32ac1de0 100644
--- a/src/backend/regex/regprefix.c
+++ b/src/backend/regex/regprefix.c
@@ -61,9 +61,6 @@ pg_regprefix(regex_t *re,
        if (re->re_csize != sizeof(chr))
                return REG_MIXED;
 
-       /* Initialize locale-dependent support */
-       pg_set_regex_collation(re->re_collation);
-
        /* setup */
        g = (struct guts *) re->re_guts;
        if (g->info & REG_UIMPOSSIBLE)
-- 
2.47.0

From d1ce2c40a15443891ddf9c2340cca66d5e141c92 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <pe...@eisentraut.org>
Date: Tue, 15 Oct 2024 08:01:41 +0200
Subject: [PATCH 3/3] WIP: Add pg_unset_regex_collation()

Add a function pg_unset_regex_collation() that complements
pg_set_regex_collation().  This unsets the global locale variable for
regular expression use.  This also adds assertions into both functions
to make sure they are used correctly in pairs.

This allows us to detect if pg_set_regex_collation() is not called
when it should have been.  Previously, this would go undetected, and
the locale settings lingering from a previous use would be used.
---
 src/backend/regex/regc_pg_locale.c | 10 ++++++++++
 src/backend/regex/regcomp.c        |  9 ++++++++-
 src/backend/regex/regexec.c        |  8 +++++---
 src/include/regex/regguts.h        |  1 +
 4 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/src/backend/regex/regc_pg_locale.c 
b/src/backend/regex/regc_pg_locale.c
index 4691e796385..8f38507b4bf 100644
--- a/src/backend/regex/regc_pg_locale.c
+++ b/src/backend/regex/regc_pg_locale.c
@@ -222,6 +222,8 @@ pg_set_regex_collation(Oid collation)
 {
        pg_locale_t locale = 0;
 
+       Assert(!pg_regex_locale);
+
        if (!OidIsValid(collation))
        {
                /*
@@ -258,6 +260,14 @@ pg_set_regex_collation(Oid collation)
        pg_regex_locale = locale;
 }
 
+void
+pg_unset_regex_collation(void)
+{
+       Assert(pg_regex_locale);
+       pg_regex_locale = 0;
+}
+
+
 static int
 pg_wc_isdigit(pg_wchar c)
 {
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index 8a6cfb2973d..f1442272065 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -385,7 +385,7 @@ pg_regcomp(regex_t *re,
        FILE       *debug = (FILE *) NULL;
 #endif
 
-#define  CNOERR()       { if (ISERR()) return freev(v, v->err); }
+#define  CNOERR()       { if (ISERR()) { pg_unset_regex_collation(); return 
freev(v, v->err); } }
 
        /* sanity checks */
 
@@ -433,7 +433,10 @@ pg_regcomp(regex_t *re,
        /* more complex setup, malloced things */
        re->re_guts = VS(MALLOC(sizeof(struct guts)));
        if (re->re_guts == NULL)
+       {
+               pg_unset_regex_collation();
                return freev(v, REG_ESPACE);
+       }
        g = (struct guts *) re->re_guts;
        g->tree = NULL;
        initcm(v, &g->cmap);
@@ -446,7 +449,10 @@ pg_regcomp(regex_t *re,
        /* set up a reasonably-sized transient cvec for getcvec usage */
        v->cv = newcvec(100, 20);
        if (v->cv == NULL)
+       {
+               pg_unset_regex_collation();
                return freev(v, REG_ESPACE);
+       }
 
        /* parsing */
        lexstart(v);                            /* also handles prefixes */
@@ -542,6 +548,7 @@ pg_regcomp(regex_t *re,
        }
 #endif
 
+       pg_unset_regex_collation();
        assert(v->err == 0);
        return freev(v, 0);
 }
diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c
index 2a1d5bebda3..406fd2f7a1b 100644
--- a/src/backend/regex/regexec.c
+++ b/src/backend/regex/regexec.c
@@ -212,9 +212,6 @@ pg_regexec(regex_t *re,
        if (search_start > len)
                return REG_NOMATCH;
 
-       /* Initialize locale-dependent support */
-       pg_set_regex_collation(re->re_collation);
-
        /* setup */
        v->re = re;
        v->g = (struct guts *) re->re_guts;
@@ -257,6 +254,10 @@ pg_regexec(regex_t *re,
        v->ladfas = NULL;
        v->lblastcss = NULL;
        v->lblastcp = NULL;
+
+       /* Initialize locale-dependent support */
+       pg_set_regex_collation(re->re_collation);
+
        /* below this point, "goto cleanup" will behave sanely */
 
        assert(v->g->ntree >= 0);
@@ -326,6 +327,7 @@ pg_regexec(regex_t *re,
 
        /* clean up */
 cleanup:
+       pg_unset_regex_collation();
        if (v->pmatch != pmatch && v->pmatch != mat)
                FREE(v->pmatch);
        if (v->subdfas != NULL)
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h
index 3ca3647e118..a35b85b463c 100644
--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@@ -545,4 +545,5 @@ struct guts
 
 /* prototypes for functions that are exported from regcomp.c to regexec.c */
 extern void pg_set_regex_collation(Oid collation);
+extern void pg_unset_regex_collation(void);
 extern color pg_reg_getcolor(struct colormap *cm, chr c);
-- 
2.47.0

Reply via email to