C11: should we use char32_t for unicode code points?

Jeff Davis Thu, 23 Oct 2025 11:16:20 -0700

Now that we're using C11, should we use char32_t for unicode code
points?

Right now, we use pg_wchar for two purposes:


  1. to abstract away some problems with wchar_t on platforms where
it's 16 bits; and
  2. to hold Unicode code point values

In UTF8, they are equivalent and can be freely cast back and forth,
but not necessarily in other encodings. That can be confusing in some
contexts. Attached is a patch to use char32_t for the second purpose.

Both are equivalent to uint32, so there's no functional change and no
actual typechecking; it's just for readability.

Is this helpful, or needless code churn?

Regards,
        Jeff Davis

From b5b65eb496ff0365f8cde297c5486755e65fc4b1 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Tue, 21 Oct 2025 13:16:47 -0700
Subject: [PATCH v1] Use C11 char32_t for Unicode code points.

---
 src/backend/parser/parser.c                   |  2 +-
 src/backend/utils/adt/pg_locale_builtin.c     | 44 ++++++++++-----
 src/backend/utils/adt/varlena.c               | 40 ++++++-------
 src/backend/utils/mb/mbutils.c                |  4 +-
 src/common/saslprep.c                         | 48 ++++++++--------
 src/common/unicode/case_test.c                | 23 ++++----
 src/common/unicode/category_test.c            |  3 +-
 .../unicode/generate-norm_test_table.pl       |  4 +-
 .../unicode/generate-unicode_case_table.pl    |  7 +--
 .../generate-unicode_category_table.pl        |  8 +--
 src/common/unicode/norm_test.c                |  6 +-
 src/common/unicode_case.c                     | 56 +++++++++----------
 src/common/unicode_category.c                 | 50 ++++++++---------
 src/common/unicode_norm.c                     | 56 +++++++++----------
 src/fe_utils/mbprint.c                        |  2 +-
 src/include/c.h                               |  5 ++
 src/include/common/unicode_case.h             | 10 ++--
 src/include/common/unicode_case_table.h       | 13 ++---
 src/include/common/unicode_category.h         | 46 ++++++++-------
 src/include/common/unicode_category_table.h   |  8 +--
 src/include/common/unicode_norm.h             |  6 +-
 src/include/mb/pg_wchar.h                     | 16 +++---
 src/tools/pgindent/typedefs.list              |  1 +
 23 files changed, 237 insertions(+), 221 deletions(-)

diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c
index 33a040506b4..88126626fb1 100644
--- a/src/backend/parser/parser.c
+++ b/src/backend/parser/parser.c
@@ -339,7 +339,7 @@ hexval(unsigned char c)
 
 /* is Unicode code point acceptable? */
 static void
-check_unicode_value(pg_wchar c)
+check_unicode_value(char32_t c)
 {
 	if (!is_valid_unicode_codepoint(c))
 		ereport(ERROR,
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 3dc611b50e1..1021e0d129b 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -15,7 +15,6 @@
 #include "catalog/pg_collation.h"
 #include "common/unicode_case.h"
 #include "common/unicode_category.h"
-#include "mb/pg_wchar.h"
 #include "miscadmin.h"
 #include "utils/builtins.h"
 #include "utils/pg_locale.h"
@@ -35,6 +34,23 @@ struct WordBoundaryState
 	bool		prev_alnum;
 };
 
+/*
+ * In UTF-8, pg_wchar is guaranteed to be the code point value.
+ */
+static inline char32_t
+to_char32(pg_wchar wc)
+{
+	Assert(GetDatabaseEncoding() == PG_UTF8);
+	return (char32_t) wc;
+}
+
+static inline pg_wchar
+to_pg_wchar(char32_t c32)
+{
+	Assert(GetDatabaseEncoding() == PG_UTF8);
+	return (pg_wchar) c32;
+}
+
 /*
  * Simple word boundary iterator that draws boundaries each time the result of
  * pg_u_isalnum() changes.
@@ -47,7 +63,7 @@ initcap_wbnext(void *state)
 	while (wbstate->offset < wbstate->len &&
 		   wbstate->str[wbstate->offset] != '\0')
 	{
-		pg_wchar	u = utf8_to_unicode((unsigned char *) wbstate->str +
+		char32_t	u = utf8_to_unicode((unsigned char *) wbstate->str +
 										wbstate->offset);
 		bool		curr_alnum = pg_u_isalnum(u, wbstate->posix);
 
@@ -112,61 +128,61 @@ strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
 static bool
 wc_isdigit_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return pg_u_isdigit(wc, !locale->builtin.casemap_full);
+	return pg_u_isdigit(to_char32(wc), !locale->builtin.casemap_full);
 }
 
 static bool
 wc_isalpha_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return pg_u_isalpha(wc);
+	return pg_u_isalpha(to_char32(wc));
 }
 
 static bool
 wc_isalnum_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return pg_u_isalnum(wc, !locale->builtin.casemap_full);
+	return pg_u_isalnum(to_char32(wc), !locale->builtin.casemap_full);
 }
 
 static bool
 wc_isupper_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return pg_u_isupper(wc);
+	return pg_u_isupper(to_char32(wc));
 }
 
 static bool
 wc_islower_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return pg_u_islower(wc);
+	return pg_u_islower(to_char32(wc));
 }
 
 static bool
 wc_isgraph_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return pg_u_isgraph(wc);
+	return pg_u_isgraph(to_char32(wc));
 }
 
 static bool
 wc_isprint_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return pg_u_isprint(wc);
+	return pg_u_isprint(to_char32(wc));
 }
 
 static bool
 wc_ispunct_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return pg_u_ispunct(wc, !locale->builtin.casemap_full);
+	return pg_u_ispunct(to_char32(wc), !locale->builtin.casemap_full);
 }
 
 static bool
 wc_isspace_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return pg_u_isspace(wc);
+	return pg_u_isspace(to_char32(wc));
 }
 
 static bool
 wc_isxdigit_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return pg_u_isxdigit(wc, !locale->builtin.casemap_full);
+	return pg_u_isxdigit(to_char32(wc), !locale->builtin.casemap_full);
 }
 
 static bool
@@ -179,13 +195,13 @@ char_is_cased_builtin(char ch, pg_locale_t locale)
 static pg_wchar
 wc_toupper_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return unicode_uppercase_simple(wc);
+	return to_pg_wchar(unicode_uppercase_simple(to_char32(wc)));
 }
 
 static pg_wchar
 wc_tolower_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	return unicode_lowercase_simple(wc);
+	return to_pg_wchar(unicode_lowercase_simple(to_char32(wc)));
 }
 
 static const struct ctype_methods ctype_methods_builtin = {
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 2c398cd9e5c..fa1a975cab9 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -5419,12 +5419,12 @@ unicode_assigned(PG_FUNCTION_ARGS)
 		ereport(ERROR,
 				(errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
 
-	/* convert to pg_wchar */
+	/* convert to char32_t */
 	size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
 	p = (unsigned char *) VARDATA_ANY(input);
 	for (int i = 0; i < size; i++)
 	{
-		pg_wchar	uchar = utf8_to_unicode(p);
+		char32_t	uchar = utf8_to_unicode(p);
 		int			category = unicode_category(uchar);
 
 		if (category == PG_U_UNASSIGNED)
@@ -5443,24 +5443,24 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
 	char	   *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
 	UnicodeNormalizationForm form;
 	int			size;
-	pg_wchar   *input_chars;
-	pg_wchar   *output_chars;
+	char32_t   *input_chars;
+	char32_t   *output_chars;
 	unsigned char *p;
 	text	   *result;
 	int			i;
 
 	form = unicode_norm_form_from_string(formstr);
 
-	/* convert to pg_wchar */
+	/* convert to char32_t */
 	size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
-	input_chars = palloc((size + 1) * sizeof(pg_wchar));
+	input_chars = palloc((size + 1) * sizeof(char32_t));
 	p = (unsigned char *) VARDATA_ANY(input);
 	for (i = 0; i < size; i++)
 	{
 		input_chars[i] = utf8_to_unicode(p);
 		p += pg_utf_mblen(p);
 	}
-	input_chars[i] = (pg_wchar) '\0';
+	input_chars[i] = (char32_t) '\0';
 	Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
 
 	/* action */
@@ -5468,7 +5468,7 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
 
 	/* convert back to UTF-8 string */
 	size = 0;
-	for (pg_wchar *wp = output_chars; *wp; wp++)
+	for (char32_t *wp = output_chars; *wp; wp++)
 	{
 		unsigned char buf[4];
 
@@ -5480,7 +5480,7 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
 	SET_VARSIZE(result, size + VARHDRSZ);
 
 	p = (unsigned char *) VARDATA_ANY(result);
-	for (pg_wchar *wp = output_chars; *wp; wp++)
+	for (char32_t *wp = output_chars; *wp; wp++)
 	{
 		unicode_to_utf8(*wp, p);
 		p += pg_utf_mblen(p);
@@ -5509,8 +5509,8 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 	char	   *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
 	UnicodeNormalizationForm form;
 	int			size;
-	pg_wchar   *input_chars;
-	pg_wchar   *output_chars;
+	char32_t   *input_chars;
+	char32_t   *output_chars;
 	unsigned char *p;
 	int			i;
 	UnicodeNormalizationQC quickcheck;
@@ -5519,16 +5519,16 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 
 	form = unicode_norm_form_from_string(formstr);
 
-	/* convert to pg_wchar */
+	/* convert to char32_t */
 	size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
-	input_chars = palloc((size + 1) * sizeof(pg_wchar));
+	input_chars = palloc((size + 1) * sizeof(char32_t));
 	p = (unsigned char *) VARDATA_ANY(input);
 	for (i = 0; i < size; i++)
 	{
 		input_chars[i] = utf8_to_unicode(p);
 		p += pg_utf_mblen(p);
 	}
-	input_chars[i] = (pg_wchar) '\0';
+	input_chars[i] = (char32_t) '\0';
 	Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
 
 	/* quick check (see UAX #15) */
@@ -5542,11 +5542,11 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 	output_chars = unicode_normalize(form, input_chars);
 
 	output_size = 0;
-	for (pg_wchar *wp = output_chars; *wp; wp++)
+	for (char32_t *wp = output_chars; *wp; wp++)
 		output_size++;
 
 	result = (size == output_size) &&
-		(memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
+		(memcmp(input_chars, output_chars, size * sizeof(char32_t)) == 0);
 
 	PG_RETURN_BOOL(result);
 }
@@ -5602,7 +5602,7 @@ unistr(PG_FUNCTION_ARGS)
 	int			len;
 	StringInfoData str;
 	text	   *result;
-	pg_wchar	pair_first = 0;
+	char32_t	pair_first = 0;
 	char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
 
 	instr = VARDATA_ANY(input_text);
@@ -5626,7 +5626,7 @@ unistr(PG_FUNCTION_ARGS)
 			else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
 					 (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
 			{
-				pg_wchar	unicode;
+				char32_t	unicode;
 				int			offset = instr[1] == 'u' ? 2 : 1;
 
 				unicode = hexval_n(instr + offset, 4);
@@ -5662,7 +5662,7 @@ unistr(PG_FUNCTION_ARGS)
 			}
 			else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
 			{
-				pg_wchar	unicode;
+				char32_t	unicode;
 
 				unicode = hexval_n(instr + 2, 6);
 
@@ -5697,7 +5697,7 @@ unistr(PG_FUNCTION_ARGS)
 			}
 			else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
 			{
-				pg_wchar	unicode;
+				char32_t	unicode;
 
 				unicode = hexval_n(instr + 2, 8);
 
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c
index 886ecbad871..fb629ed5c8f 100644
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -862,7 +862,7 @@ perform_default_encoding_conversion(const char *src, int len,
  * may call this outside any transaction, or in an aborted transaction.
  */
 void
-pg_unicode_to_server(pg_wchar c, unsigned char *s)
+pg_unicode_to_server(char32_t c, unsigned char *s)
 {
 	unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
 	int			c_as_utf8_len;
@@ -924,7 +924,7 @@ pg_unicode_to_server(pg_wchar c, unsigned char *s)
  * but simply return false on conversion failure.
  */
 bool
-pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s)
+pg_unicode_to_server_noerror(char32_t c, unsigned char *s)
 {
 	unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
 	int			c_as_utf8_len;
diff --git a/src/common/saslprep.c b/src/common/saslprep.c
index 97beb47940b..101e8d65a4d 100644
--- a/src/common/saslprep.c
+++ b/src/common/saslprep.c
@@ -47,7 +47,7 @@
 
 /* Prototypes for local functions */
 static int	codepoint_range_cmp(const void *a, const void *b);
-static bool is_code_in_table(pg_wchar code, const pg_wchar *map, int mapsize);
+static bool is_code_in_table(char32_t code, const char32_t *map, int mapsize);
 static int	pg_utf8_string_len(const char *source);
 
 /*
@@ -64,7 +64,7 @@ static int	pg_utf8_string_len(const char *source);
  *
  * These are all mapped to the ASCII space character (U+00A0).
  */
-static const pg_wchar non_ascii_space_ranges[] =
+static const char32_t non_ascii_space_ranges[] =
 {
 	0x00A0, 0x00A0,
 	0x1680, 0x1680,
@@ -79,7 +79,7 @@ static const pg_wchar non_ascii_space_ranges[] =
  *
  * If any of these appear in the input, they are removed.
  */
-static const pg_wchar commonly_mapped_to_nothing_ranges[] =
+static const char32_t commonly_mapped_to_nothing_ranges[] =
 {
 	0x00AD, 0x00AD,
 	0x034F, 0x034F,
@@ -114,7 +114,7 @@ static const pg_wchar commonly_mapped_to_nothing_ranges[] =
  * tables, so one code might originate from multiple source tables.
  * Adjacent ranges have also been merged together, to save space.
  */
-static const pg_wchar prohibited_output_ranges[] =
+static const char32_t prohibited_output_ranges[] =
 {
 	0x0000, 0x001F,				/* C.2.1 */
 	0x007F, 0x00A0,				/* C.1.2, C.2.1, C.2.2 */
@@ -155,7 +155,7 @@ static const pg_wchar prohibited_output_ranges[] =
 };
 
 /* A.1 Unassigned code points in Unicode 3.2 */
-static const pg_wchar unassigned_codepoint_ranges[] =
+static const char32_t unassigned_codepoint_ranges[] =
 {
 	0x0221, 0x0221,
 	0x0234, 0x024F,
@@ -556,7 +556,7 @@ static const pg_wchar unassigned_codepoint_ranges[] =
 };
 
 /* D.1 Characters with bidirectional property "R" or "AL" */
-static const pg_wchar RandALCat_codepoint_ranges[] =
+static const char32_t RandALCat_codepoint_ranges[] =
 {
 	0x05BE, 0x05BE,
 	0x05C0, 0x05C0,
@@ -595,7 +595,7 @@ static const pg_wchar RandALCat_codepoint_ranges[] =
 };
 
 /* D.2 Characters with bidirectional property "L" */
-static const pg_wchar LCat_codepoint_ranges[] =
+static const char32_t LCat_codepoint_ranges[] =
 {
 	0x0041, 0x005A,
 	0x0061, 0x007A,
@@ -968,8 +968,8 @@ static const pg_wchar LCat_codepoint_ranges[] =
 static int
 codepoint_range_cmp(const void *a, const void *b)
 {
-	const pg_wchar *key = (const pg_wchar *) a;
-	const pg_wchar *range = (const pg_wchar *) b;
+	const char32_t *key = (const char32_t *) a;
+	const char32_t *range = (const char32_t *) b;
 
 	if (*key < range[0])
 		return -1;				/* less than lower bound */
@@ -980,14 +980,14 @@ codepoint_range_cmp(const void *a, const void *b)
 }
 
 static bool
-is_code_in_table(pg_wchar code, const pg_wchar *map, int mapsize)
+is_code_in_table(char32_t code, const char32_t *map, int mapsize)
 {
 	Assert(mapsize % 2 == 0);
 
 	if (code < map[0] || code > map[mapsize - 1])
 		return false;
 
-	if (bsearch(&code, map, mapsize / 2, sizeof(pg_wchar) * 2,
+	if (bsearch(&code, map, mapsize / 2, sizeof(char32_t) * 2,
 				codepoint_range_cmp))
 		return true;
 	else
@@ -1046,8 +1046,8 @@ pg_utf8_string_len(const char *source)
 pg_saslprep_rc
 pg_saslprep(const char *input, char **output)
 {
-	pg_wchar   *input_chars = NULL;
-	pg_wchar   *output_chars = NULL;
+	char32_t   *input_chars = NULL;
+	char32_t   *output_chars = NULL;
 	int			input_size;
 	char	   *result;
 	int			result_size;
@@ -1055,7 +1055,7 @@ pg_saslprep(const char *input, char **output)
 	int			i;
 	bool		contains_RandALCat;
 	unsigned char *p;
-	pg_wchar   *wp;
+	char32_t   *wp;
 
 	/* Ensure we return *output as NULL on failure */
 	*output = NULL;
@@ -1080,10 +1080,10 @@ pg_saslprep(const char *input, char **output)
 	input_size = pg_utf8_string_len(input);
 	if (input_size < 0)
 		return SASLPREP_INVALID_UTF8;
-	if (input_size >= MaxAllocSize / sizeof(pg_wchar))
+	if (input_size >= MaxAllocSize / sizeof(char32_t))
 		goto oom;
 
-	input_chars = ALLOC((input_size + 1) * sizeof(pg_wchar));
+	input_chars = ALLOC((input_size + 1) * sizeof(char32_t));
 	if (!input_chars)
 		goto oom;
 
@@ -1093,7 +1093,7 @@ pg_saslprep(const char *input, char **output)
 		input_chars[i] = utf8_to_unicode(p);
 		p += pg_utf_mblen(p);
 	}
-	input_chars[i] = (pg_wchar) '\0';
+	input_chars[i] = (char32_t) '\0';
 
 	/*
 	 * The steps below correspond to the steps listed in [RFC3454], Section
@@ -1107,7 +1107,7 @@ pg_saslprep(const char *input, char **output)
 	count = 0;
 	for (i = 0; i < input_size; i++)
 	{
-		pg_wchar	code = input_chars[i];
+		char32_t	code = input_chars[i];
 
 		if (IS_CODE_IN_TABLE(code, non_ascii_space_ranges))
 			input_chars[count++] = 0x0020;
@@ -1118,7 +1118,7 @@ pg_saslprep(const char *input, char **output)
 		else
 			input_chars[count++] = code;
 	}
-	input_chars[count] = (pg_wchar) '\0';
+	input_chars[count] = (char32_t) '\0';
 	input_size = count;
 
 	if (input_size == 0)
@@ -1138,7 +1138,7 @@ pg_saslprep(const char *input, char **output)
 	 */
 	for (i = 0; i < input_size; i++)
 	{
-		pg_wchar	code = input_chars[i];
+		char32_t	code = input_chars[i];
 
 		if (IS_CODE_IN_TABLE(code, prohibited_output_ranges))
 			goto prohibited;
@@ -1170,7 +1170,7 @@ pg_saslprep(const char *input, char **output)
 	contains_RandALCat = false;
 	for (i = 0; i < input_size; i++)
 	{
-		pg_wchar	code = input_chars[i];
+		char32_t	code = input_chars[i];
 
 		if (IS_CODE_IN_TABLE(code, RandALCat_codepoint_ranges))
 		{
@@ -1181,12 +1181,12 @@ pg_saslprep(const char *input, char **output)
 
 	if (contains_RandALCat)
 	{
-		pg_wchar	first = input_chars[0];
-		pg_wchar	last = input_chars[input_size - 1];
+		char32_t	first = input_chars[0];
+		char32_t	last = input_chars[input_size - 1];
 
 		for (i = 0; i < input_size; i++)
 		{
-			pg_wchar	code = input_chars[i];
+			char32_t	code = input_chars[i];
 
 			if (IS_CODE_IN_TABLE(code, LCat_codepoint_ranges))
 				goto prohibited;
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index fdfb62e8552..00d4f85e5a5 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -24,6 +24,7 @@
 #include "common/unicode_case.h"
 #include "common/unicode_category.h"
 #include "common/unicode_version.h"
+#include "mb/pg_wchar.h"
 
 /* enough to hold largest source or result string, including NUL */
 #define BUFSZ 256
@@ -54,7 +55,7 @@ initcap_wbnext(void *state)
 	while (wbstate->offset < wbstate->len &&
 		   wbstate->str[wbstate->offset] != '\0')
 	{
-		pg_wchar	u = utf8_to_unicode((unsigned char *) wbstate->str +
+		char32_t	u = utf8_to_unicode((unsigned char *) wbstate->str +
 										wbstate->offset);
 		bool		curr_alnum = pg_u_isalnum(u, wbstate->posix);
 
@@ -77,16 +78,16 @@ initcap_wbnext(void *state)
 #ifdef USE_ICU
 
 static void
-icu_test_simple(pg_wchar code)
+icu_test_simple(char32_t code)
 {
-	pg_wchar	lower = unicode_lowercase_simple(code);
-	pg_wchar	title = unicode_titlecase_simple(code);
-	pg_wchar	upper = unicode_uppercase_simple(code);
-	pg_wchar	fold = unicode_casefold_simple(code);
-	pg_wchar	iculower = u_tolower(code);
-	pg_wchar	icutitle = u_totitle(code);
-	pg_wchar	icuupper = u_toupper(code);
-	pg_wchar	icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT);
+	char32_t	lower = unicode_lowercase_simple(code);
+	char32_t	title = unicode_titlecase_simple(code);
+	char32_t	upper = unicode_uppercase_simple(code);
+	char32_t	fold = unicode_casefold_simple(code);
+	char32_t	iculower = u_tolower(code);
+	char32_t	icutitle = u_totitle(code);
+	char32_t	icuupper = u_toupper(code);
+	char32_t	icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT);
 
 	if (lower != iculower || title != icutitle || upper != icuupper ||
 		fold != icufold)
@@ -172,7 +173,7 @@ test_icu(void)
 	int			successful = 0;
 	int			skipped_mismatch = 0;
 
-	for (pg_wchar code = 0; code <= 0x10ffff; code++)
+	for (char32_t code = 0; code <= 0x10ffff; code++)
 	{
 		pg_unicode_category category = unicode_category(code);
 
diff --git a/src/common/unicode/category_test.c b/src/common/unicode/category_test.c
index 5d37ba39196..1e8c1f7905f 100644
--- a/src/common/unicode/category_test.c
+++ b/src/common/unicode/category_test.c
@@ -22,6 +22,7 @@
 
 #include "common/unicode_category.h"
 #include "common/unicode_version.h"
+#include "mb/pg_wchar.h"
 
 static int	pg_unicode_version = 0;
 #ifdef USE_ICU
@@ -59,7 +60,7 @@ icu_test()
 	int			pg_skipped_codepoints = 0;
 	int			icu_skipped_codepoints = 0;
 
-	for (pg_wchar code = 0; code <= 0x10ffff; code++)
+	for (char32_t code = 0; code <= 0x10ffff; code++)
 	{
 		uint8_t		pg_category = unicode_category(code);
 		uint8_t		icu_category = u_charType(code);
diff --git a/src/common/unicode/generate-norm_test_table.pl b/src/common/unicode/generate-norm_test_table.pl
index 1b401be9409..1a8b908ff33 100644
--- a/src/common/unicode/generate-norm_test_table.pl
+++ b/src/common/unicode/generate-norm_test_table.pl
@@ -47,8 +47,8 @@ print $OUTPUT <<HEADER;
 typedef struct
 {
 	int			linenum;
-	pg_wchar	input[50];
-	pg_wchar	output[4][50];
+	char32_t	input[50];
+	char32_t	output[4][50];
 } pg_unicode_test;
 
 /* test table */
diff --git a/src/common/unicode/generate-unicode_case_table.pl b/src/common/unicode/generate-unicode_case_table.pl
index 5d9ddd62803..f71eb25c94e 100644
--- a/src/common/unicode/generate-unicode_case_table.pl
+++ b/src/common/unicode/generate-unicode_case_table.pl
@@ -270,7 +270,6 @@ print $OT <<"EOS";
  */
 
 #include "common/unicode_case.h"
-#include "mb/pg_wchar.h"
 
 /*
  * The maximum number of codepoints that can result from case mapping
@@ -297,7 +296,7 @@ typedef enum
 typedef struct
 {
 	int16		conditions;
-	pg_wchar	map[NCaseKind][MAX_CASE_EXPANSION];
+	char32_t	map[NCaseKind][MAX_CASE_EXPANSION];
 } pg_special_case;
 
 /*
@@ -430,7 +429,7 @@ foreach my $kind ('lower', 'title', 'upper', 'fold')
  * The entry case_map_${kind}[case_index(codepoint)] is the mapping for the
  * given codepoint.
  */
-static const pg_wchar case_map_$kind\[$index\] =
+static const char32_t case_map_$kind\[$index\] =
 {
 EOS
 
@@ -502,7 +501,7 @@ print $OT <<"EOS";
  * the offset into the mapping tables.
  */
 static inline uint16
-case_index(pg_wchar cp)
+case_index(char32_t cp)
 {
 	/* Fast path for codepoints < $fastpath_limit */
 	if (cp < $fastpath_limit)
diff --git a/src/common/unicode/generate-unicode_category_table.pl b/src/common/unicode/generate-unicode_category_table.pl
index abab5cd9696..7e094b13720 100644
--- a/src/common/unicode/generate-unicode_category_table.pl
+++ b/src/common/unicode/generate-unicode_category_table.pl
@@ -366,15 +366,15 @@ print $OT <<"EOS";
  */
 typedef struct
 {
-	uint32		first;			/* Unicode codepoint */
-	uint32		last;			/* Unicode codepoint */
+	char32_t	first;			/* Unicode codepoint */
+	char32_t	last;			/* Unicode codepoint */
 	uint8		category;		/* General Category */
 } pg_category_range;
 
 typedef struct
 {
-	uint32		first;			/* Unicode codepoint */
-	uint32		last;			/* Unicode codepoint */
+	char32_t	first;			/* Unicode codepoint */
+	char32_t	last;			/* Unicode codepoint */
 } pg_unicode_range;
 
 typedef struct
diff --git a/src/common/unicode/norm_test.c b/src/common/unicode/norm_test.c
index 25bc59463f2..058817f1719 100644
--- a/src/common/unicode/norm_test.c
+++ b/src/common/unicode/norm_test.c
@@ -20,7 +20,7 @@
 #include "norm_test_table.h"
 
 static char *
-print_wchar_str(const pg_wchar *s)
+print_wchar_str(const char32_t *s)
 {
 #define BUF_DIGITS 50
 	static char buf[BUF_DIGITS * 11 + 1];
@@ -41,7 +41,7 @@ print_wchar_str(const pg_wchar *s)
 }
 
 static int
-pg_wcscmp(const pg_wchar *s1, const pg_wchar *s2)
+pg_wcscmp(const char32_t *s1, const char32_t *s2)
 {
 	for (;;)
 	{
@@ -65,7 +65,7 @@ main(int argc, char **argv)
 	{
 		for (int form = 0; form < 4; form++)
 		{
-			pg_wchar   *result;
+			char32_t   *result;
 
 			result = unicode_normalize(form, test->input);
 
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index 073faf6a0d5..e5e494db43c 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -30,7 +30,7 @@ enum CaseMapResult
 /*
  * Map for each case kind.
  */
-static const pg_wchar *const casekind_map[NCaseKind] =
+static const char32_t *const casekind_map[NCaseKind] =
 {
 	[CaseLower] = case_map_lower,
 	[CaseTitle] = case_map_title,
@@ -38,42 +38,42 @@ static const pg_wchar *const casekind_map[NCaseKind] =
 	[CaseFold] = case_map_fold,
 };
 
-static pg_wchar find_case_map(pg_wchar ucs, const pg_wchar *map);
+static char32_t find_case_map(char32_t ucs, const char32_t *map);
 static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 						   CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
 						   void *wbstate);
-static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full,
+static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full,
 								  const char *src, size_t srclen, size_t srcoff,
-								  pg_wchar *simple, const pg_wchar **special);
+								  char32_t *simple, const char32_t **special);
 
-pg_wchar
-unicode_lowercase_simple(pg_wchar code)
+char32_t
+unicode_lowercase_simple(char32_t code)
 {
-	pg_wchar	cp = find_case_map(code, case_map_lower);
+	char32_t	cp = find_case_map(code, case_map_lower);
 
 	return cp != 0 ? cp : code;
 }
 
-pg_wchar
-unicode_titlecase_simple(pg_wchar code)
+char32_t
+unicode_titlecase_simple(char32_t code)
 {
-	pg_wchar	cp = find_case_map(code, case_map_title);
+	char32_t	cp = find_case_map(code, case_map_title);
 
 	return cp != 0 ? cp : code;
 }
 
-pg_wchar
-unicode_uppercase_simple(pg_wchar code)
+char32_t
+unicode_uppercase_simple(char32_t code)
 {
-	pg_wchar	cp = find_case_map(code, case_map_upper);
+	char32_t	cp = find_case_map(code, case_map_upper);
 
 	return cp != 0 ? cp : code;
 }
 
-pg_wchar
-unicode_casefold_simple(pg_wchar code)
+char32_t
+unicode_casefold_simple(char32_t code)
 {
-	pg_wchar	cp = find_case_map(code, case_map_fold);
+	char32_t	cp = find_case_map(code, case_map_fold);
 
 	return cp != 0 ? cp : code;
 }
@@ -231,10 +231,10 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 
 	while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
 	{
-		pg_wchar	u1 = utf8_to_unicode((unsigned char *) src + srcoff);
+		char32_t	u1 = utf8_to_unicode((unsigned char *) src + srcoff);
 		int			u1len = unicode_utf8len(u1);
-		pg_wchar	simple = 0;
-		const pg_wchar *special = NULL;
+		char32_t	simple = 0;
+		const char32_t *special = NULL;
 		enum CaseMapResult casemap_result;
 
 		if (str_casekind == CaseTitle)
@@ -265,8 +265,8 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 			case CASEMAP_SIMPLE:
 				{
 					/* replace with single character */
-					pg_wchar	u2 = simple;
-					pg_wchar	u2len = unicode_utf8len(u2);
+					char32_t	u2 = simple;
+					char32_t	u2len = unicode_utf8len(u2);
 
 					Assert(special == NULL);
 					if (result_len + u2len <= dstsize)
@@ -280,7 +280,7 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 				Assert(simple == 0);
 				for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
 				{
-					pg_wchar	u2 = special[i];
+					char32_t	u2 = special[i];
 					size_t		u2len = unicode_utf8len(u2);
 
 					if (result_len + u2len <= dstsize)
@@ -320,7 +320,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 	{
 		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
 		{
-			pg_wchar	curr = utf8_to_unicode(str + i);
+			char32_t	curr = utf8_to_unicode(str + i);
 
 			if (pg_u_prop_case_ignorable(curr))
 				continue;
@@ -344,7 +344,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 	{
 		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
 		{
-			pg_wchar	curr = utf8_to_unicode(str + i);
+			char32_t	curr = utf8_to_unicode(str + i);
 
 			if (pg_u_prop_case_ignorable(curr))
 				continue;
@@ -394,9 +394,9 @@ check_special_conditions(int conditions, const char *str, size_t len,
  * character without modification.
  */
 static enum CaseMapResult
-casemap(pg_wchar u1, CaseKind casekind, bool full,
+casemap(char32_t u1, CaseKind casekind, bool full,
 		const char *src, size_t srclen, size_t srcoff,
-		pg_wchar *simple, const pg_wchar **special)
+		char32_t *simple, const char32_t **special)
 {
 	uint16		idx;
 
@@ -434,8 +434,8 @@ casemap(pg_wchar u1, CaseKind casekind, bool full,
  * Find entry in simple case map.
  * If the entry does not exist, 0 will be returned.
  */
-static pg_wchar
-find_case_map(pg_wchar ucs, const pg_wchar *map)
+static char32_t
+find_case_map(char32_t ucs, const char32_t *map)
 {
 	/* Fast path for codepoints < 0x80 */
 	if (ucs < 0x80)
diff --git a/src/common/unicode_category.c b/src/common/unicode_category.c
index 4136c4d4f92..aab667a7bb4 100644
--- a/src/common/unicode_category.c
+++ b/src/common/unicode_category.c
@@ -1,7 +1,7 @@
 /*-------------------------------------------------------------------------
  * unicode_category.c
  *		Determine general category and character properties of Unicode
- *		characters. Encoding must be UTF8, where we assume that the pg_wchar
+ *		characters. Encoding must be UTF8, where we assume that the char32_t
  *		representation is a code point.
  *
  * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
@@ -76,13 +76,13 @@
 #define PG_U_CHARACTER_TAB	0x09
 
 static bool range_search(const pg_unicode_range *tbl, size_t size,
-						 pg_wchar code);
+						 char32_t code);
 
 /*
  * Unicode general category for the given codepoint.
  */
 pg_unicode_category
-unicode_category(pg_wchar code)
+unicode_category(char32_t code)
 {
 	int			min = 0;
 	int			mid;
@@ -108,7 +108,7 @@ unicode_category(pg_wchar code)
 }
 
 bool
-pg_u_prop_alphabetic(pg_wchar code)
+pg_u_prop_alphabetic(char32_t code)
 {
 	if (code < 0x80)
 		return unicode_opt_ascii[code].properties & PG_U_PROP_ALPHABETIC;
@@ -119,7 +119,7 @@ pg_u_prop_alphabetic(pg_wchar code)
 }
 
 bool
-pg_u_prop_lowercase(pg_wchar code)
+pg_u_prop_lowercase(char32_t code)
 {
 	if (code < 0x80)
 		return unicode_opt_ascii[code].properties & PG_U_PROP_LOWERCASE;
@@ -130,7 +130,7 @@ pg_u_prop_lowercase(pg_wchar code)
 }
 
 bool
-pg_u_prop_uppercase(pg_wchar code)
+pg_u_prop_uppercase(char32_t code)
 {
 	if (code < 0x80)
 		return unicode_opt_ascii[code].properties & PG_U_PROP_UPPERCASE;
@@ -141,7 +141,7 @@ pg_u_prop_uppercase(pg_wchar code)
 }
 
 bool
-pg_u_prop_cased(pg_wchar code)
+pg_u_prop_cased(char32_t code)
 {
 	uint32		category_mask;
 
@@ -156,7 +156,7 @@ pg_u_prop_cased(pg_wchar code)
 }
 
 bool
-pg_u_prop_case_ignorable(pg_wchar code)
+pg_u_prop_case_ignorable(char32_t code)
 {
 	if (code < 0x80)
 		return unicode_opt_ascii[code].properties & PG_U_PROP_CASE_IGNORABLE;
@@ -167,7 +167,7 @@ pg_u_prop_case_ignorable(pg_wchar code)
 }
 
 bool
-pg_u_prop_white_space(pg_wchar code)
+pg_u_prop_white_space(char32_t code)
 {
 	if (code < 0x80)
 		return unicode_opt_ascii[code].properties & PG_U_PROP_WHITE_SPACE;
@@ -178,7 +178,7 @@ pg_u_prop_white_space(pg_wchar code)
 }
 
 bool
-pg_u_prop_hex_digit(pg_wchar code)
+pg_u_prop_hex_digit(char32_t code)
 {
 	if (code < 0x80)
 		return unicode_opt_ascii[code].properties & PG_U_PROP_HEX_DIGIT;
@@ -189,7 +189,7 @@ pg_u_prop_hex_digit(pg_wchar code)
 }
 
 bool
-pg_u_prop_join_control(pg_wchar code)
+pg_u_prop_join_control(char32_t code)
 {
 	if (code < 0x80)
 		return unicode_opt_ascii[code].properties & PG_U_PROP_JOIN_CONTROL;
@@ -208,7 +208,7 @@ pg_u_prop_join_control(pg_wchar code)
  */
 
 bool
-pg_u_isdigit(pg_wchar code, bool posix)
+pg_u_isdigit(char32_t code, bool posix)
 {
 	if (posix)
 		return ('0' <= code && code <= '9');
@@ -217,19 +217,19 @@ pg_u_isdigit(pg_wchar code, bool posix)
 }
 
 bool
-pg_u_isalpha(pg_wchar code)
+pg_u_isalpha(char32_t code)
 {
 	return pg_u_prop_alphabetic(code);
 }
 
 bool
-pg_u_isalnum(pg_wchar code, bool posix)
+pg_u_isalnum(char32_t code, bool posix)
 {
 	return pg_u_isalpha(code) || pg_u_isdigit(code, posix);
 }
 
 bool
-pg_u_isword(pg_wchar code)
+pg_u_isword(char32_t code)
 {
 	uint32		category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
 
@@ -240,32 +240,32 @@ pg_u_isword(pg_wchar code)
 }
 
 bool
-pg_u_isupper(pg_wchar code)
+pg_u_isupper(char32_t code)
 {
 	return pg_u_prop_uppercase(code);
 }
 
 bool
-pg_u_islower(pg_wchar code)
+pg_u_islower(char32_t code)
 {
 	return pg_u_prop_lowercase(code);
 }
 
 bool
-pg_u_isblank(pg_wchar code)
+pg_u_isblank(char32_t code)
 {
 	return code == PG_U_CHARACTER_TAB ||
 		unicode_category(code) == PG_U_SPACE_SEPARATOR;
 }
 
 bool
-pg_u_iscntrl(pg_wchar code)
+pg_u_iscntrl(char32_t code)
 {
 	return unicode_category(code) == PG_U_CONTROL;
 }
 
 bool
-pg_u_isgraph(pg_wchar code)
+pg_u_isgraph(char32_t code)
 {
 	uint32		category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
 
@@ -276,7 +276,7 @@ pg_u_isgraph(pg_wchar code)
 }
 
 bool
-pg_u_isprint(pg_wchar code)
+pg_u_isprint(char32_t code)
 {
 	pg_unicode_category category = unicode_category(code);
 
@@ -287,7 +287,7 @@ pg_u_isprint(pg_wchar code)
 }
 
 bool
-pg_u_ispunct(pg_wchar code, bool posix)
+pg_u_ispunct(char32_t code, bool posix)
 {
 	uint32		category_mask;
 
@@ -308,13 +308,13 @@ pg_u_ispunct(pg_wchar code, bool posix)
 }
 
 bool
-pg_u_isspace(pg_wchar code)
+pg_u_isspace(char32_t code)
 {
 	return pg_u_prop_white_space(code);
 }
 
 bool
-pg_u_isxdigit(pg_wchar code, bool posix)
+pg_u_isxdigit(char32_t code, bool posix)
 {
 	if (posix)
 		return (('0' <= code && code <= '9') ||
@@ -478,7 +478,7 @@ unicode_category_abbrev(pg_unicode_category category)
  * given table.
  */
 static bool
-range_search(const pg_unicode_range *tbl, size_t size, pg_wchar code)
+range_search(const pg_unicode_range *tbl, size_t size, char32_t code)
 {
 	int			min = 0;
 	int			mid;
diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c
index 6654b4cbc49..489d99cd5ab 100644
--- a/src/common/unicode_norm.c
+++ b/src/common/unicode_norm.c
@@ -69,7 +69,7 @@ conv_compare(const void *p1, const void *p2)
  * lookup, while the frontend version uses a binary search.
  */
 static const pg_unicode_decomposition *
-get_code_entry(pg_wchar code)
+get_code_entry(char32_t code)
 {
 #ifndef FRONTEND
 	int			h;
@@ -109,7 +109,7 @@ get_code_entry(pg_wchar code)
  * Get the combining class of the given codepoint.
  */
 static uint8
-get_canonical_class(pg_wchar code)
+get_canonical_class(char32_t code)
 {
 	const pg_unicode_decomposition *entry = get_code_entry(code);
 
@@ -130,15 +130,15 @@ get_canonical_class(pg_wchar code)
  * Note: the returned pointer can point to statically allocated buffer, and
  * is only valid until next call to this function!
  */
-static const pg_wchar *
+static const char32_t *
 get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
 {
-	static pg_wchar x;
+	static char32_t x;
 
 	if (DECOMPOSITION_IS_INLINE(entry))
 	{
 		Assert(DECOMPOSITION_SIZE(entry) == 1);
-		x = (pg_wchar) entry->dec_index;
+		x = (char32_t) entry->dec_index;
 		*dec_size = 1;
 		return &x;
 	}
@@ -156,7 +156,7 @@ get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
  * are, in turn, decomposable.
  */
 static int
-get_decomposed_size(pg_wchar code, bool compat)
+get_decomposed_size(char32_t code, bool compat)
 {
 	const pg_unicode_decomposition *entry;
 	int			size = 0;
@@ -318,7 +318,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result)
  * in the array result.
  */
 static void
-decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
+decompose_code(char32_t code, bool compat, char32_t **result, int *current)
 {
 	const pg_unicode_decomposition *entry;
 	int			i;
@@ -337,7 +337,7 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
 					v,
 					tindex,
 					sindex;
-		pg_wchar   *res = *result;
+		char32_t   *res = *result;
 
 		sindex = code - SBASE;
 		l = LBASE + sindex / (VCOUNT * TCOUNT);
@@ -369,7 +369,7 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
 	if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
 		(!compat && DECOMPOSITION_IS_COMPAT(entry)))
 	{
-		pg_wchar   *res = *result;
+		char32_t   *res = *result;
 
 		res[*current] = code;
 		(*current)++;
@@ -382,7 +382,7 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
 	decomp = get_code_decomposition(entry, &dec_size);
 	for (i = 0; i < dec_size; i++)
 	{
-		pg_wchar	lcode = (pg_wchar) decomp[i];
+		char32_t	lcode = (char32_t) decomp[i];
 
 		/* Leave if no more decompositions */
 		decompose_code(lcode, compat, result, current);
@@ -398,17 +398,17 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
  * malloc. Or NULL if we run out of memory. In backend, the returned
  * string is palloc'd instead, and OOM is reported with ereport().
  */
-pg_wchar *
-unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
+char32_t *
+unicode_normalize(UnicodeNormalizationForm form, const char32_t *input)
 {
 	bool		compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
 	bool		recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
-	pg_wchar   *decomp_chars;
-	pg_wchar   *recomp_chars;
+	char32_t   *decomp_chars;
+	char32_t   *recomp_chars;
 	int			decomp_size,
 				current_size;
 	int			count;
-	const pg_wchar *p;
+	const char32_t *p;
 
 	/* variables for recomposition */
 	int			last_class;
@@ -425,7 +425,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
 	for (p = input; *p; p++)
 		decomp_size += get_decomposed_size(*p, compat);
 
-	decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
+	decomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t));
 	if (decomp_chars == NULL)
 		return NULL;
 
@@ -448,9 +448,9 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
 	 */
 	for (count = 1; count < decomp_size; count++)
 	{
-		pg_wchar	prev = decomp_chars[count - 1];
-		pg_wchar	next = decomp_chars[count];
-		pg_wchar	tmp;
+		char32_t	prev = decomp_chars[count - 1];
+		char32_t	next = decomp_chars[count];
+		char32_t	tmp;
 		const uint8 prevClass = get_canonical_class(prev);
 		const uint8 nextClass = get_canonical_class(next);
 
@@ -487,7 +487,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
 	 * longer than the decomposed one, so make the allocation of the output
 	 * string based on that assumption.
 	 */
-	recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
+	recomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t));
 	if (!recomp_chars)
 	{
 		FREE(decomp_chars);
@@ -501,9 +501,9 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
 
 	for (count = 1; count < decomp_size; count++)
 	{
-		pg_wchar	ch = decomp_chars[count];
+		char32_t	ch = decomp_chars[count];
 		int			ch_class = get_canonical_class(ch);
-		pg_wchar	composite;
+		char32_t	composite;
 
 		if (last_class < ch_class &&
 			recompose_code(starter_ch, ch, &composite))
@@ -524,7 +524,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
 			recomp_chars[target_pos++] = ch;
 		}
 	}
-	recomp_chars[target_pos] = (pg_wchar) '\0';
+	recomp_chars[target_pos] = (char32_t) '\0';
 
 	FREE(decomp_chars);
 
@@ -540,7 +540,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
 #ifndef FRONTEND
 
 static const pg_unicode_normprops *
-qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
+qc_hash_lookup(char32_t ch, const pg_unicode_norminfo *norminfo)
 {
 	int			h;
 	uint32		hashkey;
@@ -571,7 +571,7 @@ qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
  * Look up the normalization quick check character property
  */
 static UnicodeNormalizationQC
-qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
+qc_is_allowed(UnicodeNormalizationForm form, char32_t ch)
 {
 	const pg_unicode_normprops *found = NULL;
 
@@ -595,7 +595,7 @@ qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
 }
 
 UnicodeNormalizationQC
-unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
+unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const char32_t *input)
 {
 	uint8		lastCanonicalClass = 0;
 	UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
@@ -610,9 +610,9 @@ unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *
 	if (form == UNICODE_NFD || form == UNICODE_NFKD)
 		return UNICODE_NORM_QC_MAYBE;
 
-	for (const pg_wchar *p = input; *p; p++)
+	for (const char32_t *p = input; *p; p++)
 	{
-		pg_wchar	ch = *p;
+		char32_t	ch = *p;
 		uint8		canonicalClass;
 		UnicodeNormalizationQC check;
 
diff --git a/src/fe_utils/mbprint.c b/src/fe_utils/mbprint.c
index eb3eeee9925..8bfce1d4e07 100644
--- a/src/fe_utils/mbprint.c
+++ b/src/fe_utils/mbprint.c
@@ -49,7 +49,7 @@ pg_get_utf8_id(void)
  *
  * No error checks here, c must point to a long-enough string.
  */
-static pg_wchar
+static char32_t
 utf8_to_unicode(const unsigned char *c)
 {
 	if ((*c & 0x80) == 0)
diff --git a/src/include/c.h b/src/include/c.h
index 9ab5e617995..a2ee108fd16 100644
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -513,6 +513,11 @@ typedef void (*pg_funcptr_t) (void);
 
 #include <stdbool.h>
 
+/*
+ * char32_t
+ *      Unicode code point type, declared by the C11 header <uchar.h>.
+ */
+#include <uchar.h>
 
 /* ----------------------------------------------------------------
  *				Section 3:	standard system types
diff --git a/src/include/common/unicode_case.h b/src/include/common/unicode_case.h
index 41e2c1f4b33..6bcffd349c2 100644
--- a/src/include/common/unicode_case.h
+++ b/src/include/common/unicode_case.h
@@ -14,14 +14,12 @@
 #ifndef UNICODE_CASE_H
 #define UNICODE_CASE_H
 
-#include "mb/pg_wchar.h"
-
 typedef size_t (*WordBoundaryNext) (void *wbstate);
 
-pg_wchar	unicode_lowercase_simple(pg_wchar code);
-pg_wchar	unicode_titlecase_simple(pg_wchar code);
-pg_wchar	unicode_uppercase_simple(pg_wchar code);
-pg_wchar	unicode_casefold_simple(pg_wchar code);
+char32_t	unicode_lowercase_simple(char32_t code);
+char32_t	unicode_titlecase_simple(char32_t code);
+char32_t	unicode_uppercase_simple(char32_t code);
+char32_t	unicode_casefold_simple(char32_t code);
 size_t		unicode_strlower(char *dst, size_t dstsize, const char *src,
 							 ssize_t srclen, bool full);
 size_t		unicode_strtitle(char *dst, size_t dstsize, const char *src,
diff --git a/src/include/common/unicode_case_table.h b/src/include/common/unicode_case_table.h
index d5311786582..0a14fb2d97b 100644
--- a/src/include/common/unicode_case_table.h
+++ b/src/include/common/unicode_case_table.h
@@ -18,7 +18,6 @@
  */
 
 #include "common/unicode_case.h"
-#include "mb/pg_wchar.h"
 
 /*
  * The maximum number of codepoints that can result from case mapping
@@ -45,7 +44,7 @@ typedef enum
 typedef struct
 {
 	int16		conditions;
-	pg_wchar	map[NCaseKind][MAX_CASE_EXPANSION];
+	char32_t	map[NCaseKind][MAX_CASE_EXPANSION];
 } pg_special_case;
 
 /*
@@ -166,7 +165,7 @@ static const pg_special_case special_case[106] =
  * The entry case_map_lower[case_index(codepoint)] is the mapping for the
  * given codepoint.
  */
-static const pg_wchar case_map_lower[1704] =
+static const char32_t case_map_lower[1704] =
 {
 	0x000000,					/* reserved */
 	0x000000,					/* U+000000 */
@@ -1879,7 +1878,7 @@ static const pg_wchar case_map_lower[1704] =
  * The entry case_map_title[case_index(codepoint)] is the mapping for the
  * given codepoint.
  */
-static const pg_wchar case_map_title[1704] =
+static const char32_t case_map_title[1704] =
 {
 	0x000000,					/* reserved */
 	0x000000,					/* U+000000 */
@@ -3592,7 +3591,7 @@ static const pg_wchar case_map_title[1704] =
  * The entry case_map_upper[case_index(codepoint)] is the mapping for the
  * given codepoint.
  */
-static const pg_wchar case_map_upper[1704] =
+static const char32_t case_map_upper[1704] =
 {
 	0x000000,					/* reserved */
 	0x000000,					/* U+000000 */
@@ -5305,7 +5304,7 @@ static const pg_wchar case_map_upper[1704] =
  * The entry case_map_fold[case_index(codepoint)] is the mapping for the
  * given codepoint.
  */
-static const pg_wchar case_map_fold[1704] =
+static const char32_t case_map_fold[1704] =
 {
 	0x000000,					/* reserved */
 	0x000000,					/* U+000000 */
@@ -13522,7 +13521,7 @@ static const uint16 case_map[4778] =
  * the offset into the mapping tables.
  */
 static inline uint16
-case_index(pg_wchar cp)
+case_index(char32_t cp)
 {
 	/* Fast path for codepoints < 0x0588 */
 	if (cp < 0x0588)
diff --git a/src/include/common/unicode_category.h b/src/include/common/unicode_category.h
index 8fd8b67a416..684143d3c8a 100644
--- a/src/include/common/unicode_category.h
+++ b/src/include/common/unicode_category.h
@@ -14,8 +14,6 @@
 #ifndef UNICODE_CATEGORY_H
 #define UNICODE_CATEGORY_H
 
-#include "mb/pg_wchar.h"
-
 /*
  * Unicode General Category Values
  *
@@ -61,31 +59,31 @@ typedef enum pg_unicode_category
 	PG_U_FINAL_PUNCTUATION = 29 /* Pf */
 } pg_unicode_category;
 
-extern pg_unicode_category unicode_category(pg_wchar code);
+extern pg_unicode_category unicode_category(char32_t code);
 extern const char *unicode_category_string(pg_unicode_category category);
 extern const char *unicode_category_abbrev(pg_unicode_category category);
 
-extern bool pg_u_prop_alphabetic(pg_wchar code);
-extern bool pg_u_prop_lowercase(pg_wchar code);
-extern bool pg_u_prop_uppercase(pg_wchar code);
-extern bool pg_u_prop_cased(pg_wchar code);
-extern bool pg_u_prop_case_ignorable(pg_wchar code);
-extern bool pg_u_prop_white_space(pg_wchar code);
-extern bool pg_u_prop_hex_digit(pg_wchar code);
-extern bool pg_u_prop_join_control(pg_wchar code);
+extern bool pg_u_prop_alphabetic(char32_t code);
+extern bool pg_u_prop_lowercase(char32_t code);
+extern bool pg_u_prop_uppercase(char32_t code);
+extern bool pg_u_prop_cased(char32_t code);
+extern bool pg_u_prop_case_ignorable(char32_t code);
+extern bool pg_u_prop_white_space(char32_t code);
+extern bool pg_u_prop_hex_digit(char32_t code);
+extern bool pg_u_prop_join_control(char32_t code);
 
-extern bool pg_u_isdigit(pg_wchar code, bool posix);
-extern bool pg_u_isalpha(pg_wchar code);
-extern bool pg_u_isalnum(pg_wchar code, bool posix);
-extern bool pg_u_isword(pg_wchar code);
-extern bool pg_u_isupper(pg_wchar code);
-extern bool pg_u_islower(pg_wchar code);
-extern bool pg_u_isblank(pg_wchar code);
-extern bool pg_u_iscntrl(pg_wchar code);
-extern bool pg_u_isgraph(pg_wchar code);
-extern bool pg_u_isprint(pg_wchar code);
-extern bool pg_u_ispunct(pg_wchar code, bool posix);
-extern bool pg_u_isspace(pg_wchar code);
-extern bool pg_u_isxdigit(pg_wchar code, bool posix);
+extern bool pg_u_isdigit(char32_t code, bool posix);
+extern bool pg_u_isalpha(char32_t code);
+extern bool pg_u_isalnum(char32_t code, bool posix);
+extern bool pg_u_isword(char32_t code);
+extern bool pg_u_isupper(char32_t code);
+extern bool pg_u_islower(char32_t code);
+extern bool pg_u_isblank(char32_t code);
+extern bool pg_u_iscntrl(char32_t code);
+extern bool pg_u_isgraph(char32_t code);
+extern bool pg_u_isprint(char32_t code);
+extern bool pg_u_ispunct(char32_t code, bool posix);
+extern bool pg_u_isspace(char32_t code);
+extern bool pg_u_isxdigit(char32_t code, bool posix);
 
 #endif							/* UNICODE_CATEGORY_H */
diff --git a/src/include/common/unicode_category_table.h b/src/include/common/unicode_category_table.h
index 95a1c65da7e..466a41b72b0 100644
--- a/src/include/common/unicode_category_table.h
+++ b/src/include/common/unicode_category_table.h
@@ -20,15 +20,15 @@
  */
 typedef struct
 {
-	uint32		first;			/* Unicode codepoint */
-	uint32		last;			/* Unicode codepoint */
+	char32_t	first;			/* Unicode codepoint */
+	char32_t	last;			/* Unicode codepoint */
 	uint8		category;		/* General Category */
 } pg_category_range;
 
 typedef struct
 {
-	uint32		first;			/* Unicode codepoint */
-	uint32		last;			/* Unicode codepoint */
+	char32_t	first;			/* Unicode codepoint */
+	char32_t	last;			/* Unicode codepoint */
 } pg_unicode_range;
 
 typedef struct
diff --git a/src/include/common/unicode_norm.h b/src/include/common/unicode_norm.h
index 5bc3b79e78e..516c192cc4c 100644
--- a/src/include/common/unicode_norm.h
+++ b/src/include/common/unicode_norm.h
@@ -14,8 +14,6 @@
 #ifndef UNICODE_NORM_H
 #define UNICODE_NORM_H
 
-#include "mb/pg_wchar.h"
-
 typedef enum
 {
 	UNICODE_NFC = 0,
@@ -32,8 +30,8 @@ typedef enum
 	UNICODE_NORM_QC_MAYBE = -1,
 } UnicodeNormalizationQC;
 
-extern pg_wchar *unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input);
+extern char32_t *unicode_normalize(UnicodeNormalizationForm form, const char32_t *input);
 
-extern UnicodeNormalizationQC unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input);
+extern UnicodeNormalizationQC unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const char32_t *input);
 
 #endif							/* UNICODE_NORM_H */
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 4b4a9974b75..a41bf47649e 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -532,7 +532,7 @@ typedef uint32 (*utf_local_conversion_func) (uint32 code);
  * Some handy functions for Unicode-specific tests.
  */
 static inline bool
-is_valid_unicode_codepoint(pg_wchar c)
+is_valid_unicode_codepoint(char32_t c)
 {
 	return (c > 0 && c <= 0x10FFFF);
 }
@@ -549,7 +549,7 @@ is_utf16_surrogate_second(pg_wchar c)
 	return (c >= 0xDC00 && c <= 0xDFFF);
 }
 
-static inline pg_wchar
+static inline char32_t
 surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
 {
 	return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
@@ -588,7 +588,7 @@ utf8_to_unicode(const unsigned char *c)
  * unicode_utf8len(c) bytes available.
  */
 static inline unsigned char *
-unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
+unicode_to_utf8(char32_t c, unsigned char *utf8string)
 {
 	if (c <= 0x7F)
 	{
@@ -620,7 +620,7 @@ unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
  * Number of bytes needed to represent the given char in UTF8.
  */
 static inline int
-unicode_utf8len(pg_wchar c)
+unicode_utf8len(char32_t c)
 {
 	if (c <= 0x7F)
 		return 1;
@@ -676,8 +676,8 @@ extern int	pg_valid_server_encoding(const char *name);
 extern bool is_encoding_supported_by_icu(int encoding);
 extern const char *get_encoding_name_for_icu(int encoding);
 
-extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string);
-extern pg_wchar utf8_to_unicode(const unsigned char *c);
+extern unsigned char *unicode_to_utf8(char32_t c, unsigned char *utf8string);
+extern char32_t utf8_to_unicode(const unsigned char *c);
 extern bool pg_utf8_islegal(const unsigned char *source, int length);
 extern int	pg_utf_mblen(const unsigned char *s);
 extern int	pg_mule_mblen(const unsigned char *s);
@@ -739,8 +739,8 @@ extern char *pg_server_to_client(const char *s, int len);
 extern char *pg_any_to_server(const char *s, int len, int encoding);
 extern char *pg_server_to_any(const char *s, int len, int encoding);
 
-extern void pg_unicode_to_server(pg_wchar c, unsigned char *s);
-extern bool pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s);
+extern void pg_unicode_to_server(char32_t c, unsigned char *s);
+extern bool pg_unicode_to_server_noerror(char32_t c, unsigned char *s);
 
 extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc);
 extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc);
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 377a7946585..f2bb9b4bc7b 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -3505,6 +3505,7 @@ cb_cleanup_dir
 cb_options
 cb_tablespace
 cb_tablespace_mapping
+char32_t
 check_agg_arguments_context
 check_function_callback
 check_network_data
-- 
2.43.0

C11: should we use char32_t for unicode code points?

Reply via email to