Re: The "char" type versus non-ASCII characters

Tom Lane Sun, 31 Jul 2022 15:26:25 -0700

I wrote:
> This came up again today [1], so here's a concrete proposal.
> Let's use \ooo for high-bit-set chars, but keep backslash as just
> backslash (so it's only semi-compatible with bytea).


Hearing no howls of protest, here's a fleshed-out, potentially committable
version.  I added some regression test coverage for the modified code.
(I also fixed an astonishingly obsolete comment about what the regular
char type does.)  I looked at the SGML docs too, but I don't think there
is anything to change there.  The docs say "single-byte internal type"
and are silent about "char" beyond that.  I think that's exactly where
we want to be: any more detail would encourage people to use the type,
which we don't really want.  Possibly we could change the text to
"single-byte internal type, meant to hold ASCII characters" but I'm
not sure that's better.

The next question is what to do with this.  I propose to commit it into
HEAD and v15 before next week's beta3 release.  If we don't get a lot
of pushback, we could consider back-patching further for the November
releases; but I'm hesitant to shove something like this into stable
branches with only a week's notice.

                        regards, tom lane

diff --git a/src/backend/utils/adt/char.c b/src/backend/utils/adt/char.c
index 0df41c2253..e50293bf14 100644
--- a/src/backend/utils/adt/char.c
+++ b/src/backend/utils/adt/char.c
@@ -20,6 +20,11 @@
 #include "libpq/pqformat.h"
 #include "utils/builtins.h"
 
+#define ISOCTAL(c)   (((c) >= '0') && ((c) <= '7'))
+#define TOOCTAL(c)   ((c) + '0')
+#define FROMOCTAL(c) ((unsigned char) (c) - '0')
+
+
 /*****************************************************************************
  *	 USER I/O ROUTINES														 *
  *****************************************************************************/
@@ -27,31 +32,53 @@
 /*
  *		charin			- converts "x" to 'x'
  *
- * Note that an empty input string will implicitly be converted to \0.
+ * This accepts the formats charout produces.  If we have multibyte input
+ * that is not in the form '\ooo', then we take its first byte as the value
+ * and silently discard the rest; this is a backwards-compatibility provision.
  */
 Datum
 charin(PG_FUNCTION_ARGS)
 {
 	char	   *ch = PG_GETARG_CSTRING(0);
 
+	if (strlen(ch) == 4 && ch[0] == '\\' &&
+		ISOCTAL(ch[1]) && ISOCTAL(ch[2]) && ISOCTAL(ch[3]))
+		PG_RETURN_CHAR((FROMOCTAL(ch[1]) << 6) +
+					   (FROMOCTAL(ch[2]) << 3) +
+					   FROMOCTAL(ch[3]));
+	/* This will do the right thing for a zero-length input string */
 	PG_RETURN_CHAR(ch[0]);
 }
 
 /*
  *		charout			- converts 'x' to "x"
  *
- * Note that if the char value is \0, the resulting string will appear
- * to be empty (null-terminated after zero characters).  So this is the
- * inverse of the charin() function for such data.
+ * The possible output formats are:
+ * 1. 0x00 is represented as an empty string.
+ * 2. 0x01..0x7F are represented as a single ASCII byte.
+ * 3. 0x80..0xFF are represented as \ooo (backslash and 3 octal digits).
+ * Case 3 is meant to match the traditional "escape" format of bytea.
  */
 Datum
 charout(PG_FUNCTION_ARGS)
 {
 	char		ch = PG_GETARG_CHAR(0);
-	char	   *result = (char *) palloc(2);
+	char	   *result = (char *) palloc(5);
 
-	result[0] = ch;
-	result[1] = '\0';
+	if (IS_HIGHBIT_SET(ch))
+	{
+		result[0] = '\\';
+		result[1] = TOOCTAL(((unsigned char) ch) >> 6);
+		result[2] = TOOCTAL((((unsigned char) ch) >> 3) & 07);
+		result[3] = TOOCTAL(((unsigned char) ch) & 07);
+		result[4] = '\0';
+	}
+	else
+	{
+		/* This produces acceptable results for 0x00 as well */
+		result[0] = ch;
+		result[1] = '\0';
+	}
 	PG_RETURN_CSTRING(result);
 }
 
@@ -176,15 +203,20 @@ Datum
 text_char(PG_FUNCTION_ARGS)
 {
 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
+	char	   *ch = VARDATA_ANY(arg1);
 	char		result;
 
 	/*
-	 * An empty input string is converted to \0 (for consistency with charin).
-	 * If the input is longer than one character, the excess data is silently
-	 * discarded.
+	 * Conversion rules are the same as in charin(), but here we need to
+	 * handle the empty-string case honestly.
 	 */
-	if (VARSIZE_ANY_EXHDR(arg1) > 0)
-		result = *(VARDATA_ANY(arg1));
+	if (VARSIZE_ANY_EXHDR(arg1) == 4 && ch[0] == '\\' &&
+		ISOCTAL(ch[1]) && ISOCTAL(ch[2]) && ISOCTAL(ch[3]))
+		result = (FROMOCTAL(ch[1]) << 6) +
+			(FROMOCTAL(ch[2]) << 3) +
+			FROMOCTAL(ch[3]);
+	else if (VARSIZE_ANY_EXHDR(arg1) > 0)
+		result = ch[0];
 	else
 		result = '\0';
 
@@ -195,13 +227,21 @@ Datum
 char_text(PG_FUNCTION_ARGS)
 {
 	char		arg1 = PG_GETARG_CHAR(0);
-	text	   *result = palloc(VARHDRSZ + 1);
+	text	   *result = palloc(VARHDRSZ + 4);
 
 	/*
-	 * Convert \0 to an empty string, for consistency with charout (and
-	 * because the text stuff doesn't like embedded nulls all that well).
+	 * Conversion rules are the same as in charout(), but here we need to be
+	 * honest about converting 0x00 to an empty string.
 	 */
-	if (arg1 != '\0')
+	if (IS_HIGHBIT_SET(arg1))
+	{
+		SET_VARSIZE(result, VARHDRSZ + 4);
+		(VARDATA(result))[0] = '\\';
+		(VARDATA(result))[1] = TOOCTAL(((unsigned char) arg1) >> 6);
+		(VARDATA(result))[2] = TOOCTAL((((unsigned char) arg1) >> 3) & 07);
+		(VARDATA(result))[3] = TOOCTAL(((unsigned char) arg1) & 07);
+	}
+	else if (arg1 != '\0')
 	{
 		SET_VARSIZE(result, VARHDRSZ + 1);
 		*(VARDATA(result)) = arg1;
diff --git a/src/test/regress/expected/char.out b/src/test/regress/expected/char.out
index 2d78f90f3b..ea9b0b8eeb 100644
--- a/src/test/regress/expected/char.out
+++ b/src/test/regress/expected/char.out
@@ -1,8 +1,8 @@
 --
 -- CHAR
 --
--- fixed-length by value
--- internally passed by value if <= 4 bytes in storage
+-- Per SQL standard, CHAR means character(1), that is a varlena type
+-- with a constraint restricting it to one character (not byte)
 SELECT char 'c' = char 'c' AS true;
  true 
 ------
@@ -119,3 +119,62 @@ SELECT * FROM CHAR_TBL;
  abcd
 (4 rows)
 
+--
+-- Also test "char", which is an ad-hoc one-byte type.  It can only
+-- really store ASCII characters, but we allow high-bit-set characters
+-- to be accessed via bytea-like escapes.
+--
+SELECT 'a'::"char";
+ char 
+------
+ a
+(1 row)
+
+SELECT '\101'::"char";
+ char 
+------
+ A
+(1 row)
+
+SELECT '\377'::"char";
+ char 
+------
+ \377
+(1 row)
+
+SELECT 'a'::"char"::text;
+ text 
+------
+ a
+(1 row)
+
+SELECT '\377'::"char"::text;
+ text 
+------
+ \377
+(1 row)
+
+SELECT '\000'::"char"::text;
+ text 
+------
+ 
+(1 row)
+
+SELECT 'a'::text::"char";
+ char 
+------
+ a
+(1 row)
+
+SELECT '\377'::text::"char";
+ char 
+------
+ \377
+(1 row)
+
+SELECT ''::text::"char";
+ char 
+------
+ 
+(1 row)
+
diff --git a/src/test/regress/expected/char_1.out b/src/test/regress/expected/char_1.out
index fa6644d692..ffd31551de 100644
--- a/src/test/regress/expected/char_1.out
+++ b/src/test/regress/expected/char_1.out
@@ -1,8 +1,8 @@
 --
 -- CHAR
 --
--- fixed-length by value
--- internally passed by value if <= 4 bytes in storage
+-- Per SQL standard, CHAR means character(1), that is a varlena type
+-- with a constraint restricting it to one character (not byte)
 SELECT char 'c' = char 'c' AS true;
  true 
 ------
@@ -119,3 +119,62 @@ SELECT * FROM CHAR_TBL;
  abcd
 (4 rows)
 
+--
+-- Also test "char", which is an ad-hoc one-byte type.  It can only
+-- really store ASCII characters, but we allow high-bit-set characters
+-- to be accessed via bytea-like escapes.
+--
+SELECT 'a'::"char";
+ char 
+------
+ a
+(1 row)
+
+SELECT '\101'::"char";
+ char 
+------
+ A
+(1 row)
+
+SELECT '\377'::"char";
+ char 
+------
+ \377
+(1 row)
+
+SELECT 'a'::"char"::text;
+ text 
+------
+ a
+(1 row)
+
+SELECT '\377'::"char"::text;
+ text 
+------
+ \377
+(1 row)
+
+SELECT '\000'::"char"::text;
+ text 
+------
+ 
+(1 row)
+
+SELECT 'a'::text::"char";
+ char 
+------
+ a
+(1 row)
+
+SELECT '\377'::text::"char";
+ char 
+------
+ \377
+(1 row)
+
+SELECT ''::text::"char";
+ char 
+------
+ 
+(1 row)
+
diff --git a/src/test/regress/expected/char_2.out b/src/test/regress/expected/char_2.out
index 09434a44cd..56818f824b 100644
--- a/src/test/regress/expected/char_2.out
+++ b/src/test/regress/expected/char_2.out
@@ -1,8 +1,8 @@
 --
 -- CHAR
 --
--- fixed-length by value
--- internally passed by value if <= 4 bytes in storage
+-- Per SQL standard, CHAR means character(1), that is a varlena type
+-- with a constraint restricting it to one character (not byte)
 SELECT char 'c' = char 'c' AS true;
  true 
 ------
@@ -119,3 +119,62 @@ SELECT * FROM CHAR_TBL;
  abcd
 (4 rows)
 
+--
+-- Also test "char", which is an ad-hoc one-byte type.  It can only
+-- really store ASCII characters, but we allow high-bit-set characters
+-- to be accessed via bytea-like escapes.
+--
+SELECT 'a'::"char";
+ char 
+------
+ a
+(1 row)
+
+SELECT '\101'::"char";
+ char 
+------
+ A
+(1 row)
+
+SELECT '\377'::"char";
+ char 
+------
+ \377
+(1 row)
+
+SELECT 'a'::"char"::text;
+ text 
+------
+ a
+(1 row)
+
+SELECT '\377'::"char"::text;
+ text 
+------
+ \377
+(1 row)
+
+SELECT '\000'::"char"::text;
+ text 
+------
+ 
+(1 row)
+
+SELECT 'a'::text::"char";
+ char 
+------
+ a
+(1 row)
+
+SELECT '\377'::text::"char";
+ char 
+------
+ \377
+(1 row)
+
+SELECT ''::text::"char";
+ char 
+------
+ 
+(1 row)
+
diff --git a/src/test/regress/sql/char.sql b/src/test/regress/sql/char.sql
index 9c83c45e34..120fed53e5 100644
--- a/src/test/regress/sql/char.sql
+++ b/src/test/regress/sql/char.sql
@@ -2,8 +2,8 @@
 -- CHAR
 --
 
--- fixed-length by value
--- internally passed by value if <= 4 bytes in storage
+-- Per SQL standard, CHAR means character(1), that is a varlena type
+-- with a constraint restricting it to one character (not byte)
 
 SELECT char 'c' = char 'c' AS true;
 
@@ -71,3 +71,19 @@ DROP TABLE CHAR_TBL;
 INSERT INTO CHAR_TBL (f1) VALUES ('abcde');
 
 SELECT * FROM CHAR_TBL;
+
+--
+-- Also test "char", which is an ad-hoc one-byte type.  It can only
+-- really store ASCII characters, but we allow high-bit-set characters
+-- to be accessed via bytea-like escapes.
+--
+
+SELECT 'a'::"char";
+SELECT '\101'::"char";
+SELECT '\377'::"char";
+SELECT 'a'::"char"::text;
+SELECT '\377'::"char"::text;
+SELECT '\000'::"char"::text;
+SELECT 'a'::text::"char";
+SELECT '\377'::text::"char";
+SELECT ''::text::"char";

Re: The "char" type versus non-ASCII characters

Reply via email to