From 9564aa018c7eae9d66589b2be4303f8aede94f77 Mon Sep 17 00:00:00 2001
From: Arjen Nienhuis <a.g.nienhuis@gmail.com>
Date: Sun, 3 May 2015 22:28:26 +0200
Subject: [PATCH] Have GB18030 handle more than 2-byte Unicode code points

BUG #12845: The GB18030 encoding doesn't support Unicode characters over 0xFFFF

SELECT convert_to(chr(128512), 'GB18030');

expected result:

 convert_to
------------
 \x9439fc36
(1 row)
---
 .../utf8_and_gb18030/utf8_and_gb18030.c            | 279 ++++++++++++++++++++-
 src/backend/utils/mb/wchar.c                       |   2 +-
 src/include/mb/pg_wchar.h                          |   1 +
 src/test/regress/expected/conversion.out           |  16 +-
 src/test/regress/sql/conversion.sql                |   4 +-
 5 files changed, 287 insertions(+), 15 deletions(-)

diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c b/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c
index 4427fea..c645831 100644
--- a/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c
+++ b/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c
@@ -25,6 +25,16 @@ PG_FUNCTION_INFO_V1(utf8_to_gb18030);
 extern Datum gb18030_to_utf8(PG_FUNCTION_ARGS);
 extern Datum utf8_to_gb18030(PG_FUNCTION_ARGS);
 
+static uint32 utf8_to_gb18030_hi(uint32 utf8);
+static uint32 gb18030_to_utf8_hi(uint32 gb);
+static int compare1(const void *p1, const void *p2);
+static int compare2(const void *p1, const void *p2);
+
+/* All Unicode code points above U+FFFF map linearly onto one range of 4-byte GB18030 codes */
+static const uint32 UTF32_FIRST = 0x10000;	/* first code point above U+FFFF */
+static const uint32 GB18030_FIRST = 0x90308130;	/* GB18030 code for UTF32_FIRST */
+static const uint32 GB18030_LAST = 0xe3329a35;	/* last code of the range (U+10FFFF) */
+
 /* ----------
  * conv_proc(
  *		INTEGER,	-- source encoding id
@@ -41,11 +51,84 @@ gb18030_to_utf8(PG_FUNCTION_ARGS)
 	unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
 	unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
 	int			len = PG_GETARG_INT32(4);
+	unsigned int iiso;
+	unsigned int outf;
+	int			l;
+	pg_local_to_utf *p;
 
 	CHECK_ENCODING_CONVERSION_ARGS(PG_GB18030, PG_UTF8);
 
-	LocalToUtf(src, dest, LUmapGB18030, NULL,
-		 sizeof(LUmapGB18030) / sizeof(pg_local_to_utf), 0, PG_GB18030, len);
+	for (; len > 0; len -= l)
+	{
+		/* "break" means invalid input; src still points at the offending byte */
+		if (*src == '\0')
+			break;
+
+		if (!IS_HIGHBIT_SET(*src))
+		{
+			/* ASCII is encoded identically in GB18030 and UTF-8 */
+			*dest++ = *src++;
+			l = 1;
+			continue;
+		}
+
+		l = pg_gb18030_verifier(src, len);
+		if (l < 0)
+			break;
+
+		if (l == 2)
+		{
+			iiso = *src++ << 8;
+			iiso |= *src++;
+		}
+		else if (l == 4)
+		{
+			/* malformed trail bytes would alias other codes in gb_linear() */
+			if (src[1] < 0x30 || src[1] > 0x39 || src[2] < 0x81 ||
+				src[2] > 0xfe || src[3] < 0x30 || src[3] > 0x39)
+				break;
+			/* cast first: shifting a byte >= 0x80 left by 24 overflows int */
+			iiso = ((unsigned int) src[0] << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
+			src += l;
+			if (iiso >= GB18030_FIRST && iiso <= GB18030_LAST)
+			{
+				outf = gb18030_to_utf8_hi(iiso);
+				*dest++ = outf >> 24;
+				*dest++ = (outf & 0x00ff0000) >> 16;
+				*dest++ = (outf & 0x0000ff00) >> 8;
+				*dest++ = outf & 0x000000ff;
+				continue;
+			}
+		}
+		else
+		{
+			elog(ERROR, "unsupported character length %d", l);
+			iiso = 0;			/* keep compiler quiet */
+		}
+
+		p = bsearch(&iiso, LUmapGB18030, sizeof(LUmapGB18030) / sizeof(pg_local_to_utf),
+					sizeof(pg_local_to_utf), compare2);
+		if (p == NULL)
+			report_untranslatable_char(PG_GB18030, PG_UTF8,
+									   (const char *) (src - l), len);
+		else
+		{
+			/* emit only the non-zero bytes of p->utf, most significant first */
+			if (p->utf & 0xff000000)
+				*dest++ = p->utf >> 24;
+			if (p->utf & 0x00ff0000)
+				*dest++ = (p->utf & 0x00ff0000) >> 16;
+			if (p->utf & 0x0000ff00)
+				*dest++ = (p->utf & 0x0000ff00) >> 8;
+			if (p->utf & 0x000000ff)
+				*dest++ = p->utf & 0x000000ff;
+		}
+	}
+
+	if (len > 0)
+		report_invalid_encoding(PG_GB18030, (const char *) src, len);
+
+	*dest = '\0';
 
 	PG_RETURN_VOID();
 }
@@ -56,11 +139,199 @@ utf8_to_gb18030(PG_FUNCTION_ARGS)
 	unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
 	unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
 	int			len = PG_GETARG_INT32(4);
+	uint32		iutf;
+	uint32		code;
+	pg_utf_to_local *p;
+	int			l;
 
 	CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_GB18030);
 
-	UtfToLocal(src, dest, ULmapGB18030, NULL,
-		 sizeof(ULmapGB18030) / sizeof(pg_utf_to_local), 0, PG_GB18030, len);
+	for (; len > 0; len -= l)
+	{
+		/* "break" means invalid input; src still points at the offending byte */
+		if (*src == '\0')
+			break;
+
+		l = pg_utf_mblen(src);
+		if (len < l || !pg_utf8_islegal(src, l))
+			break;
+
+		if (l == 1)
+		{
+			/* ASCII is encoded identically in UTF-8 and GB18030 */
+			*dest++ = *src++;
+			continue;
+		}
+		else if (l == 2)
+		{
+			iutf = *src++ << 8;
+			iutf |= *src++;
+		}
+		else if (l == 3)
+		{
+			iutf = *src++ << 16;
+			iutf |= *src++ << 8;
+			iutf |= *src++;
+		}
+		else if (l == 4)
+		{
+			/* cast first: shifting a byte >= 0x80 left by 24 overflows int */
+			iutf = (uint32) *src++ << 24;
+			iutf |= *src++ << 16;
+			iutf |= *src++ << 8;
+			iutf |= *src++;
+			/* 4 byte codes all map to the linear range */
+			code = utf8_to_gb18030_hi(iutf);
+			*dest++ = code >> 24;
+			*dest++ = (code & 0x00ff0000) >> 16;
+			*dest++ = (code & 0x0000ff00) >> 8;
+			*dest++ = code & 0x000000ff;
+			continue;
+		}
+		else
+		{
+			elog(ERROR, "unsupported character length %d", l);
+			iutf = 0;			/* keep compiler quiet */
+		}
+
+		p = bsearch(&iutf, ULmapGB18030, sizeof(ULmapGB18030) / sizeof(pg_utf_to_local),
+					sizeof(pg_utf_to_local), compare1);
+		if (p == NULL)
+			report_untranslatable_char(PG_UTF8, PG_GB18030,
+									   (const char *) (src - l), len);
+		else
+		{
+			/* GB18030 is always 1, 2 or 4 bytes; the 1-byte case is handled above */
+			code = p->code;
+			if (code & 0xffff0000)
+			{
+				*dest++ = code >> 24;
+				*dest++ = (code & 0x00ff0000) >> 16;
+			}
+			*dest++ = (code & 0x0000ff00) >> 8;
+			*dest++ = code & 0x000000ff;
+		}
+	}
+
+	if (len > 0)
+		report_invalid_encoding(PG_UTF8, (const char *) src, len);
+
+	*dest = '\0';
 
 	PG_RETURN_VOID();
 }
+
+/*
+ * bsearch() comparator for UTF8 -> local code lookups:
+ * p1 points at the uint32 key, p2 at a pg_utf_to_local entry (utf field)
+ */
+static int
+compare1(const void *p1, const void *p2)
+{
+	uint32		key = *(const uint32 *) p1;
+	uint32		entry = ((const pg_utf_to_local *) p2)->utf;
+
+	if (key == entry)
+		return 0;
+	return (key > entry) ? 1 : -1;
+}
+
+/*
+ * bsearch() comparator for local code -> UTF8 lookups:
+ * p1 points at the uint32 key, p2 at a pg_local_to_utf entry (code field)
+ */
+static int
+compare2(const void *p1, const void *p2)
+{
+	uint32		key = *(const uint32 *) p1;
+	uint32		entry = ((const pg_local_to_utf *) p2)->code;
+
+	if (key == entry)
+		return 0;
+	return (key > entry) ? 1 : -1;
+}
+
+/* Encode a code point >= U+10000 as 4 UTF-8 bytes packed into a uint32. */
+static uint32
+utf32_to_utf8_hi(uint32 utf32)
+{
+	uint32		lead = 0xF0 | (utf32 >> 18);
+	uint32		cont1 = 0x80 | ((utf32 >> 12) & 0x3F);
+	uint32		cont2 = 0x80 | ((utf32 >> 6) & 0x3F);
+	uint32		cont3 = 0x80 | (utf32 & 0x3F);
+
+	/* the lead byte ends up in the most significant byte of the result */
+	return (lead << 24) | (cont1 << 16) | (cont2 << 8) | cont3;
+}
+
+/*
+ * Decode a 4-byte UTF-8 sequence, packed with its lead byte in the most
+ * significant byte, into the code point (>= U+10000) it encodes.
+ */
+static uint32
+utf8_to_utf32_hi(uint32 utf8)
+{
+	/* caller must pass a 4-byte sequence, i.e. utf8 > 0xffffff */
+	uint32		cp = (utf8 >> 24) & 0x07;	/* 3 payload bits of the lead byte */
+	cp = (cp << 6) | ((utf8 >> 16) & 0x3F);
+	cp = (cp << 6) | ((utf8 >> 8) & 0x3F);
+	return (cp << 6) | (utf8 & 0x3F);
+}
+
+/* Weighted sum of a code's four bytes; only differences of results are meaningful. */
+static uint32
+gb_linear(uint32 gb)
+{
+	/* weights match the digit ranges used by gb_unlinear(): 10 * 126 * 10, 126 * 10, 10, 1 */
+	return ((gb >> 24) & 0xff) * 12600 + ((gb >> 16) & 0xff) * 1260 +
+		((gb >> 8) & 0xff) * 10 + (gb & 0xff);
+}
+
+/* Inverse of gb_linear(): rebuild the 4-byte code for a linear position. */
+static uint32
+gb_unlinear(uint32 lin)
+{
+	uint32		off = lin - gb_linear(0x81308130);	/* offset from code 0x81308130 */
+
+	return ((0x81 + off / 12600) << 24) | ((0x30 + (off / 1260) % 10) << 16) |
+		((0x81 + (off / 10) % 126) << 8) | (0x30 + off % 10);
+}
+
+/*
+ * Map a 4-byte GB18030 code at or above GB18030_FIRST to its code point:
+ * GB18030_FIRST is UTF32_FIRST and each linear step is one code point.
+ */
+static uint32
+gb_to_utf32_hi(uint32 gb)
+{
+	return gb_linear(gb) - gb_linear(GB18030_FIRST) + UTF32_FIRST;
+}
+
+/* Map a code point >= U+10000 to its 4-byte GB18030 code. */
+static uint32
+utf32_to_gb18030_hi(uint32 utf32)
+{
+	uint32		offset = utf32 - UTF32_FIRST;	/* positions past GB18030_FIRST */
+
+	return gb_unlinear(gb_linear(GB18030_FIRST) + offset);
+}
+
+/*
+ * Convert a 4-byte UTF-8 sequence (packed into a uint32) to the GB18030
+ * code of the same code point; only valid for code points >= U+10000.
+ */
+static uint32
+utf8_to_gb18030_hi(uint32 utf8)
+{
+	return utf32_to_gb18030_hi(utf8_to_utf32_hi(utf8));
+}
+
+/*
+ * Convert GB18030 to UTF-8
+ * Works only for >= U+10000
+ */
+static uint32
+gb18030_to_utf8_hi(uint32 gb)
+{
+	return utf32_to_utf8_hi(gb_to_utf32_hi(gb));	/* via the UTF-32 code point */
+}
diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c
index 0cc753e..f19a19c 100644
--- a/src/backend/utils/mb/wchar.c
+++ b/src/backend/utils/mb/wchar.c
@@ -1400,7 +1400,7 @@ pg_uhc_verifier(const unsigned char *s, int len)
 	return mbl;
 }
 
-static int
+int
 pg_gb18030_verifier(const unsigned char *s, int len)
 {
 	int			l,
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index f7222fc..ce757b9 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -531,6 +531,7 @@ extern void mic2latin_with_table(const unsigned char *mic, unsigned char *p,
 					 int len, int lc, int encoding,
 					 const unsigned char *tab);
 
+extern int pg_gb18030_verifier(const unsigned char *s, int len);
 extern bool pg_utf8_islegal(const unsigned char *source, int length);
 
 #ifdef WIN32
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
index 82eca26..13f1cf3 100644
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -523,17 +523,17 @@ SELECT CONVERT('foo', 'UTF8', 'EUC_TW');
 (1 row)
 
 -- GB18030 --> UTF8
-SELECT CONVERT('foo', 'GB18030', 'UTF8');
- convert 
----------
- foo
+SELECT CONVERT('Postgres \247\343\247\335\247\340\247\337 \2249\3138 \317\363 \250\246le\2010\2747phant', 'GB18030', 'UTF8');
+                                             convert                                             
+-------------------------------------------------------------------------------------------------
+ Postgres \321\201\320\273\320\276\320\275 \360\237\220\230 \350\261\241 \303\251le\314\201phant
 (1 row)
 
 -- UTF8 --> GB18030
-SELECT CONVERT('foo', 'UTF8', 'GB18030');
- convert 
----------
- foo
+SELECT CONVERT('Postgres \321\201\320\273\320\276\320\275 \360\237\220\230 \350\261\241 \303\251le\314\201phant', 'UTF-8', 'GB18030');
+                                         convert                                         
+-----------------------------------------------------------------------------------------
+ Postgres \247\343\247\335\247\340\247\337 \2249\3138 \317\363 \250\246le\2010\2747phant
 (1 row)
 
 -- GBK --> UTF8
diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
index be194ee..e27f06f 100644
--- a/src/test/regress/sql/conversion.sql
+++ b/src/test/regress/sql/conversion.sql
@@ -171,9 +171,9 @@ SELECT CONVERT('foo', 'EUC_TW', 'UTF8');
 -- UTF8 --> EUC_TW
 SELECT CONVERT('foo', 'UTF8', 'EUC_TW');
 -- GB18030 --> UTF8
-SELECT CONVERT('foo', 'GB18030', 'UTF8');
+SELECT CONVERT('Postgres \247\343\247\335\247\340\247\337 \2249\3138 \317\363 \250\246le\2010\2747phant', 'GB18030', 'UTF8');
 -- UTF8 --> GB18030
-SELECT CONVERT('foo', 'UTF8', 'GB18030');
+SELECT CONVERT('Postgres \321\201\320\273\320\276\320\275 \360\237\220\230 \350\261\241 \303\251le\314\201phant', 'UTF-8', 'GB18030');
 -- GBK --> UTF8
 SELECT CONVERT('foo', 'GBK', 'UTF8');
 -- UTF8 --> GBK
-- 
2.1.0