Hello

Attached is a patch that adds "uri" as an encoding option for
encode()/decode(). It implements what RFC 3986 calls "percent-encoding"
(https://tools.ietf.org/html/rfc3986#section-2.1).

The background for this patch is that I can easily build URLs in
PL/pgSQL, but doing the actual encoding of the URL parts there is
painfully slow. The list of encodings currently available for
encode()/decode() looks quite arbitrary to me, so I can't see any
reason why this one couldn't be added to it.

In modern web scenarios one would most likely want to encode the
UTF-8 representation of a text string for inclusion in a URL, in
which case the correct invocation would be ENCODE(CONVERT_TO('some text in
database encoding goes here', 'UTF8'), 'uri'). URI percent-encoding
can of course also be used for other text encodings and for arbitrary
binary data.

Regards,
Anders
diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c
index 7293d66de5..33cf7bb57c 100644
--- a/src/backend/utils/adt/encode.c
+++ b/src/backend/utils/adt/encode.c
@@ -512,6 +512,131 @@ esc_dec_len(const char *src, unsigned srclen)
 	return len;
 }
 
+/*
+ * URI percent encoding
+ *
+ * Percent encodes all byte values except the unreserved ASCII characters as per RFC3986.
+ */
+
+static const char upper_hex_digits[] = "0123456789ABCDEF";
+
+/*
+ * Percent-encode srclen bytes of src into dst; returns the bytes written.  Only
+ * RFC 3986 unreserved bytes (ASCII alphanumerics, '-', '_', '.', '~') pass as-is.
+ */
+static unsigned
+uri_encode(const char *src, unsigned srclen, char *dst)
+{
+	char	   *d = dst;
+
+	for (const char *s = src; s < src + srclen; s++)
+	{
+		unsigned char c = (unsigned char) *s;	/* keeps ">> 4" well-defined */
+
+		if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') ||
+			(c >= '0' && c <= '9') ||
+			c == '-' || c == '_' || c == '.' || c == '~')
+			*d++ = (char) c;
+		else
+		{
+			*d++ = '%';
+			*d++ = upper_hex_digits[c >> 4];
+			*d++ = upper_hex_digits[c & 0xF];
+		}
+	}
+	return (unsigned) (d - dst);
+}
+
+/*
+ * Decode srclen bytes of percent-encoded src into dst; returns bytes written.
+ * "%XY" (two hex digits) produces one byte; any other byte is copied as-is.
+ */
+static unsigned
+uri_decode(const char *src, unsigned srclen, char *dst)
+{
+	const char *s = src;
+	const char *srcend = src + srclen;
+	char	   *d = dst;
+	char		val;
+
+	while (s < srcend)
+	{
+		if (*s != '%')
+			*d++ = *s++;
+		else
+		{
+			/*
+			 * Test remaining length; "srcend - 3" is undefined for srclen < 3.
+			 * uri_dec_len makes the same check, so this is only a safety net.
+			 */
+			if (srcend - s < 3)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("invalid uri percent encoding"),
+						 errhint("Input data ends prematurely.")));
+			val = get_hex(s[1]) << 4;
+			val += get_hex(s[2]);
+			*d++ = val;
+			s += 3;
+		}
+	}
+	return (unsigned) (d - dst);
+}
+
+/*
+ * Return the number of bytes uri_encode writes for src: one per RFC 3986
+ * unreserved byte (ASCII alphanumerics, '-', '_', '.', '~') and three
+ * ("%XX") for every other byte.
+ */
+static unsigned
+uri_enc_len(const char *src, unsigned srclen)
+{
+	/* unsigned: 3 * srclen may exceed INT_MAX, and signed overflow is undefined */
+	unsigned	len = 0;
+
+	for (const char *s = src; s < src + srclen; s++)
+	{
+		char		c = *s;
+
+		if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') ||
+			(c >= '0' && c <= '9') ||
+			c == '-' || c == '_' || c == '.' || c == '~')
+			len++;
+		else
+			len += 3;
+	}
+	return len;
+}
+
+/*
+ * Validate percent-encoded src and return its decoded length: one byte per
+ * "%XY" triplet and one per any other input byte.  Errors out when fewer than
+ * two bytes follow a '%'.  get_hex is called on both digits only for whatever
+ * checking it performs; its result is discarded here.
+ */
+static unsigned
+uri_dec_len(const char *src, unsigned srclen)
+{
+	const char *srcend = src + srclen;
+	unsigned	len = 0;
+
+	for (const char *s = src; s < srcend; s++, len++)
+	{
+		if (*s != '%')
+			continue;
+		/* test remaining length; "srcend - 3" is undefined when srclen < 3 */
+		if (srcend - s < 3)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("invalid uri percent encoding"),
+					 errhint("Input data ends prematurely.")));
+		get_hex(s[1]);
+		get_hex(s[2]);
+		s += 2;					/* the loop's s++ steps past the second digit */
+	}
+	return len;
+}
+
 /*
  * Common
  */
@@ -541,6 +666,12 @@ static const struct
 			esc_enc_len, esc_dec_len, esc_encode, esc_decode
 		}
 	},
+	{
+		"uri",
+		{
+			uri_enc_len, uri_dec_len, uri_encode, uri_decode
+		}
+	},
 	{
 		NULL,
 		{
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index 2483966576..f89c5ec1c3 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -1870,3 +1870,24 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5
  Th\000o\x02\x03
 (1 row)
 
+SET bytea_output TO hex;
+SELECT encode(E'en\\300\\336d'::bytea, 'uri');
+  encode   
+-----------
+ en%C0%DEd
+(1 row)
+
+SELECT decode('%De%c0%DEd', 'uri');
+   decode   
+------------
+ \xdec0de64
+(1 row)
+
+SELECT decode('error%Ex', 'uri');
+ERROR:  invalid hexadecimal digit: "x"
+SELECT decode('error%E', 'uri');
+ERROR:  invalid uri percent encoding
+HINT:  Input data ends prematurely.
+SELECT decode('error%', 'uri');
+ERROR:  invalid uri percent encoding
+HINT:  Input data ends prematurely.
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index b5e75c344f..1d03836b6e 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -641,3 +641,10 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape');
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape');
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
+
+SET bytea_output TO hex;
+SELECT encode(E'en\\300\\336d'::bytea, 'uri');
+SELECT decode('%De%c0%DEd', 'uri');
+SELECT decode('error%Ex', 'uri');
+SELECT decode('error%E', 'uri');
+SELECT decode('error%', 'uri');

Reply via email to