On 2019-Oct-07, Anders Åstrand wrote:

> Attached is a patch for adding uri as an encoding option for
> encode/decode. It uses what's called "percent-encoding" in rfc3986
> (https://tools.ietf.org/html/rfc3986#section-2.1).

Thanks.  Seems useful.  I made a few cosmetic tweaks and it looks almost
ready to me; however, documentation is missing.  I added a stub; can you
please complete that?

To answer Arthur Zakirov's question: yes, the standard recommends (using
"should") uppercase hexadecimal digits:

:  For consistency, URI producers and
:  normalizers should use uppercase hexadecimal digits for all percent-
:  encodings.

Thanks,

-- 
Álvaro Herrera                https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
From 44475f709762ba1a2a881d20345cc6a4cb086f01 Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <alvhe...@alvh.no-ip.org>
Date: Thu, 20 Feb 2020 18:46:15 -0300
Subject: [PATCH v2] URI encode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Author: Anders Åstrand
Discussion: https://postgr.es/m/APwPebtwJnjjt=euusml1zz6w3jvna1cvjezhbouccytjc9...@mail.gmail.com
---
 doc/src/sgml/func.sgml                |  16 +++-
 src/backend/utils/adt/encode.c        | 129 ++++++++++++++++++++++++++
 src/test/regress/expected/strings.out |  21 +++++
 src/test/regress/sql/strings.sql      |   7 ++
 4 files changed, 172 insertions(+), 1 deletion(-)

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index ceda48e0fc..c60ad4f4e2 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -3180,7 +3180,8 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
        <parameter>format</parameter> values are:
        <link linkend="encode-format-base64"><literal>base64</literal></link>,
        <link linkend="encode-format-escape"><literal>escape</literal></link>,
-       <link linkend="encode-format-hex"><literal>hex</literal></link>
+       <link linkend="encode-format-hex"><literal>hex</literal></link>,
+       <link linkend="encode-format-uri"><literal>uri</literal></link>
       </entry>
       <entry><literal>encode('123\000\001', 'base64')</literal></entry>
       <entry><literal>MTIzAAE=</literal></entry>
@@ -3274,6 +3275,19 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
       </para>
      </listitem>
     </varlistentry>
+
+    <varlistentry id="encode-format-uri">
+     <term>uri
+     <indexterm>
+      <primary>uri format</primary>
+     </indexterm></term>
+     <listitem>
+      <para>
+       The <literal>uri</literal> format represents ...
+      </para>
+     </listitem>
+    </varlistentry>
+
    </variablelist>
   </para>
 
diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c
index b8d9ec7e00..81d4ea8400 100644
--- a/src/backend/utils/adt/encode.c
+++ b/src/backend/utils/adt/encode.c
@@ -110,6 +110,7 @@ binary_decode(PG_FUNCTION_ARGS)
  */
 
 static const char hextbl[] = "0123456789abcdef";
+static const char hextbl_upper[] = "0123456789ABCDEF";	/* uppercase hex digits, indexed by nibble value; used by uri_encode */
 
 static const int8 hexlookup[128] = {
 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
@@ -512,6 +513,128 @@ esc_dec_len(const char *src, unsigned srclen)
 	return len;
 }
 
+/*
+ * URI percent encoding
+ *
+ * Percent encodes all byte values except the unreserved ASCII characters as
+ * per RFC3986.
+ */
+
+static unsigned					/* returns the number of bytes written to dst */
+uri_encode(const char *src, unsigned srclen, char *dst)
+{
+	char	   *d = dst;		/* write cursor; dst capacity is not checked here */
+
+	for (const char *s = src; s < src + srclen; s++)
+	{
+		/*
+		 * RFC 3986 "unreserved" bytes are copied through unchanged:
+		 *
+		 * unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
+		 */
+		if ((*s >= 'A' && *s <= 'Z') ||
+			(*s >= 'a' && *s <= 'z') ||
+			(*s >= '0' && *s <= '9') ||
+			*s == '-' ||
+			*s == '.' ||
+			*s == '_' ||
+			*s == '~')
+		{
+			*d++ = *s;
+		}
+		else
+		{
+			*d++ = '%';		/* every other byte becomes '%' plus two hex digits */
+			*d++ = hextbl_upper[(*s >> 4) & 0xF];	/* high nibble; the mask keeps the index in 0..15 */
+			*d++ = hextbl_upper[*s & 0xF];	/* low nibble */
+		}
+	}
+	return d - dst;
+}
+
+static unsigned					/* returns the number of bytes written to dst */
+uri_decode(const char *src, unsigned srclen, char *dst)
+{
+	const char *s = src;		/* read cursor */
+	const char *srcend = src + srclen;	/* one past the last input byte */
+	char	   *d = dst;		/* write cursor; dst capacity is not checked here */
+	char		val;			/* byte assembled from the two hex digits */
+
+	while (s < srcend)
+	{
+		if (*s == '%')
+		{
+			/*
+			 * Need two more bytes after '%'.  Presumably unreachable, because
+			 * uri_dec_len makes the same check first -- confirm at the caller.
+			 */
+			if (s > srcend - 3)	/* NOTE(review): srcend - 3 points before src when srclen < 3 */
+				elog(ERROR, "invalid uri percent encoding");
+
+			/* Skip '%' */
+			s++;
+
+			val = get_hex(*s++) << 4;	/* first digit supplies the high nibble */
+			val += get_hex(*s++);		/* second digit supplies the low nibble */
+			*d++ = val;
+		}
+		else
+			*d++ = *s++;		/* any byte other than '%' is copied unchanged */
+	}
+	return d - dst;
+}
+
+static unsigned					/* returns the size in bytes of the percent-encoded form of src */
+uri_enc_len(const char *src, unsigned srclen)
+{
+	int			len = 0;		/* NOTE(review): int accumulator, but the return type is unsigned */
+
+	for (const char *s = src; s < src + srclen; s++)
+	{
+		if ((*s >= 'A' && *s <= 'Z') ||
+			(*s >= 'a' && *s <= 'z') ||
+			(*s >= '0' && *s <= '9') ||
+			*s == '-' ||
+			*s == '_' ||
+			*s == '.' ||
+			*s == '~')
+		{
+			len++;				/* unreserved byte: uri_encode copies it as one byte */
+		}
+		else
+			len += 3;			/* '%' plus two hex digits */
+	}
+	return len;
+}
+
+static unsigned					/* returns the decoded size in bytes; raises ERROR on a truncated escape */
+uri_dec_len(const char *src, unsigned srclen)
+{
+	const char *s = src;		/* read cursor */
+	const char *srcend = src + srclen;	/* one past the last input byte */
+	int			len = 0;		/* NOTE(review): int accumulator, but the return type is unsigned */
+
+	while (s < srcend)
+	{
+		if (*s == '%')
+		{
+			if (s > srcend - 3)	/* fewer than two bytes follow '%'; NOTE(review): srcend - 3 points before src when srclen < 3 */
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("invalid uri percent encoding"),
+						 errhint("Input data ends prematurely.")));
+			s++;				/* skip '%' */
+			get_hex(*s++);		/* result discarded; presumably called so get_hex rejects a non-hex digit -- confirm */
+			get_hex(*s++);		/* same for the second digit */
+		}
+		else
+			s++;
+		len++;					/* an escape and a plain byte each count as one output byte */
+	}
+
+	return len;
+}
+
 /*
  * Common
  */
@@ -541,6 +664,12 @@ static const struct
 			esc_enc_len, esc_dec_len, esc_encode, esc_decode
 		}
 	},
+	{
+		"uri",
+		{
+			uri_enc_len, uri_dec_len, uri_encode, uri_decode
+		}
+	},
 	{
 		NULL,
 		{
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index 60cb86193c..a79ef6ac10 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -1892,3 +1892,24 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5
  Th\000o\x02\x03
 (1 row)
 
+SET bytea_output TO hex;
+SELECT encode(E'en\\300\\336d'::bytea, 'uri');
+  encode   
+-----------
+ en%C0%DEd
+(1 row)
+
+SELECT decode('%De%c0%DEd', 'uri');
+   decode   
+------------
+ \xdec0de64
+(1 row)
+
+SELECT decode('error%Ex', 'uri');
+ERROR:  invalid hexadecimal digit: "x"
+SELECT decode('error%E', 'uri');
+ERROR:  invalid uri percent encoding
+HINT:  Input data ends prematurely.
+SELECT decode('error%', 'uri');
+ERROR:  invalid uri percent encoding
+HINT:  Input data ends prematurely.
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index c5cd15142a..8a7b103681 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -648,3 +648,10 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape');
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape');
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
+
+SET bytea_output TO hex;
+SELECT encode(E'en\\300\\336d'::bytea, 'uri');
+SELECT decode('%De%c0%DEd', 'uri');
+SELECT decode('error%Ex', 'uri');
+SELECT decode('error%E', 'uri');
+SELECT decode('error%', 'uri');
-- 
2.20.1

Reply via email to