On 2019-Oct-07, Anders Åstrand wrote:

> Attached is a patch for adding uri as an encoding option for
> encode/decode. It uses what's called "percent-encoding" in rfc3986
> (https://tools.ietf.org/html/rfc3986#section-2.1).

Thanks. Seems useful. I made a few cosmetic tweaks and it looks almost ready to me; however, documentation is missing. I added a stub; can you please complete that?

To answer Arthur Zakirov's question: yes, the standard recommends ("should") using uppercase characters:

: For consistency, URI producers and
: normalizers should use uppercase hexadecimal digits for all percent-
: encodings.

Thanks,

--
Álvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

>From 44475f709762ba1a2a881d20345cc6a4cb086f01 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera <alvhe...@alvh.no-ip.org> Date: Thu, 20 Feb 2020 18:46:15 -0300 Subject: [PATCH v2] URI encode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: Anders Åstrand Discussion: https://postgr.es/m/APwPebtwJnjjt=euusml1zz6w3jvna1cvjezhbouccytjc9...@mail.gmail.com --- doc/src/sgml/func.sgml | 16 +++- src/backend/utils/adt/encode.c | 129 ++++++++++++++++++++++++++ src/test/regress/expected/strings.out | 21 +++++ src/test/regress/sql/strings.sql | 7 ++ 4 files changed, 172 insertions(+), 1 deletion(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index ceda48e0fc..c60ad4f4e2 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -3180,7 +3180,8 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three'); <parameter>format</parameter> values are: <link linkend="encode-format-base64"><literal>base64</literal></link>, <link linkend="encode-format-escape"><literal>escape</literal></link>, - <link linkend="encode-format-hex"><literal>hex</literal></link> + <link linkend="encode-format-hex"><literal>hex</literal></link>, + <link linkend="encode-format-uri"><literal>uri</literal></link> </entry> <entry><literal>encode('123\000\001', 'base64')</literal></entry> <entry><literal>MTIzAAE=</literal></entry> @@ -3274,6 +3275,19 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three'); </para> </listitem> </varlistentry> + + <varlistentry id="encode-format-uri"> + <term>uri + <indexterm> + <primary>uri format</primary> + </indexterm></term> + <listitem> + <para> + The <literal>uri</literal> format represents ... 
+ </para> + </listitem> + </varlistentry> + </variablelist> </para> diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c index b8d9ec7e00..81d4ea8400 100644 --- a/src/backend/utils/adt/encode.c +++ b/src/backend/utils/adt/encode.c @@ -110,6 +110,7 @@ binary_decode(PG_FUNCTION_ARGS) */ static const char hextbl[] = "0123456789abcdef"; +static const char hextbl_upper[] = "0123456789ABCDEF"; static const int8 hexlookup[128] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, @@ -512,6 +513,128 @@ esc_dec_len(const char *src, unsigned srclen) return len; } +/* + * URI percent encoding + * + * Percent encodes all byte values except the unreserved ASCII characters as + * per RFC3986. + */ + +static unsigned +uri_encode(const char *src, unsigned srclen, char *dst) +{ + char *d = dst; + + for (const char *s = src; s < src + srclen; s++) + { + /* + * RFC3986: + * + * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" + */ + if ((*s >= 'A' && *s <= 'Z') || + (*s >= 'a' && *s <= 'z') || + (*s >= '0' && *s <= '9') || + *s == '-' || + *s == '.' || + *s == '_' || + *s == '~') + { + *d++ = *s; + } + else + { + *d++ = '%'; + *d++ = hextbl_upper[(*s >> 4) & 0xF]; + *d++ = hextbl_upper[*s & 0xF]; + } + } + return d - dst; +} + +static unsigned +uri_decode(const char *src, unsigned srclen, char *dst) +{ + const char *s = src; + const char *srcend = src + srclen; + char *d = dst; + char val; + + while (s < srcend) + { + if (*s == '%') + { + /* + * Verify we have the needed bytes. This doesn't happen, since + * uri_dec_len already takes care of validation. 
+ */ + if (s > srcend - 3) + elog(ERROR, "invalid uri percent encoding"); + + /* Skip '%' */ + s++; + + val = get_hex(*s++) << 4; + val += get_hex(*s++); + *d++ = val; + } + else + *d++ = *s++; + } + return d - dst; +} + +static unsigned +uri_enc_len(const char *src, unsigned srclen) +{ + int len = 0; + + for (const char *s = src; s < src + srclen; s++) + { + if ((*s >= 'A' && *s <= 'Z') || + (*s >= 'a' && *s <= 'z') || + (*s >= '0' && *s <= '9') || + *s == '-' || + *s == '_' || + *s == '.' || + *s == '~') + { + len++; + } + else + len += 3; + } + return len; +} + +static unsigned +uri_dec_len(const char *src, unsigned srclen) +{ + const char *s = src; + const char *srcend = src + srclen; + int len = 0; + + while (s < srcend) + { + if (*s == '%') + { + if (s > srcend - 3) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid uri percent encoding"), + errhint("Input data ends prematurely."))); + s++; + get_hex(*s++); + get_hex(*s++); + } + else + s++; + len++; + } + + return len; +} + /* * Common */ @@ -541,6 +664,12 @@ static const struct esc_enc_len, esc_dec_len, esc_encode, esc_decode } }, + { + "uri", + { + uri_enc_len, uri_dec_len, uri_encode, uri_decode + } + }, { NULL, { diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index 60cb86193c..a79ef6ac10 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -1892,3 +1892,24 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 Th\000o\x02\x03 (1 row) +SET bytea_output TO hex; +SELECT encode(E'en\\300\\336d'::bytea, 'uri'); + encode +----------- + en%C0%DEd +(1 row) + +SELECT decode('%De%c0%DEd', 'uri'); + decode +------------ + \xdec0de64 +(1 row) + +SELECT decode('error%Ex', 'uri'); +ERROR: invalid hexadecimal digit: "x" +SELECT decode('error%E', 'uri'); +ERROR: invalid uri percent encoding +HINT: Input data ends prematurely. 
+SELECT decode('error%', 'uri'); +ERROR: invalid uri percent encoding +HINT: Input data ends prematurely. diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index c5cd15142a..8a7b103681 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -648,3 +648,10 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea); SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape'); SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape'); SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape'); + +SET bytea_output TO hex; +SELECT encode(E'en\\300\\336d'::bytea, 'uri'); +SELECT decode('%De%c0%DEd', 'uri'); +SELECT decode('error%Ex', 'uri'); +SELECT decode('error%E', 'uri'); +SELECT decode('error%', 'uri'); -- 2.20.1