On Mon, Oct  7, 2019 at 09:14:38AM +0200, Anders Åstrand wrote:
> Hello
> 
> Attached is a patch for adding uri as an encoding option for
> encode/decode. It uses what's called "percent-encoding" in rfc3986
> (https://tools.ietf.org/html/rfc3986#section-2.1).

Oh, that's a cool idea.  Can you add it to the commit-fest?

        https://commitfest.postgresql.org/25/

---------------------------------------------------------------------------


> 
> The background for this patch is that I could easily build urls in
> plpgsql, but doing the actual encoding of the url parts is painfully
> slow. The list of available encodings for encode/decode looks quite
> arbitrary to me, so I can't see any reason this one couldn't be in
> there.
> 
> In modern web scenarios one would most likely want to encode
> the utf8 representation of a text string for inclusion in a url, in
> which case correct invocation would be ENCODE(CONVERT_TO('some text in
> database encoding goes here', 'UTF8'), 'uri'), but uri
> percent-encoding can of course also be used for other text encodings
> and arbitrary binary data.
> 
> Regards,
> Anders

> diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c
> index 7293d66de5..33cf7bb57c 100644
> --- a/src/backend/utils/adt/encode.c
> +++ b/src/backend/utils/adt/encode.c
> @@ -512,6 +512,131 @@ esc_dec_len(const char *src, unsigned srclen)
>       return len;
>  }
>  
> +/*
> + * URI percent encoding
> + *
> + * Percent encodes all byte values except the unreserved ASCII characters as per RFC3986.
> + */
> +
> +static const char upper_hex_digits[] = "0123456789ABCDEF";
> +
> +static unsigned
> +uri_encode(const char *src, unsigned srclen, char *dst)
> +{
> +     char            *d = dst;
> +
> +     for (const char *s = src; s < src + srclen; s++)
> +     {
> +             if ((*s >= 'A' && *s <= 'Z') ||
> +                     (*s >= 'a' && *s <= 'z') ||
> +                     (*s >= '0' && *s <= '9') ||
> +                     *s == '-' ||
> +                     *s == '_' ||
> +                     *s == '.' ||
> +                     *s == '~')
> +             {
> +                     *d++ = *s;
> +             }
> +             else
> +             {
> +                     *d++ = '%';
> +                     *d++ = upper_hex_digits[(*s >> 4) & 0xF];
> +                     *d++ = upper_hex_digits[*s & 0xF];
> +             }
> +     }
> +     return d - dst;
> +}
> +
> +static unsigned
> +uri_decode(const char *src, unsigned srclen, char *dst)
> +{
> +     const char *s = src;
> +     const char *srcend = src + srclen;
> +     char            *d = dst;
> +     char            val;
> +
> +     while (s < srcend)
> +     {
> +             if (*s == '%')
> +             {
> +                     if (s > srcend - 3) {
> +                             /* This will never get triggered since 
> uri_dec_len already takes care of validation
> +                              */
> +                             ereport(ERROR,
> +                                             
> (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
> +                                              errmsg("invalid uri percent 
> encoding"),
> +                                              errhint("Input data ends 
> prematurely.")));
> +                     }
> +
> +                     /* Skip '%' */
> +                     s++;
> +
> +                     val = get_hex(*s++) << 4;
> +                     val += get_hex(*s++);
> +                     *d++ = val;
> +             }
> +             else
> +             {
> +                     *d++ = *s++;
> +             }
> +     }
> +     return d - dst;
> +}
> +
> +static unsigned
> +uri_enc_len(const char *src, unsigned srclen)
> +{
> +     int                     len = 0;
> +
> +     for (const char *s = src; s < src + srclen; s++)
> +     {
> +             if ((*s >= 'A' && *s <= 'Z') ||
> +                     (*s >= 'a' && *s <= 'z') ||
> +                     (*s >= '0' && *s <= '9') ||
> +                     *s == '-' ||
> +                     *s == '_' ||
> +                     *s == '.' ||
> +                     *s == '~')
> +             {
> +                     len++;
> +             }
> +             else
> +             {
> +                     len += 3;
> +             }
> +     }
> +     return len;
> +}
> +
> +static unsigned
> +uri_dec_len(const char *src, unsigned srclen)
> +{
> +     const char *s = src;
> +     const char *srcend = src + srclen;
> +     int                     len = 0;
> +
> +     while (s < srcend)
> +     {
> +             if (*s == '%')
> +             {
> +                     if (s > srcend - 3) {
> +                             ereport(ERROR,
> +                                             
> (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
> +                                              errmsg("invalid uri percent 
> encoding"),
> +                                              errhint("Input data ends 
> prematurely.")));
> +                     }
> +                     s++;
> +                     get_hex(*s++);
> +                     get_hex(*s++);
> +             }
> +             else {
> +                     s++;
> +             }
> +             len++;
> +     }
> +     return len;
> +}
> +
>  /*
>   * Common
>   */
> @@ -541,6 +666,12 @@ static const struct
>                       esc_enc_len, esc_dec_len, esc_encode, esc_decode
>               }
>       },
> +     {
> +             "uri",
> +             {
> +                     uri_enc_len, uri_dec_len, uri_encode, uri_decode
> +             }
> +     },
>       {
>               NULL,
>               {
> diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
> index 2483966576..f89c5ec1c3 100644
> --- a/src/test/regress/expected/strings.out
> +++ b/src/test/regress/expected/strings.out
> @@ -1870,3 +1870,24 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing 
> E'\\002\\003'::bytea from 5
>   Th\000o\x02\x03
>  (1 row)
>  
> +SET bytea_output TO hex;
> +SELECT encode(E'en\\300\\336d'::bytea, 'uri');
> +  encode   
> +-----------
> + en%C0%DEd
> +(1 row)
> +
> +SELECT decode('%De%c0%DEd', 'uri');
> +   decode   
> +------------
> + \xdec0de64
> +(1 row)
> +
> +SELECT decode('error%Ex', 'uri');
> +ERROR:  invalid hexadecimal digit: "x"
> +SELECT decode('error%E', 'uri');
> +ERROR:  invalid uri percent encoding
> +HINT:  Input data ends prematurely.
> +SELECT decode('error%', 'uri');
> +ERROR:  invalid uri percent encoding
> +HINT:  Input data ends prematurely.
> diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
> index b5e75c344f..1d03836b6e 100644
> --- a/src/test/regress/sql/strings.sql
> +++ b/src/test/regress/sql/strings.sql
> @@ -641,3 +641,10 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
>  SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea 
> from 2),'escape');
>  SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea 
> from 8),'escape');
>  SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea 
> from 5 for 3),'escape');
> +
> +SET bytea_output TO hex;
> +SELECT encode(E'en\\300\\336d'::bytea, 'uri');
> +SELECT decode('%De%c0%DEd', 'uri');
> +SELECT decode('error%Ex', 'uri');
> +SELECT decode('error%E', 'uri');
> +SELECT decode('error%', 'uri');


-- 
  Bruce Momjian  <br...@momjian.us>        http://momjian.us
  EnterpriseDB                             http://enterprisedb.com

+ As you are, so once was I.  As I am, so you will be. +
+                      Ancient Roman grave inscription +


Reply via email to