On Mon, Oct 7, 2019 at 09:14:38AM +0200, Anders Åstrand wrote:
> Hello
>
> Attached is a patch for adding uri as an encoding option for
> encode/decode. It uses what's called "percent-encoding" in rfc3986
> (https://tools.ietf.org/html/rfc3986#section-2.1).
Oh, that's a cool idea. Can you add it to the commit-fest?

https://commitfest.postgresql.org/25/

---------------------------------------------------------------------------

>
> The background for this patch is that I could easily build urls in
> plpgsql, but doing the actual encoding of the url parts is painfully
> slow. The list of available encodings for encode/decode looks quite
> arbitrary to me, so I can't see any reason this one couldn't be in
> there.
>
> In modern web scenarios one would probably most likely want to encode
> the utf8 representation of a text string for inclusion in a url, in
> which case correct invocation would be ENCODE(CONVERT_TO('some text in
> database encoding goes here', 'UTF8'), 'uri'), but uri
> percent-encoding can of course also be used for other text encodings
> and arbitrary binary data.
>
> Regards,
> Anders
> diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c
> index 7293d66de5..33cf7bb57c 100644
> --- a/src/backend/utils/adt/encode.c
> +++ b/src/backend/utils/adt/encode.c
> @@ -512,6 +512,131 @@ esc_dec_len(const char *src, unsigned srclen)
> return len;
> }
>
> +/*
> + * URI percent encoding
> + *
> + * Percent encodes all byte values except the unreserved ASCII characters as
> per RFC3986.
> + */
> +
> +static const char upper_hex_digits[] = "0123456789ABCDEF";
> +
> +static unsigned
> +uri_encode(const char *src, unsigned srclen, char *dst)
> +{
> + char *d = dst;
> +
> + for (const char *s = src; s < src + srclen; s++)
> + {
> + if ((*s >= 'A' && *s <= 'Z') ||
> + (*s >= 'a' && *s <= 'z') ||
> + (*s >= '0' && *s <= '9') ||
> + *s == '-' ||
> + *s == '_' ||
> + *s == '.'
|| > + *s == '~') > + { > + *d++ = *s; > + } > + else > + { > + *d++ = '%'; > + *d++ = upper_hex_digits[(*s >> 4) & 0xF]; > + *d++ = upper_hex_digits[*s & 0xF]; > + } > + } > + return d - dst; > +} > + > +static unsigned > +uri_decode(const char *src, unsigned srclen, char *dst) > +{ > + const char *s = src; > + const char *srcend = src + srclen; > + char *d = dst; > + char val; > + > + while (s < srcend) > + { > + if (*s == '%') > + { > + if (s > srcend - 3) { > + /* This will never get triggered since > uri_dec_len already takes care of validation > + */ > + ereport(ERROR, > + > (errcode(ERRCODE_INVALID_PARAMETER_VALUE), > + errmsg("invalid uri percent > encoding"), > + errhint("Input data ends > prematurely."))); > + } > + > + /* Skip '%' */ > + s++; > + > + val = get_hex(*s++) << 4; > + val += get_hex(*s++); > + *d++ = val; > + } > + else > + { > + *d++ = *s++; > + } > + } > + return d - dst; > +} > + > +static unsigned > +uri_enc_len(const char *src, unsigned srclen) > +{ > + int len = 0; > + > + for (const char *s = src; s < src + srclen; s++) > + { > + if ((*s >= 'A' && *s <= 'Z') || > + (*s >= 'a' && *s <= 'z') || > + (*s >= '0' && *s <= '9') || > + *s == '-' || > + *s == '_' || > + *s == '.' 
|| > + *s == '~') > + { > + len++; > + } > + else > + { > + len += 3; > + } > + } > + return len; > +} > + > +static unsigned > +uri_dec_len(const char *src, unsigned srclen) > +{ > + const char *s = src; > + const char *srcend = src + srclen; > + int len = 0; > + > + while (s < srcend) > + { > + if (*s == '%') > + { > + if (s > srcend - 3) { > + ereport(ERROR, > + > (errcode(ERRCODE_INVALID_PARAMETER_VALUE), > + errmsg("invalid uri percent > encoding"), > + errhint("Input data ends > prematurely."))); > + } > + s++; > + get_hex(*s++); > + get_hex(*s++); > + } > + else { > + s++; > + } > + len++; > + } > + return len; > +} > + > /* > * Common > */ > @@ -541,6 +666,12 @@ static const struct > esc_enc_len, esc_dec_len, esc_encode, esc_decode > } > }, > + { > + "uri", > + { > + uri_enc_len, uri_dec_len, uri_encode, uri_decode > + } > + }, > { > NULL, > { > diff --git a/src/test/regress/expected/strings.out > b/src/test/regress/expected/strings.out > index 2483966576..f89c5ec1c3 100644 > --- a/src/test/regress/expected/strings.out > +++ b/src/test/regress/expected/strings.out > @@ -1870,3 +1870,24 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing > E'\\002\\003'::bytea from 5 > Th\000o\x02\x03 > (1 row) > > +SET bytea_output TO hex; > +SELECT encode(E'en\\300\\336d'::bytea, 'uri'); > + encode > +----------- > + en%C0%DEd > +(1 row) > + > +SELECT decode('%De%c0%DEd', 'uri'); > + decode > +------------ > + \xdec0de64 > +(1 row) > + > +SELECT decode('error%Ex', 'uri'); > +ERROR: invalid hexadecimal digit: "x" > +SELECT decode('error%E', 'uri'); > +ERROR: invalid uri percent encoding > +HINT: Input data ends prematurely. > +SELECT decode('error%', 'uri'); > +ERROR: invalid uri percent encoding > +HINT: Input data ends prematurely. 
> diff --git a/src/test/regress/sql/strings.sql > b/src/test/regress/sql/strings.sql > index b5e75c344f..1d03836b6e 100644 > --- a/src/test/regress/sql/strings.sql > +++ b/src/test/regress/sql/strings.sql > @@ -641,3 +641,10 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea); > SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea > from 2),'escape'); > SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea > from 8),'escape'); > SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea > from 5 for 3),'escape'); > + > +SET bytea_output TO hex; > +SELECT encode(E'en\\300\\336d'::bytea, 'uri'); > +SELECT decode('%De%c0%DEd', 'uri'); > +SELECT decode('error%Ex', 'uri'); > +SELECT decode('error%E', 'uri'); > +SELECT decode('error%', 'uri'); -- Bruce Momjian <br...@momjian.us> http://momjian.us EnterpriseDB http://enterprisedb.com + As you are, so once was I. As I am, so you will be. + + Ancient Roman grave inscription +