Hello, attached is a patch that adds "uri" as an encoding option for encode/decode. It implements what RFC 3986 calls "percent-encoding" (https://tools.ietf.org/html/rfc3986#section-2.1).
The background for this patch is that I can easily build URLs in PL/pgSQL, but doing the actual percent-encoding of the URL parts there is painfully slow. The list of encodings currently available for encode/decode looks quite arbitrary to me, so I can't see any reason why this one couldn't be included. In modern web scenarios one would most likely want to encode the UTF-8 representation of a text string for inclusion in a URL, in which case the correct invocation would be ENCODE(CONVERT_TO('some text in database encoding goes here', 'UTF8'), 'uri'); URI percent-encoding can of course also be used for other text encodings and for arbitrary binary data. Regards, Anders
diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c index 7293d66de5..33cf7bb57c 100644 --- a/src/backend/utils/adt/encode.c +++ b/src/backend/utils/adt/encode.c @@ -512,6 +512,131 @@ esc_dec_len(const char *src, unsigned srclen) return len; } +/* + * URI percent encoding + * + * Percent encodes all byte values except the unreserved ASCII characters as per RFC3986. + */ + +static const char upper_hex_digits[] = "0123456789ABCDEF"; + +static unsigned +uri_encode(const char *src, unsigned srclen, char *dst) +{ + char *d = dst; + + for (const char *s = src; s < src + srclen; s++) + { + if ((*s >= 'A' && *s <= 'Z') || + (*s >= 'a' && *s <= 'z') || + (*s >= '0' && *s <= '9') || + *s == '-' || + *s == '_' || + *s == '.' || + *s == '~') + { + *d++ = *s; + } + else + { + *d++ = '%'; + *d++ = upper_hex_digits[(*s >> 4) & 0xF]; + *d++ = upper_hex_digits[*s & 0xF]; + } + } + return d - dst; +} + +static unsigned +uri_decode(const char *src, unsigned srclen, char *dst) +{ + const char *s = src; + const char *srcend = src + srclen; + char *d = dst; + char val; + + while (s < srcend) + { + if (*s == '%') + { + if (s > srcend - 3) { + /* This will never get triggered since uri_dec_len already takes care of validation + */ + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid uri percent encoding"), + errhint("Input data ends prematurely."))); + } + + /* Skip '%' */ + s++; + + val = get_hex(*s++) << 4; + val += get_hex(*s++); + *d++ = val; + } + else + { + *d++ = *s++; + } + } + return d - dst; +} + +static unsigned +uri_enc_len(const char *src, unsigned srclen) +{ + int len = 0; + + for (const char *s = src; s < src + srclen; s++) + { + if ((*s >= 'A' && *s <= 'Z') || + (*s >= 'a' && *s <= 'z') || + (*s >= '0' && *s <= '9') || + *s == '-' || + *s == '_' || + *s == '.' 
|| + *s == '~') + { + len++; + } + else + { + len += 3; + } + } + return len; +} + +static unsigned +uri_dec_len(const char *src, unsigned srclen) +{ + const char *s = src; + const char *srcend = src + srclen; + int len = 0; + + while (s < srcend) + { + if (*s == '%') + { + if (s > srcend - 3) { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid uri percent encoding"), + errhint("Input data ends prematurely."))); + } + s++; + get_hex(*s++); + get_hex(*s++); + } + else { + s++; + } + len++; + } + return len; +} + /* * Common */ @@ -541,6 +666,12 @@ static const struct esc_enc_len, esc_dec_len, esc_encode, esc_decode } }, + { + "uri", + { + uri_enc_len, uri_dec_len, uri_encode, uri_decode + } + }, { NULL, { diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index 2483966576..f89c5ec1c3 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -1870,3 +1870,24 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 Th\000o\x02\x03 (1 row) +SET bytea_output TO hex; +SELECT encode(E'en\\300\\336d'::bytea, 'uri'); + encode +----------- + en%C0%DEd +(1 row) + +SELECT decode('%De%c0%DEd', 'uri'); + decode +------------ + \xdec0de64 +(1 row) + +SELECT decode('error%Ex', 'uri'); +ERROR: invalid hexadecimal digit: "x" +SELECT decode('error%E', 'uri'); +ERROR: invalid uri percent encoding +HINT: Input data ends prematurely. +SELECT decode('error%', 'uri'); +ERROR: invalid uri percent encoding +HINT: Input data ends prematurely. 
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index b5e75c344f..1d03836b6e 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -641,3 +641,10 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea); SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape'); SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape'); SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape'); + +SET bytea_output TO hex; +SELECT encode(E'en\\300\\336d'::bytea, 'uri'); +SELECT decode('%De%c0%DEd', 'uri'); +SELECT decode('error%Ex', 'uri'); +SELECT decode('error%E', 'uri'); +SELECT decode('error%', 'uri');