po 22. 6. 2020 v 5:48 odesílatel Pavel Stehule <pavel.steh...@gmail.com> napsal:
> Hi
>
> There is one user request for an unescape function in core.
>
> https://stackoverflow.com/questions/20124393/convert-escaped-unicode-character-back-to-actual-character-in-postgresql/20125412?noredirect=1#comment110502526_20125412
>
> The request is to make what we can currently do only with string
> literals also available through a function interface.
>
> I wrote a plpgsql function, but a built-in function could be simpler:
>
> CREATE OR REPLACE FUNCTION public.unescape(text, text)
>  RETURNS text
>  LANGUAGE plpgsql
> AS $function$
> DECLARE result text;
> BEGIN
>   EXECUTE format('SELECT U&%s UESCAPE %s',
>                  quote_literal(replace($1, '\u','^')),
>                  quote_literal($2)) INTO result;
>   RETURN result;
> END;
> $function$
>
> postgres=# select unescape('Odpov\u011Bdn\u00E1 osoba','^');
>     unescape
> -----------------
>  Odpovědná osoba
> (1 row)
>
> What do you think about this?
>

I changed the name to the more accurate "unicode_unescape". The patch is attached.

Regards

Pavel

> Regards
>
> Pavel
>
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index b7c450ea29..365ea17946 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -3533,6 +3533,24 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
        </para></entry>
       </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>unicode_unescape</primary>
+        </indexterm>
+        <function>unicode_unescape</function> ( <parameter>string</parameter> <type>text</type>
+        <optional>, <parameter>escape_char</parameter> <type>text</type> </optional> )
+        <returnvalue>text</returnvalue>
+       </para>
+       <para>
+        Evaluates Unicode escapes (4 or 6 hexadecimal digits) in <parameter>string</parameter> to the corresponding characters.
+       </para>
+       <para>
+        <literal>unicode_unescape('\0441\043B\043E\043D')</literal>
+        <returnvalue>слон</returnvalue>
+       </para></entry>
+      </row>
+
      </tbody>
     </tgroup>
    </table>
diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c
index be86eb37fe..c7f94298c1 100644
--- a/src/backend/parser/parser.c
+++ b/src/backend/parser/parser.c
@@ -26,7 +26,6 @@
 #include "parser/parser.h"
 #include "parser/scansup.h"
 
-static bool check_uescapechar(unsigned char escape);
 static char *str_udeescape(const char *str, char escape,
 						   int position, core_yyscan_t yyscanner);
 
@@ -278,44 +277,6 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
 	return cur_token;
 }
 
-/* convert hex digit (caller should have verified that) to value */
-static unsigned int
-hexval(unsigned char c)
-{
-	if (c >= '0' && c <= '9')
-		return c - '0';
-	if (c >= 'a' && c <= 'f')
-		return c - 'a' + 0xA;
-	if (c >= 'A' && c <= 'F')
-		return c - 'A' + 0xA;
-	elog(ERROR, "invalid hexadecimal digit");
-	return 0;					/* not reached */
-}
-
-/* is Unicode code point acceptable? */
-static void
-check_unicode_value(pg_wchar c)
-{
-	if (!is_valid_unicode_codepoint(c))
-		ereport(ERROR,
-				(errcode(ERRCODE_SYNTAX_ERROR),
-				 errmsg("invalid Unicode escape value")));
-}
-
-/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
-static bool
-check_uescapechar(unsigned char escape)
-{
-	if (isxdigit(escape)
-		|| escape == '+'
-		|| escape == '\''
-		|| escape == '"'
-		|| scanner_isspace(escape))
-		return false;
-	else
-		return true;
-}
-
 /*
  * Process Unicode escapes in "str", producing a palloc'd plain string
  *
diff --git a/src/backend/parser/scansup.c b/src/backend/parser/scansup.c
index 18169ec4f4..5a39edf450 100644
--- a/src/backend/parser/scansup.c
+++ b/src/backend/parser/scansup.c
@@ -228,3 +228,41 @@ scanner_isspace(char ch)
 		return true;
 	return false;
 }
+
+/* convert hex digit (caller should have verified that) to value */
+unsigned int
+hexval(unsigned char c)
+{
+	if (c >= '0' && c <= '9')
+		return c - '0';
+	if (c >= 'a' && c <= 'f')
+		return c - 'a' + 0xA;
+	if (c >= 'A' && c <= 'F')
+		return c - 'A' + 0xA;
+	elog(ERROR, "invalid hexadecimal digit");
+	return 0;					/* not reached */
+}
+
+/* raise a syntax error unless 'c' is a valid Unicode code point */
+void
+check_unicode_value(pg_wchar c)
+{
+	if (!is_valid_unicode_codepoint(c))
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("invalid Unicode escape value")));
+}
+
+/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
+bool
+check_uescapechar(unsigned char escape)
+{
+	if (isxdigit(escape)
+		|| escape == '+'
+		|| escape == '\''
+		|| escape == '"'
+		|| scanner_isspace(escape))
+		return false;
+	else
+		return true;
+}
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 2eaabd6231..2934a1d9da 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -6139,3 +6139,217 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 
 	PG_RETURN_BOOL(result);
 }
+
+/*
+ * Process Unicode escapes in "instr", appending the result to "str"
+ *
+ * instr/len: input bytes and their count (need not be NUL-terminated)
+ * escape: the escape character to use
+ *
+ * Accepted forms are <escape>XXXX, <escape>+XXXXXX and a doubled escape
+ * character, which stands for itself.  A UTF-16 surrogate pair must be
+ * given as two adjacent escapes; an unpaired surrogate is an error.
+ */
+static void
+udeescape(StringInfo str, const char *instr, size_t len, char escape)
+{
+	pg_wchar	pair_first = 0;
+	char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
+
+	while (len > 0)
+	{
+		if (instr[0] == escape)
+		{
+			if (len >= 2 &&
+				instr[1] == escape)
+			{
+				if (pair_first)
+					goto invalid_pair;
+				appendStringInfoChar(str, escape);
+				instr += 2;
+				len -= 2;
+			}
+			else if (len >= 5 &&
+					 isxdigit((unsigned char) instr[1]) &&
+					 isxdigit((unsigned char) instr[2]) &&
+					 isxdigit((unsigned char) instr[3]) &&
+					 isxdigit((unsigned char) instr[4]))
+			{
+				pg_wchar	unicode;
+
+				unicode = (hexval(instr[1]) << 12) +
+					(hexval(instr[2]) << 8) +
+					(hexval(instr[3]) << 4) +
+					hexval(instr[4]);
+				check_unicode_value(unicode);
+				if (pair_first)
+				{
+					if (is_utf16_surrogate_second(unicode))
+					{
+						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+						pair_first = 0;
+					}
+					else
+						goto invalid_pair;
+				}
+				else if (is_utf16_surrogate_second(unicode))
+					goto invalid_pair;
+
+				if (is_utf16_surrogate_first(unicode))
+					pair_first = unicode;
+				else
+				{
+					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+					appendStringInfoString(str, cbuf);
+				}
+				instr += 5;
+				len -= 5;
+			}
+			else if (len >= 8 &&
+					 instr[1] == '+' &&
+					 isxdigit((unsigned char) instr[2]) &&
+					 isxdigit((unsigned char) instr[3]) &&
+					 isxdigit((unsigned char) instr[4]) &&
+					 isxdigit((unsigned char) instr[5]) &&
+					 isxdigit((unsigned char) instr[6]) &&
+					 isxdigit((unsigned char) instr[7]))
+			{
+				pg_wchar	unicode;
+
+				unicode = (hexval(instr[2]) << 20) +
+					(hexval(instr[3]) << 16) +
+					(hexval(instr[4]) << 12) +
+					(hexval(instr[5]) << 8) +
+					(hexval(instr[6]) << 4) +
+					hexval(instr[7]);
+				check_unicode_value(unicode);
+				if (pair_first)
+				{
+					if (is_utf16_surrogate_second(unicode))
+					{
+						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+						pair_first = 0;
+					}
+					else
+						goto invalid_pair;
+				}
+				else if (is_utf16_surrogate_second(unicode))
+					goto invalid_pair;
+
+				if (is_utf16_surrogate_first(unicode))
+					pair_first = unicode;
+				else
+				{
+					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+					appendStringInfoString(str, cbuf);
+				}
+				instr += 8;
+				len -= 8;
+			}
+			else
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("invalid Unicode escape"),
+						 errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
+		}
+		else
+		{
+			if (pair_first)
+				goto invalid_pair;
+
+			appendStringInfoChar(str, *instr++);
+			len--;
+		}
+	}
+
+	/* unfinished surrogate pair? */
+	if (pair_first)
+		goto invalid_pair;
+
+	return;
+
+invalid_pair:
+	ereport(ERROR,
+			(errcode(ERRCODE_SYNTAX_ERROR),
+			 errmsg("invalid Unicode surrogate pair")));
+}
+
+/*
+ * unicode_unescape(text, text)
+ *
+ * Replace Unicode escapes in the first argument, using the single-byte
+ * escape character given by the second argument.  Not strict: a NULL
+ * input yields NULL, but a NULL escape character is an error.
+ */
+Datum
+unicode_uescape_with_escape_char(PG_FUNCTION_ARGS)
+{
+	StringInfoData str;
+	text	   *input_text;
+	text	   *escchr_text;
+	text	   *result;
+	const char *escchr_ptr;
+
+	/* when input string is NULL, then result is NULL too */
+	if (PG_ARGISNULL(0))
+		PG_RETURN_NULL();
+
+	input_text = PG_GETARG_TEXT_PP(0);
+
+	if (PG_ARGISNULL(1))
+		ereport(ERROR,
+				(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+				 errmsg("null value not allowed for escape char")));
+
+	escchr_text = PG_GETARG_TEXT_PP(1);
+	escchr_ptr = VARDATA_ANY(escchr_text);
+
+	/* the escape string must be exactly one byte, and a legal UESCAPE char */
+	if (VARSIZE_ANY_EXHDR(escchr_text) != 1 || !check_uescapechar(*escchr_ptr))
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("invalid Unicode escape character")));
+
+	initStringInfo(&str);
+
+	udeescape(&str,
+			  VARDATA_ANY(input_text),
+			  VARSIZE_ANY_EXHDR(input_text),
+			  *escchr_ptr);
+
+	result = cstring_to_text_with_len(str.data, str.len);
+	pfree(str.data);
+
+	PG_RETURN_TEXT_P(result);
+}
+
+/*
+ * unicode_unescape(text)
+ *
+ * Same as above, with backslash as the escape character.
+ */
+Datum
+unicode_uescape(PG_FUNCTION_ARGS)
+{
+	StringInfoData str;
+	text	   *input_text;
+	text	   *result;
+
+	/* when input string is NULL, then result is NULL too */
+	if (PG_ARGISNULL(0))
+		PG_RETURN_NULL();
+
+	input_text = PG_GETARG_TEXT_PP(0);
+
+	initStringInfo(&str);
+
+	udeescape(&str,
+			  VARDATA_ANY(input_text),
+			  VARSIZE_ANY_EXHDR(input_text),
+			  '\\');
+
+	result = cstring_to_text_with_len(str.data, str.len);
+	pfree(str.data);
+
+	PG_RETURN_TEXT_P(result);
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 61f2c2f5b4..42792fca3c 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -10936,4 +10936,11 @@
   proname => 'is_normalized', prorettype => 'bool', proargtypes => 'text text',
   prosrc => 'unicode_is_normalized' },
 
+{ oid => '1136', descr => 'unescape Unicode chars in strings',
+  proname => 'unicode_unescape', prorettype => 'text', proargtypes => 'text text',
+  proisstrict => 'f', prosrc => 'unicode_uescape_with_escape_char' },
+
+{ oid => '1137', descr => 'unescape Unicode chars in strings',
+  proname => 'unicode_unescape', prorettype => 'text', proargtypes => 'text',
+  proisstrict => 't', prosrc => 'unicode_uescape' }
 ]
diff --git a/src/include/parser/scansup.h b/src/include/parser/scansup.h
index 7a6ee529ae..e1dc7b8a2a 100644
--- a/src/include/parser/scansup.h
+++ b/src/include/parser/scansup.h
@@ -15,6 +15,8 @@
 #ifndef SCANSUP_H
 #define SCANSUP_H
 
+#include "mb/pg_wchar.h"
+
 extern char *scanstr(const char *s);
 
 extern char *downcase_truncate_identifier(const char *ident, int len,
@@ -27,4 +29,10 @@ extern void truncate_identifier(char *ident, int len, bool warn);
 
 extern bool scanner_isspace(char ch);
 
+extern unsigned int hexval(unsigned char c);
+
+extern void check_unicode_value(pg_wchar c);
+
+extern bool check_uescapechar(unsigned char escape);
+
 #endif							/* SCANSUP_H */
diff --git a/src/test/regress/expected/unicode.out b/src/test/regress/expected/unicode.out
index 2a1e903696..deb67b566b 100644
--- a/src/test/regress/expected/unicode.out
+++ b/src/test/regress/expected/unicode.out
@@ -79,3 +79,15 @@ ORDER BY num;
 
 SELECT is_normalized('abc', 'def');  -- run-time error
 ERROR:  invalid normalization form: def
+SELECT unicode_unescape('\0441\043B\043E\043D');
+ unicode_unescape 
+------------------
+ слон
+(1 row)
+
+SELECT unicode_unescape('d!0061t!+000061', '!');
+ unicode_unescape 
+------------------
+ data
+(1 row)
+
diff --git a/src/test/regress/sql/unicode.sql b/src/test/regress/sql/unicode.sql
index ccfc6fa77a..fd99031a1a 100644
--- a/src/test/regress/sql/unicode.sql
+++ b/src/test/regress/sql/unicode.sql
@@ -30,3 +30,6 @@ FROM
 ORDER BY num;
 
 SELECT is_normalized('abc', 'def');  -- run-time error
+
+SELECT unicode_unescape('\0441\043B\043E\043D');
+SELECT unicode_unescape('d!0061t!+000061', '!');