Hi
>
> Hi Pavel,
>
> Since the idea originated from unescaping Unicode string literals, i.e.
>     select unescape('Odpov\u011Bdn\u00E1 osoba');
>
> Shouldn't the built-in function support the above syntax as well?
>

Good idea. The prefixes u (4 digits) and U (8 digits) are now supported.

Regards

Pavel

> --
> Asif Rehman
> Highgo Software (Canada/China/Pakistan)
> URL : www.highgo.ca
>
>
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 959f6a1c2f..126d3483e6 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -3539,6 +3539,39 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
        </para></entry>
       </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>unicode_unescape</primary>
+        </indexterm>
+        <function>unicode_unescape</function> ( <parameter>string</parameter> <type>text</type>
+        <optional>, <parameter>escape_char</parameter> <type>text</type> </optional> )
+        <returnvalue>text</returnvalue>
+       </para>
+       <para>
+        Replaces Unicode escapes (<literal>\XXXX</literal>,
+        <literal>\+XXXXXX</literal>, <literal>\uXXXX</literal> or
+        <literal>\UXXXXXXXX</literal>) in <parameter>string</parameter> with
+        the characters they denote; the escape character defaults to backslash.
+       </para>
+       <para>
+        <literal>unicode_unescape('\0441\043B\043E\043D')</literal>
+        <returnvalue>слон</returnvalue>
+       </para>
+       <para>
+        <literal>unicode_unescape('d\0061t\+000061')</literal>
+        <returnvalue>data</returnvalue>
+       </para>
+       <para>
+        <literal>unicode_unescape('d!0061t!+000061', '!')</literal>
+        <returnvalue>data</returnvalue>
+       </para>
+       <para>
+        <literal>unicode_unescape('d\u0061t\U00000061')</literal>
+        <returnvalue>data</returnvalue>
+       </para></entry>
+      </row>
+
      </tbody>
     </tgroup>
    </table>
diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c
index be86eb37fe..c7f94298c1 100644
--- a/src/backend/parser/parser.c
+++ b/src/backend/parser/parser.c
@@ -26,7 +26,6 @@
 #include "parser/parser.h"
 #include "parser/scansup.h"
 
-static bool check_uescapechar(unsigned char escape);
 static char *str_udeescape(const char *str, char escape,
 						   int position, core_yyscan_t yyscanner);
 
@@ -278,44 +277,6 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
 	return cur_token;
 }
 
-/* convert hex digit (caller should have verified that) to value */
-static unsigned int
-hexval(unsigned char c)
-{
-	if (c >= '0' && c <= '9')
-		return c - '0';
-	if (c >= 'a' && c <= 'f')
-		return c - 'a' + 0xA;
-	if (c >= 'A' && c <= 'F')
-		return c - 'A' + 0xA;
-	elog(ERROR, "invalid hexadecimal digit");
-	return 0;					/* not reached */
-}
-
-/* is Unicode code point acceptable? */
-static void
-check_unicode_value(pg_wchar c)
-{
-	if (!is_valid_unicode_codepoint(c))
-		ereport(ERROR,
-				(errcode(ERRCODE_SYNTAX_ERROR),
-				 errmsg("invalid Unicode escape value")));
-}
-
-/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
-static bool
-check_uescapechar(unsigned char escape)
-{
-	if (isxdigit(escape)
-		|| escape == '+'
-		|| escape == '\''
-		|| escape == '"'
-		|| scanner_isspace(escape))
-		return false;
-	else
-		return true;
-}
-
 /*
  * Process Unicode escapes in "str", producing a palloc'd plain string
  *
diff --git a/src/backend/parser/scansup.c b/src/backend/parser/scansup.c
index cac70d5df7..9d3173bc6d 100644
--- a/src/backend/parser/scansup.c
+++ b/src/backend/parser/scansup.c
@@ -218,3 +218,41 @@ scanner_isspace(char ch)
 		return true;
 	return false;
 }
+
+/* convert hex digit (caller should have verified that) to value */
+unsigned int
+hexval(unsigned char c)
+{
+	if (c >= '0' && c <= '9')
+		return c - '0';
+	if (c >= 'a' && c <= 'f')
+		return c - 'a' + 0xA;
+	if (c >= 'A' && c <= 'F')
+		return c - 'A' + 0xA;
+	elog(ERROR, "invalid hexadecimal digit");
+	return 0;					/* not reached */
+}
+
+/* is Unicode code point acceptable? */
+void
+check_unicode_value(pg_wchar c)
+{
+	if (!is_valid_unicode_codepoint(c))
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("invalid Unicode escape value")));
+}
+
+/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
+bool
+check_uescapechar(unsigned char escape)
+{
+	if (isxdigit(escape)
+		|| escape == '+'
+		|| escape == '\''
+		|| escape == '"'
+		|| scanner_isspace(escape))
+		return false;
+	else
+		return true;
+}
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index df10bfb906..5ca9817708 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -6139,3 +6139,273 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 
 	PG_RETURN_BOOL(result);
 }
+
+/*
+ * Return true if the first four chars of "instr" are hexadecimal digits.
+ * The caller must make sure that at least four chars are available.
+ */
+static bool
+isxdigit_four(const char *instr)
+{
+	return isxdigit((unsigned char) instr[0]) &&
+		isxdigit((unsigned char) instr[1]) &&
+		isxdigit((unsigned char) instr[2]) &&
+		isxdigit((unsigned char) instr[3]);
+}
+
+/*
+ * Convert four hexadecimal digits (already verified by the caller) to their
+ * numeric value.  The result is unsigned so that callers can shift it left
+ * without signed overflow, even where long is only 32 bits wide.
+ */
+static pg_wchar
+hexval_four(const char *instr)
+{
+	return (hexval(instr[0]) << 12) +
+		(hexval(instr[1]) << 8) +
+		(hexval(instr[2]) << 4) +
+		hexval(instr[3]);
+}
+
+/*
+ * Process Unicode escapes in "instr", appending the result to "str"
+ *
+ * instr: the input, "len" bytes long (need not be null-terminated)
+ * escape: the escape character to use
+ */
+static void
+udeescape(StringInfo str, const char *instr, size_t len, char escape)
+{
+	pg_wchar	pair_first = 0;
+	char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
+
+	while (len > 0)
+	{
+		if (instr[0] == escape)
+		{
+			if (len >= 2 &&
+				instr[1] == escape)
+			{
+				if (pair_first)
+					goto invalid_pair;
+				appendStringInfoChar(str, escape);
+				instr += 2;
+				len -= 2;
+			}
+			else if ((len >= 5 && isxdigit_four(&instr[1])) ||
+					 (len >= 6 && instr[1] == 'u' && isxdigit_four(&instr[2])))
+			{
+				pg_wchar	unicode;
+				int			offset = instr[1] == 'u' ? 2 : 1;
+
+				unicode = hexval_four(instr + offset);
+
+				check_unicode_value(unicode);
+
+				if (pair_first)
+				{
+					if (is_utf16_surrogate_second(unicode))
+					{
+						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+						pair_first = 0;
+					}
+					else
+						goto invalid_pair;
+				}
+				else if (is_utf16_surrogate_second(unicode))
+					goto invalid_pair;
+
+				if (is_utf16_surrogate_first(unicode))
+					pair_first = unicode;
+				else
+				{
+					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+					appendStringInfoString(str, cbuf);
+				}
+
+				instr += 4 + offset;
+				len -= 4 + offset;
+			}
+			else if (len >= 8 &&
+					 instr[1] == '+' &&
+					 isxdigit_four(&instr[2]) &&
+					 isxdigit((unsigned char) instr[6]) &&
+					 isxdigit((unsigned char) instr[7]))
+			{
+				pg_wchar	unicode;
+
+				unicode = (hexval_four(&instr[2]) << 8) +
+					(hexval(instr[6]) << 4) +
+					hexval(instr[7]);
+
+				check_unicode_value(unicode);
+
+				if (pair_first)
+				{
+					if (is_utf16_surrogate_second(unicode))
+					{
+						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+						pair_first = 0;
+					}
+					else
+						goto invalid_pair;
+				}
+				else if (is_utf16_surrogate_second(unicode))
+					goto invalid_pair;
+
+				if (is_utf16_surrogate_first(unicode))
+					pair_first = unicode;
+				else
+				{
+					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+					appendStringInfoString(str, cbuf);
+				}
+
+				instr += 8;
+				len -= 8;
+			}
+			else if (len >= 10 &&
+					 instr[1] == 'U' &&
+					 isxdigit_four(&instr[2]) &&
+					 isxdigit_four(&instr[6]))
+			{
+				pg_wchar	unicode;
+
+				unicode = (hexval_four(&instr[2]) << 16) + hexval_four(&instr[6]);
+
+				check_unicode_value(unicode);
+
+				if (pair_first)
+				{
+					if (is_utf16_surrogate_second(unicode))
+					{
+						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+						pair_first = 0;
+					}
+					else
+						goto invalid_pair;
+				}
+				else if (is_utf16_surrogate_second(unicode))
+					goto invalid_pair;
+
+				if (is_utf16_surrogate_first(unicode))
+					pair_first = unicode;
+				else
+				{
+					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+					appendStringInfoString(str, cbuf);
+				}
+
+				instr += 10;
+				len -= 10;
+			}
+			else
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("invalid Unicode escape"),
+						 errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX or \\UXXXXXXXX.")));
+		}
+		else
+		{
+			if (pair_first)
+				goto invalid_pair;
+
+			appendStringInfoChar(str, *instr++);
+			len--;
+		}
+	}
+
+	/* unfinished surrogate pair? */
+	if (pair_first)
+		goto invalid_pair;
+
+	return;
+
+invalid_pair:
+	ereport(ERROR,
+			(errcode(ERRCODE_SYNTAX_ERROR),
+			 errmsg("invalid Unicode surrogate pair")));
+}
+
+/*
+ * unicode_unescape(text, text)
+ *
+ * Unescape Unicode escapes in the first argument; the second argument is the
+ * escape character to use.
+ */
+Datum
+unicode_uescape_with_escape_char(PG_FUNCTION_ARGS)
+{
+	StringInfoData str;
+	text	   *input_text;
+	text	   *escchr_text;
+	text	   *result;
+	const char *escchr_ptr;
+
+	/* when input string is NULL, then result is NULL too */
+	if (PG_ARGISNULL(0))
+		PG_RETURN_NULL();
+
+	input_text = PG_GETARG_TEXT_PP(0);
+
+	if (PG_ARGISNULL(1))
+		ereport(ERROR,
+				(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+				 errmsg("null value not allowed for escape char")));
+
+	escchr_text = PG_GETARG_TEXT_PP(1);
+	escchr_ptr = VARDATA_ANY(escchr_text);
+
+	/*
+	 * The escape must be exactly one byte.  Check the length first, so that
+	 * we never look at the data of an empty value and never silently ignore
+	 * the trailing bytes of a longer one.
+	 */
+	if (VARSIZE_ANY_EXHDR(escchr_text) != 1 || !check_uescapechar(*escchr_ptr))
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("invalid Unicode escape character")));
+
+	initStringInfo(&str);
+
+	udeescape(&str,
+			  VARDATA_ANY(input_text),
+			  VARSIZE_ANY_EXHDR(input_text),
+			  *escchr_ptr);
+
+	result = cstring_to_text_with_len(str.data, str.len);
+	pfree(str.data);
+
+	PG_RETURN_TEXT_P(result);
+}
+
+/*
+ * unicode_unescape(text)
+ *
+ * Unescape Unicode escapes in the argument, using backslash as the escape.
+ */
+Datum
+unicode_uescape(PG_FUNCTION_ARGS)
+{
+	StringInfoData str;
+	text	   *input_text;
+	text	   *result;
+
+	/* when input string is NULL, then result is NULL too */
+	if (PG_ARGISNULL(0))
+		PG_RETURN_NULL();
+
+	input_text = PG_GETARG_TEXT_PP(0);
+
+	initStringInfo(&str);
+
+	udeescape(&str,
+			  VARDATA_ANY(input_text),
+			  VARSIZE_ANY_EXHDR(input_text),
+			  '\\');
+
+	result = cstring_to_text_with_len(str.data, str.len);
+	pfree(str.data);
+
+	PG_RETURN_TEXT_P(result);
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 082a11f270..75359c113a 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -10954,4 +10954,11 @@
   proname => 'is_normalized', prorettype => 'bool', proargtypes => 'text text',
   prosrc => 'unicode_is_normalized' },
 
+{ oid => '1136', descr => 'unescape Unicode chars in strings',
+  proname => 'unicode_unescape', prorettype => 'text', proargtypes => 'text text',
+  proisstrict => 'f', prosrc => 'unicode_uescape_with_escape_char' },
+
+{ oid => '1137', descr => 'unescape Unicode chars in strings',
+  proname => 'unicode_unescape', prorettype => 'text', proargtypes => 'text',
+  proisstrict => 't', prosrc => 'unicode_uescape' }
 ]
diff --git a/src/include/parser/scansup.h b/src/include/parser/scansup.h
index 7a6ee529ae..ca2b6d249e 100644
--- a/src/include/parser/scansup.h
+++ b/src/include/parser/scansup.h
@@ -15,6 +15,8 @@
 #ifndef SCANSUP_H
 #define SCANSUP_H
 
+#include "mb/pg_wchar.h"
+
 extern char *scanstr(const char *s);
 
 extern char *downcase_truncate_identifier(const char *ident, int len,
@@ -27,4 +29,10 @@ extern void truncate_identifier(char *ident, int len, bool warn);
 
 extern bool scanner_isspace(char ch);
 
+extern unsigned int hexval(unsigned char c);
+
+extern void check_unicode_value(pg_wchar c);
+
+extern bool check_uescapechar(unsigned char escape);
+
 #endif							/* SCANSUP_H */
diff --git a/src/test/regress/expected/unicode.out b/src/test/regress/expected/unicode.out
index 2a1e903696..03d43358e5 100644
--- a/src/test/regress/expected/unicode.out
+++ b/src/test/regress/expected/unicode.out
@@ -79,3 +79,40 @@ ORDER BY num;
 
 SELECT is_normalized('abc', 'def');  -- run-time error
 ERROR:  invalid normalization form: def
+SELECT unicode_unescape('\0441\043B\043E\043D');
+ unicode_unescape 
+------------------
+ слон
+(1 row)
+
+SELECT unicode_unescape('d!0061t!+000061', '!');
+ unicode_unescape 
+------------------
+ data
+(1 row)
+
+SELECT unicode_unescape('d\u0061t\U00000061');
+ unicode_unescape 
+------------------
+ data
+(1 row)
+
+-- run-time error
+SELECT unicode_unescape('wrong: \db99');
+ERROR:  invalid Unicode surrogate pair
+SELECT unicode_unescape('wrong: \db99\0061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unicode_unescape('wrong: \+00db99\+000061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unicode_unescape('wrong: \+2FFFFF');
+ERROR:  invalid Unicode escape value
+SELECT unicode_unescape('wrong: \udb99\u0061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unicode_unescape('wrong: \U0000db99\U00000061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unicode_unescape('wrong: \U002FFFFF');
+ERROR:  invalid Unicode escape value
+SELECT unicode_unescape('d!0061t!+000061', '!!');
+ERROR:  invalid Unicode escape character
+SELECT unicode_unescape('d!0061t!+000061', '');
+ERROR:  invalid Unicode escape character
diff --git a/src/test/regress/sql/unicode.sql b/src/test/regress/sql/unicode.sql
index ccfc6fa77a..ba7d61e73a 100644
--- a/src/test/regress/sql/unicode.sql
+++ b/src/test/regress/sql/unicode.sql
@@ -30,3 +30,18 @@ FROM
 ORDER BY num;
 
 SELECT is_normalized('abc', 'def');  -- run-time error
+
+SELECT unicode_unescape('\0441\043B\043E\043D');
+SELECT unicode_unescape('d!0061t!+000061', '!');
+SELECT unicode_unescape('d\u0061t\U00000061');
+
+-- run-time error
+SELECT unicode_unescape('wrong: \db99');
+SELECT unicode_unescape('wrong: \db99\0061');
+SELECT unicode_unescape('wrong: \+00db99\+000061');
+SELECT unicode_unescape('wrong: \+2FFFFF');
+SELECT unicode_unescape('wrong: \udb99\u0061');
+SELECT unicode_unescape('wrong: \U0000db99\U00000061');
+SELECT unicode_unescape('wrong: \U002FFFFF');
+SELECT unicode_unescape('d!0061t!+000061', '!!');
+SELECT unicode_unescape('d!0061t!+000061', '');