Hi 2016-03-14 17:39 GMT+01:00 Teodor Sigaev <teo...@sigaev.ru>:
> I afraid so I cannot to fix this inconsistency (if this is inconsistency - >> the >> binary values are same) - the parameter of function is raw string with >> processed >> escape codes, and I have not any information about original escape >> sequences. >> When you enter octet value, and I show it as hex value, then there should >> be >> difference. Buy I have not information about your input (octet or hex). I >> have >> the original string of SQL identifier inside parser, executor, but I have >> not >> original string of function parameter inside function (not without pretty >> complex and long code). >> > Ok, agree > > >> I am trying describe it in doc (I am sorry for my less level English) in >> new >> patch. Fixed duplicated oid too. >> > Edited a bit + fix some typos and remove unneeded headers, patch attached > > Sorry, I can't find all corner-cases at once, but: > SELECT parse_ident(E'"c".X XXXXXXXXXX'); > ERROR: identifier contains disallowed characters: "\"c" > > Error message wrongly points to the reason of error. > > I forgot my original plan - show full original string. Now, complete original parameter is used as part of message everywhere. It is more consistent. I cannot to reuse escape_json - it escape double quotes, and then the paremeter in message looks strange. I hope so the messages are ok now. Few more regress tests added. Thank you for your patience. Regards Pavel > > > > > -- > Teodor Sigaev E-mail: teo...@sigaev.ru > WWW: > http://www.sigaev.ru/ >
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml new file mode 100644 index 000489d..918356c *** a/doc/src/sgml/func.sgml --- b/doc/src/sgml/func.sgml *************** *** 1821,1826 **** --- 1821,1852 ---- <row> <entry> <indexterm> + <primary>parse_ident</primary> + </indexterm> + <literal><function>parse_ident(<parameter>str</parameter> <type>text</type>, + [ <parameter>strictmode</parameter> <type>boolean</type> DEFAULT true ] )</function></literal> + </entry> + <entry><type>text[]</type></entry> + <entry>Split <parameter>qualified identifier</parameter> into array + <parameter>parts</parameter>. When <parameter>strictmode</parameter> is + false, extra characters after the identifier are ignored. This is useful + for parsing identifiers for objects like functions and arrays that may + have trailing characters. By default, extra characters after the last + identifier are considered an error, but if second parameter is false, + then chararacters after last identifier are ignored. Note that this + function does not truncate quoted identifiers. If you care about that + you should cast the result of this function to name[]. A non-printable + chararacters (like 0 to 31) are displayed as hexadecimal codes always, + what can be different from PostgreSQL internal SQL identifiers + processing, when the original escaped value is displayed. + </entry> + <entry><literal>parse_ident('"SomeSchema".someTable')</literal></entry> + <entry><literal>"SomeSchema,sometable"</literal></entry> + </row> + + <row> + <entry> + <indexterm> <primary>pg_client_encoding</primary> </indexterm> <literal><function>pg_client_encoding()</function></literal> diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql new file mode 100644 index fef67bd..9ae1ef4 *** a/src/backend/catalog/system_views.sql --- b/src/backend/catalog/system_views.sql *************** RETURNS jsonb *** 990,992 **** --- 990,999 ---- LANGUAGE INTERNAL STRICT IMMUTABLE AS 'jsonb_set'; + + CREATE OR REPLACE FUNCTION + parse_ident(str text, strict boolean DEFAULT true) + RETURNS text[] + LANGUAGE INTERNAL + STRICT IMMUTABLE + AS 'parse_ident'; diff --git a/src/backend/parser/scansup.c b/src/backend/parser/scansup.c new file mode 100644 index 2b4ab20..7aa5b76 *** a/src/backend/parser/scansup.c --- b/src/backend/parser/scansup.c *************** scanstr(const char *s) *** 130,135 **** --- 130,144 ---- char * downcase_truncate_identifier(const char *ident, int len, bool warn) { + return downcase_identifier(ident, len, warn, true); + } + + /* + * a workhorse for downcase_truncate_identifier + */ + char * + downcase_identifier(const char *ident, int len, bool warn, bool truncate) + { char *result; int i; bool enc_is_single_byte; *************** downcase_truncate_identifier(const char *** 158,169 **** } result[i] = '\0'; ! if (i >= NAMEDATALEN) truncate_identifier(result, i, warn); return result; } /* * truncate_identifier() --- truncate an identifier to NAMEDATALEN-1 bytes. * --- 167,179 ---- } result[i] = '\0'; ! if (i >= NAMEDATALEN && truncate) truncate_identifier(result, i, warn); return result; } + /* * truncate_identifier() --- truncate an identifier to NAMEDATALEN-1 bytes. * diff --git a/src/backend/utils/adt/misc.c b/src/backend/utils/adt/misc.c new file mode 100644 index 43f36db..8917b1e *** a/src/backend/utils/adt/misc.c --- b/src/backend/utils/adt/misc.c *************** *** 27,32 **** --- 27,33 ---- #include "commands/dbcommands.h" #include "funcapi.h" #include "miscadmin.h" + #include "parser/scansup.h" #include "parser/keywords.h" #include "postmaster/syslogger.h" #include "rewrite/rewriteHandler.h" *************** *** 39,44 **** --- 40,46 ---- #include "tcop/tcopprot.h" #include "utils/acl.h" #include "utils/builtins.h" + #include "utils/json.h" #include "utils/timestamp.h" #define atooid(x) ((Oid) strtoul((x), NULL, 10)) *************** pg_column_is_updatable(PG_FUNCTION_ARGS) *** 719,721 **** --- 721,946 ---- PG_RETURN_BOOL((events & REQ_EVENTS) == REQ_EVENTS); } + + + /* + * This simple parser utility are compatible with lexer implementation, + * used only in parse_ident function + */ + static bool + is_ident_start(unsigned char c) + { + if (c == '_') + return true; + if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) + return true; + + if (c >= 0200 && c <= 0377) + return true; + + return false; + } + + static bool + is_ident_cont(unsigned char c) + { + if (c >= '0' && c <= '9') + return true; + + return is_ident_start(c); + } + + /* + * Sanitize SQL string for using in error message. + */ + static char * + sanitize_text(text *t) + { + int len = VARSIZE_ANY_EXHDR(t); + const char *p = VARDATA_ANY(t); + StringInfo dstr; + + dstr = makeStringInfo(); + + appendStringInfoChar(dstr, '"'); + + while (len--) + { + switch (*p) + { + case '\b': + appendStringInfoString(dstr, "\\b"); + break; + case '\f': + appendStringInfoString(dstr, "\\f"); + break; + case '\n': + appendStringInfoString(dstr, "\\n"); + break; + case '\r': + appendStringInfoString(dstr, "\\r"); + break; + case '\t': + appendStringInfoString(dstr, "\\t"); + break; + case '\'': + appendStringInfoString(dstr, "''"); + break; + case '\\': + appendStringInfoString(dstr, "\\\\"); + break; + default: + if ((unsigned char) *p < ' ') + appendStringInfo(dstr, "\\u%04x", (int) *p); + else + appendStringInfoCharMacro(dstr, *p); + break; + } + p++; + } + + appendStringInfoChar(dstr, '"'); + + return dstr->data; + } + + /* + * parse_ident - parse SQL composed identifier to separate identifiers. + * When strict mode is active (second parameter), then any chars after + * last identifiers are disallowed. + */ + Datum + parse_ident(PG_FUNCTION_ARGS) + { + text *qualname; + char *qualname_str; + bool strict; + ArrayBuildState *astate = NULL; + char *nextp; + bool after_dot = false; + + qualname = PG_GETARG_TEXT_PP(0); + qualname_str = text_to_cstring(qualname); + strict = PG_GETARG_BOOL(1); + + nextp = qualname_str; + + /* skip leading whitespace */ + while (isspace((unsigned char) *nextp)) + nextp++; + + for (;;) + { + char *curname; + char *endp; + bool missing_ident; + + missing_ident = true; + + if (*nextp == '\"') + { + curname = nextp + 1; + for (;;) + { + endp = strchr(nextp + 1, '\"'); + if (endp == NULL) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unclosed double quotes"), + errdetail("string %s is not valid identifier", + sanitize_text(qualname)))); + if (endp[1] != '\"') + break; + memmove(endp, endp + 1, strlen(endp)); + nextp = endp; + } + nextp = endp + 1; + *endp = '\0'; + + /* Show complete input string in this case. */ + if (endp - curname == 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("identifier should not be empty: %s", + sanitize_text(qualname)))); + + astate = accumArrayResult(astate, CStringGetTextDatum(curname), + false, TEXTOID, CurrentMemoryContext); + missing_ident = false; + } + else + { + if (is_ident_start((unsigned char) *nextp)) + { + char *downname; + int len; + text *part; + + curname = nextp++; + while (is_ident_cont((unsigned char) *nextp)) + nextp++; + + len = nextp - curname; + + /* + * Unlike name, we don't implicitly truncate identifiers. This + * is useful for allowing the user to check for specific parts + * of the identifier being too long. It's easy enough for the + * user to get the truncated names by casting our output to + * name[]. + */ + downname = downcase_identifier(curname, len, false, false); + part = cstring_to_text_with_len(downname, len); + astate = accumArrayResult(astate, PointerGetDatum(part), false, + TEXTOID, CurrentMemoryContext); + missing_ident = false; + } + } + + if (missing_ident) + { + /* Different error messages based on where we failed. */ + if (*nextp == '.') + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("missing valid identifier before \".\" symbol: %s", + sanitize_text(qualname)))); + else if (after_dot) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("missing valid identifier after \".\" symbol: %s", + sanitize_text(qualname)))); + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("missing valid identifier: %s", + sanitize_text(qualname)))); + } + + while (isspace((unsigned char) *nextp)) + nextp++; + + if (*nextp == '.') + { + after_dot = true; + nextp++; + while (isspace((unsigned char) *nextp)) + nextp++; + } + else if (*nextp == '\0') + { + break; + } + else + { + if (strict) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("identifier contains disallowed characters: %s", + sanitize_text(qualname)))); + break; + } + } + + PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext)); + } diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h new file mode 100644 index ceb8129..a595327 *** a/src/include/catalog/pg_proc.h --- b/src/include/catalog/pg_proc.h *************** DESCR("I/O"); *** 3463,3468 **** --- 3463,3471 ---- DATA(insert OID = 4086 ( to_regnamespace PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 4089 "25" _null_ _null_ _null_ _null_ _null_ to_regnamespace _null_ _null_ _null_ )); DESCR("convert namespace name to regnamespace"); + DATA(insert OID = 1268 ( parse_ident PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 1009 "25 16" _null_ _null_ "{str,strict}" _null_ _null_ parse_ident _null_ _null_ _null_ )); + DESCR("parse qualified identifier to array of identifiers"); + DATA(insert OID = 2246 ( fmgr_internal_validator PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 2278 "26" _null_ _null_ _null_ _null_ _null_ fmgr_internal_validator _null_ _null_ _null_ )); DESCR("(internal)"); DATA(insert OID = 2247 ( fmgr_c_validator PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 2278 "26" _null_ _null_ _null_ _null_ _null_ fmgr_c_validator _null_ _null_ _null_ )); diff --git a/src/include/parser/scansup.h b/src/include/parser/scansup.h new file mode 100644 index 4f4164b..4f95c81 *** a/src/include/parser/scansup.h --- b/src/include/parser/scansup.h *************** extern char *scanstr(const char *s); *** 20,25 **** --- 20,28 ---- extern char *downcase_truncate_identifier(const char *ident, int len, bool warn); + extern char *downcase_identifier(const char *ident, int len, + bool warn, bool truncate); + extern void truncate_identifier(char *ident, int len, bool warn); extern bool scanner_isspace(char ch); diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h new file mode 100644 index 59a00bb..206288d *** a/src/include/utils/builtins.h --- b/src/include/utils/builtins.h *************** extern Datum pg_typeof(PG_FUNCTION_ARGS) *** 510,515 **** --- 510,516 ---- extern Datum pg_collation_for(PG_FUNCTION_ARGS); extern Datum pg_relation_is_updatable(PG_FUNCTION_ARGS); extern Datum pg_column_is_updatable(PG_FUNCTION_ARGS); + extern Datum parse_ident(PG_FUNCTION_ARGS); /* oid.c */ extern Datum oidin(PG_FUNCTION_ARGS); diff --git a/src/test/regress/expected/name.out b/src/test/regress/expected/name.out new file mode 100644 index b359d52..56139d4 *** a/src/test/regress/expected/name.out --- b/src/test/regress/expected/name.out *************** SELECT '' AS two, c.f1 FROM NAME_TBL c W *** 124,126 **** --- 124,192 ---- (2 rows) DROP TABLE NAME_TBL; + DO $$ + DECLARE r text[]; + BEGIN + r := parse_ident('Schemax.Tabley'); + RAISE NOTICE '%', format('%I.%I', r[1], r[2]); + r := parse_ident('"SchemaX"."TableY"'); + RAISE NOTICE '%', format('%I.%I', r[1], r[2]); + END; + $$; + NOTICE: schemax.tabley + NOTICE: "SchemaX"."TableY" + SELECT parse_ident('foo.boo'); + parse_ident + ------------- + {foo,boo} + (1 row) + + SELECT parse_ident('foo.boo[]'); -- should fail + ERROR: identifier contains disallowed characters: "foo.boo[]" + SELECT parse_ident('foo.boo[]', strict => false); -- ok + parse_ident + ------------- + {foo,boo} + (1 row) + + -- should fail + SELECT parse_ident(' '); + ERROR: missing valid identifier: " " + SELECT parse_ident(' .aaa'); + ERROR: missing valid identifier before "." symbol: " .aaa" + SELECT parse_ident(' aaa . '); + ERROR: missing valid identifier after "." symbol: " aaa . " + SELECT parse_ident('aaa.a%b'); + ERROR: identifier contains disallowed characters: "aaa.a%b" + SELECT parse_ident(E'X\rXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'); + ERROR: identifier contains disallowed characters: "X\rXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" + SELECT length(a[1]), length(a[2]) from parse_ident('"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx".yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy') as a ; + length | length + --------+-------- + 414 | 289 + (1 row) + + SELECT parse_ident(' first . " second " ." third ". " ' || repeat('x',66) || '"'); + parse_ident + ----------------------------------------------------------------------------------------------------------- + {first," second "," third "," xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"} + (1 row) + + SELECT parse_ident(' first . " second " ." third ". " ' || repeat('x',66) || '"')::name[]; + parse_ident + ------------------------------------------------------------------------------------------------------ + {first," second "," third "," xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"} + (1 row) + + SELECT parse_ident(E'"c".X XXXX\002XXXXXX'); + ERROR: identifier contains disallowed characters: ""c".X XXXX\u0002XXXXXX" + SELECT parse_ident('1020'); + ERROR: missing valid identifier: "1020" + SELECT parse_ident('10.20'); + ERROR: missing valid identifier: "10.20" + SELECT parse_ident('.'); + ERROR: missing valid identifier before "." symbol: "." + SELECT parse_ident('.1020'); + ERROR: missing valid identifier before "." symbol: ".1020" + SELECT parse_ident('xxx.1020'); + ERROR: missing valid identifier after "." symbol: "xxx.1020" diff --git a/src/test/regress/sql/name.sql b/src/test/regress/sql/name.sql new file mode 100644 index 1c7a671..602bf26 *** a/src/test/regress/sql/name.sql --- b/src/test/regress/sql/name.sql *************** SELECT '' AS three, c.f1 FROM NAME_TBL c *** 52,54 **** --- 52,87 ---- SELECT '' AS two, c.f1 FROM NAME_TBL c WHERE c.f1 ~ '.*asdf.*'; DROP TABLE NAME_TBL; + + DO $$ + DECLARE r text[]; + BEGIN + r := parse_ident('Schemax.Tabley'); + RAISE NOTICE '%', format('%I.%I', r[1], r[2]); + r := parse_ident('"SchemaX"."TableY"'); + RAISE NOTICE '%', format('%I.%I', r[1], r[2]); + END; + $$; + + SELECT parse_ident('foo.boo'); + SELECT parse_ident('foo.boo[]'); -- should fail + SELECT parse_ident('foo.boo[]', strict => false); -- ok + + -- should fail + SELECT parse_ident(' '); + SELECT parse_ident(' .aaa'); + SELECT parse_ident(' aaa . '); + SELECT parse_ident('aaa.a%b'); + SELECT parse_ident(E'X\rXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'); + + SELECT length(a[1]), length(a[2]) from parse_ident('"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx".yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy') as a ; + + SELECT parse_ident(' first . " second " ." third ". " ' || repeat('x',66) || '"'); + SELECT parse_ident(' first . " second " ." third ". " ' || repeat('x',66) || '"')::name[]; + + SELECT parse_ident(E'"c".X XXXX\002XXXXXX'); + SELECT parse_ident('1020'); + SELECT parse_ident('10.20'); + SELECT parse_ident('.'); + SELECT parse_ident('.1020'); + SELECT parse_ident('xxx.1020');
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers