Re: [HACKERS] proposal: function parse_ident

Pavel Stehule Wed, 02 Dec 2015 12:33:08 -0800

Hi

2015-11-17 1:49 GMT+01:00 Marko Tiikkaja <ma...@joh.to>:

> On 9/11/15 12:25 PM, Pavel Stehule wrote:
>
>> new update of parse_ident function patch
>>
>
> Nice!  I've certainly wanted something like this a number of times.
>
> Some comments about the v2 of the patch:
>
>    - The patch doesn't apply anymore, so it should be rebased.
>

done

>    - The docs don't even try and explain what the "strictmode" parameter
> does.

fixed

   - The comment before the parse_ident function is not up to date anymore,
> since "the rest" was removed from the interface.

fixed

   - I can't immediately grep for any uses of  do { .. } while (true) from
> our code base.  AFAICT the majority look like  for (;;);  I see no reason
> not to be consistent here.
>

fixed

   - What should happen if the input is a string like
> 'one.two.three.four.five.six'?  Do we want to accept input like that?
>

I don't see a reason why not. It is pretty simple to count the fields in the
result array and raise an error later. The application has better information
about the expected and valid number of fields. But any opinion on this question
is valid; I don't have a strong position here.

>    - I haven't reviewed the actual parsing code carefully, but didn't we
> already have a function which splits identifiers up?  I of course can't
> find one with my grepping right now, so I might be wrong.
>

There are: SplitIdentifierString and textToQualifiedNameList in varlena.c. My
first patch was based on these functions, but I cannot use them.

1. major reason: The built-in parser is based on searching for the dot "." and
doesn't look for any disallowed identifier chars. So it is not possible
to implement non-strict mode - find the last char of the last identifier and
ignore the rest.
2. minor reason: slightly more precise diagnostics - the built-in routines
return only true (valid) or false (invalid).

Regards

Pavel

>
>
> .m
>

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
new file mode 100644
index 60b9a09..7b65ef4
*** a/doc/src/sgml/func.sgml
--- b/doc/src/sgml/func.sgml
***************
*** 1707,1712 ****
--- 1707,1729 ----
        <row>
         <entry>
          <indexterm>
+          <primary>parse_ident</primary>
+         </indexterm>
+         <literal><function>parse_ident(<parameter>str</parameter> <type>text</type>,
+            [ <parameter>strictmode</parameter> <type>boolean</type> DEFAULT true ] )</function></literal>
+        </entry>
+        <entry><type>text[]</type></entry>
+        <entry>Split <parameter>qualified identifier</parameter> into an array of <parameter>parts</parameter>.
+        When the second parameter is true, no characters are allowed after the last identifier. When
+        the second parameter is false, characters after the last identifier are ignored.
+        </entry>
+        <entry><literal>parse_ident('"SomeSchema".someTable')</literal></entry>
+        <entry><literal>{SomeSchema,sometable}</literal></entry>
+       </row>
+ 
+       <row>
+        <entry>
+         <indexterm>
           <primary>pg_client_encoding</primary>
          </indexterm>
          <literal><function>pg_client_encoding()</function></literal>
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
new file mode 100644
index ccc030f..9a7e89d
*** a/src/backend/catalog/system_views.sql
--- b/src/backend/catalog/system_views.sql
*************** RETURNS jsonb
*** 940,942 ****
--- 940,949 ----
  LANGUAGE INTERNAL
  STRICT IMMUTABLE
  AS 'jsonb_set';
+ 
+ CREATE OR REPLACE FUNCTION
+   parse_ident(str text, strictmode boolean DEFAULT true)
+ RETURNS text[]
+ LANGUAGE INTERNAL
+ STRICT IMMUTABLE
+ AS 'parse_ident';
diff --git a/src/backend/utils/adt/misc.c b/src/backend/utils/adt/misc.c
new file mode 100644
index 3ef6e43..2540dac
*** a/src/backend/utils/adt/misc.c
--- b/src/backend/utils/adt/misc.c
***************
*** 21,32 ****
--- 21,35 ----
  #include <unistd.h>
  
  #include "access/sysattr.h"
+ #include "access/htup_details.h"
  #include "catalog/catalog.h"
+ #include "catalog/namespace.h"
  #include "catalog/pg_tablespace.h"
  #include "catalog/pg_type.h"
  #include "commands/dbcommands.h"
  #include "funcapi.h"
  #include "miscadmin.h"
+ #include "parser/scansup.h"
  #include "parser/keywords.h"
  #include "postmaster/syslogger.h"
  #include "rewrite/rewriteHandler.h"
***************
*** 38,43 ****
--- 41,47 ----
  #include "utils/ruleutils.h"
  #include "tcop/tcopprot.h"
  #include "utils/acl.h"
+ #include "utils/array.h"
  #include "utils/builtins.h"
  #include "utils/timestamp.h"
  
*************** pg_column_is_updatable(PG_FUNCTION_ARGS)
*** 598,600 ****
--- 602,755 ----
  
  	PG_RETURN_BOOL((events & REQ_EVENTS) == REQ_EVENTS);
  }
+ 
+ 
+ /*
+  * Character-class helpers used only by parse_ident; they are intended to
+  * match the SQL lexer's rules for identifier start/continuation characters.
+  */
+ static bool
+ is_ident_start(unsigned char c)
+ {
+ 	if (c == '_')
+ 		return true;
+ 	if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
+ 		return true;
+ 
+ 	if (c >= 0200 && c <= 0377)	/* any byte with the high bit set (0x80-0xFF) */
+ 		return true;
+ 
+ 	return false;
+ }
+ 
+ static bool
+ is_ident_cont(unsigned char c)
+ {
+ 	if (c >= '0' && c <= '9')	/* digits may continue, but not start, an identifier */
+ 		return true;
+ 
+ 	return is_ident_start(c);
+ }
+ 
+ /*
+  * parse_ident - split a qualified SQL identifier into a text[] of its parts.
+  * Unquoted parts are downcased; double-quoted parts keep their case.
+  * In strict mode (second argument), chars after the last identifier are an error.
+  */
+ Datum
+ parse_ident(PG_FUNCTION_ARGS)
+ {
+ 	text		*qualname;
+ 	char		*qualname_str;
+ 	bool		strict_mode;
+ 	ArrayBuildState *astate = NULL;
+ 	char	*nextp;
+ 
+ 	qualname = PG_GETARG_TEXT_PP(0);
+ 	qualname_str = text_to_cstring(qualname);	/* private copy; modified in place below */
+ 	strict_mode = PG_GETARG_BOOL(1);
+ 
+ 	nextp = qualname_str;
+ 
+ 	/* skip leading whitespace */
+ 	while (isspace((unsigned char) *nextp))
+ 		nextp++;
+ 
+ 	for (;;)
+ 	{
+ 		char		*curname;
+ 		char		*endp;
+ 		bool		missing_ident;
+ 
+ 		missing_ident = true;
+ 
+ 		if (*nextp == '\"')	/* double-quoted identifier */
+ 		{
+ 			curname = nextp + 1;
+ 			for (;;)
+ 			{
+ 				endp = strchr(nextp + 1, '\"');
+ 				if (endp == NULL)
+ 					ereport(ERROR,
+ 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 						 errmsg("unclosed double quotes"),
+ 						 errdetail("string \"%s\" is not valid identifier",
+ 							    text_to_cstring(qualname))));
+ 				if (endp[1] != '\"')
+ 					break;	/* found the closing quote */
+ 				memmove(endp, endp + 1, strlen(endp));	/* collapse a doubled "" into one ", shifting the tail (and NUL) left */
+ 				nextp = endp;	/* resume the search after the kept quote char */
+ 			}
+ 			nextp = endp + 1;
+ 			*endp = '\0';	/* terminate curname at the closing quote */
+ 
+ 			if (endp - curname == 0)
+ 				ereport(ERROR,
+ 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 					 errmsg("identifier should not be empty"),
+ 					 errdetail("string \"%s\" is not valid identifier",
+ 							    text_to_cstring(qualname))));
+ 
+ 			astate = accumArrayResult(astate,
+ 				CStringGetTextDatum(curname), false,
+ 						    TEXTOID, CurrentMemoryContext);
+ 			missing_ident = false;
+ 		}
+ 		else
+ 		{
+ 			if (is_ident_start((unsigned char) *nextp))	/* unquoted identifier */
+ 			{
+ 				char *downname;
+ 				int	len;
+ 				text	*part;
+ 
+ 				curname = nextp++;
+ 				while (is_ident_cont((unsigned char) *nextp))
+ 					nextp++;
+ 
+ 				len = nextp - curname;
+ 
+ 				downname = downcase_truncate_identifier(curname, len, false);
+ 				part = cstring_to_text_with_len(downname, len);	/* NOTE(review): len is the pre-downcase length; if the call above truncates the name, len may exceed strlen(downname) - confirm */
+ 				astate = accumArrayResult(astate,
+ 					PointerGetDatum(part), false,
+ 							    TEXTOID, CurrentMemoryContext);
+ 				missing_ident = false;
+ 			}
+ 		}
+ 
+ 		if (missing_ident)	/* NOTE(review): also reached on the first iteration (e.g. empty input), where no "." was seen - message may mislead */
+ 			ereport(ERROR,
+ 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 				 errmsg("missing identifier after \".\" symbol"),
+ 				 errdetail("string \"%s\" is not valid identifier",
+ 							    text_to_cstring(qualname))));
+ 
+ 		while (isspace((unsigned char) *nextp))
+ 			nextp++;
+ 
+ 		if (*nextp == '.')
+ 		{
+ 			nextp++;
+ 			while (isspace((unsigned char) *nextp))
+ 				nextp++;
+ 			continue;	/* parse the next part */
+ 		}
+ 		else if (*nextp == '\0')
+ 		{
+ 			break;
+ 		}
+ 		else
+ 		{
+ 			if (strict_mode)
+ 				ereport(ERROR,
+ 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 					 errmsg("identifier contains disallowed chars"),
+ 					 errdetail("string \"%s\" is not valid identifier",
+ 								    text_to_cstring(qualname))));
+ 			break;	/* non-strict mode: ignore whatever follows */
+ 		}
+ 	}
+ 
+ 	PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext));
+ }
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
new file mode 100644
index d8640db..0c27876
*** a/src/include/catalog/pg_proc.h
--- b/src/include/catalog/pg_proc.h
*************** DESCR("I/O");
*** 3520,3525 ****
--- 3520,3528 ----
  DATA(insert OID = 4086 (  to_regnamespace	PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 4089 "2275" _null_ _null_ _null_ _null_ _null_ to_regnamespace _null_ _null_ _null_ ));
  DESCR("convert namespace name to regnamespace");
  
+ DATA(insert OID = 3317 (  parse_ident		PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 1009 "25 16" _null_ _null_ "{str,strictmode}" _null_ _null_ parse_ident _null_ _null_ _null_ ));
+ DESCR("parse qualified identifier to array of identifiers");
+ 
  DATA(insert OID = 2246 ( fmgr_internal_validator PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 2278 "26" _null_ _null_ _null_ _null_ _null_ fmgr_internal_validator _null_ _null_ _null_ ));
  DESCR("(internal)");
  DATA(insert OID = 2247 ( fmgr_c_validator	PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 2278 "26" _null_ _null_ _null_ _null_ _null_ fmgr_c_validator _null_ _null_ _null_ ));
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
new file mode 100644
index e610bf3..773af5d
*** a/src/include/utils/builtins.h
--- b/src/include/utils/builtins.h
*************** extern Datum pg_typeof(PG_FUNCTION_ARGS)
*** 495,500 ****
--- 495,501 ----
  extern Datum pg_collation_for(PG_FUNCTION_ARGS);
  extern Datum pg_relation_is_updatable(PG_FUNCTION_ARGS);
  extern Datum pg_column_is_updatable(PG_FUNCTION_ARGS);
+ extern Datum parse_ident(PG_FUNCTION_ARGS);
  
  /* oid.c */
  extern Datum oidin(PG_FUNCTION_ARGS);
diff --git a/src/test/regress/expected/name.out b/src/test/regress/expected/name.out
new file mode 100644
index b359d52..ebd7f0a
*** a/src/test/regress/expected/name.out
--- b/src/test/regress/expected/name.out
*************** SELECT '' AS two, c.f1 FROM NAME_TBL c W
*** 124,126 ****
--- 124,152 ----
  (2 rows)
  
  DROP TABLE NAME_TBL;
+ DO $$
+ DECLARE r text[];
+ BEGIN
+   r := parse_ident('Schemax.Tabley');
+   RAISE NOTICE '%', format('%I.%I', r[1], r[2]);
+   r := parse_ident('"SchemaX"."TableY"');
+   RAISE NOTICE '%', format('%I.%I', r[1], r[2]);
+ END;
+ $$;
+ NOTICE:  schemax.tabley
+ NOTICE:  "SchemaX"."TableY"
+ SELECT parse_ident('foo.boo');
+  parse_ident 
+ -------------
+  {foo,boo}
+ (1 row)
+ 
+ SELECT parse_ident('foo.boo[]'); -- should fail
+ ERROR:  identifier contains disallowed chars
+ DETAIL:  string "foo.boo[]" is not valid identifier
+ SELECT parse_ident('foo.boo[]', strictmode => false); -- ok
+  parse_ident 
+ -------------
+  {foo,boo}
+ (1 row)
+ 
diff --git a/src/test/regress/sql/name.sql b/src/test/regress/sql/name.sql
new file mode 100644
index 1c7a671..629e23f
*** a/src/test/regress/sql/name.sql
--- b/src/test/regress/sql/name.sql
*************** SELECT '' AS three, c.f1 FROM NAME_TBL c
*** 52,54 ****
--- 52,69 ----
  SELECT '' AS two, c.f1 FROM NAME_TBL c WHERE c.f1 ~ '.*asdf.*';
  
  DROP TABLE NAME_TBL;
+ 
+ DO $$
+ DECLARE r text[];
+ BEGIN
+   r := parse_ident('Schemax.Tabley');
+   RAISE NOTICE '%', format('%I.%I', r[1], r[2]);
+   r := parse_ident('"SchemaX"."TableY"');
+   RAISE NOTICE '%', format('%I.%I', r[1], r[2]);
+ END;
+ $$;
+ 
+ SELECT parse_ident('foo.boo');
+ SELECT parse_ident('foo.boo[]'); -- should fail
+ SELECT parse_ident('foo.boo[]', strictmode => false); -- ok
+

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] proposal: function parse_ident

Reply via email to