Fix parsing of identifiers in jsonpath

Nikita Glukhov Wed, 18 Sep 2019 08:14:04 -0700

Hi!

Unfortunately, jsonpath lexer, in contrast to jsonpath parser, was written by
Teodor and me without a proper attention to the stanard.  JSON path lexics is
is borrowed from the external ECMAScript [1], and we did not study it carefully.


There were numerous deviations from the ECMAScript standard in our jsonpath
implementation that were mostly fixed in the attached patch:

1. Identifiers (unquoted JSON key names) should start from the one of (see [2]):
   - Unicode symbol having Unicode property "ID_Start" (see [3])
   - Unicode escape sequence '\uXXXX' or '\u{X...}'
   - '$'
   - '_'

   And they should continue with the one of:
   - Unicode symbol having Unicode property "ID_Continue" (see [3])
   - Unicode escape sequence
   - '$'
   - ZWNJ
   - ZWJ

2. '$' is also allowed inside the identifiers, so it is possible to write
   something like '$.a$$b'.

3. Variable references '$var' are regular identifiers simply starting from the
   '$' sign, and there is no syntax like '$"var"', because quotes are not
   allowed in identifiers.

4. Even if the Unicode escape sequence '\uXXXX' is used, it cannot produce
   special symbols or whitespace, because the identifiers are displayed without
   quoting (i.e. '$\u{20}' is not possible to display as '$" "' or even more as
   string '"$ "').

5. All codepoints in '\u{XXXXXX}' greater than 0x10FFFF should be forbidden.

6. 6 single-character escape sequences (\b \t \r \f \n \v) should only be
   supported inside quoted strings.


I don't know if it is possible to check Unicode properties "ID_Start" and
"ID_Continue" in Postgres, and what ZWNJ/ZWJ is.  Now, identifier's starting
character set is simply determined by the exclusion of all recognized special
characters.


The patch is not so simple, but I believe that it's not too late to fix v12.


[1] 
https://www.ecma-international.org/ecma-262/10.0/index.html#sec-ecmascript-language-lexical-grammar
[2] 
https://www.ecma-international.org/ecma-262/10.0/index.html#sec-names-and-keywords
[3] https://unicode.org/reports/tr31/

--
Nikita Glukhov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

>From 2387de10cf241f86f6987ba310f59594cae6b64f Mon Sep 17 00:00:00 2001
From: Nikita Glukhov <n.glu...@postgrespro.ru>
Date: Fri, 22 Mar 2019 15:15:38 +0300
Subject: [PATCH] Fix parsing of identifiers in jsonpath

---
 src/backend/utils/adt/jsonpath.c       |  11 ++-
 src/backend/utils/adt/jsonpath_gram.y  |   6 +-
 src/backend/utils/adt/jsonpath_scan.l  | 125 +++++++++++++++--------------
 src/test/regress/expected/jsonpath.out | 138 +++++++++++++++++++++++++++++----
 src/test/regress/sql/jsonpath.sql      |  21 +++++
 5 files changed, 216 insertions(+), 85 deletions(-)

diff --git a/src/backend/utils/adt/jsonpath.c b/src/backend/utils/adt/jsonpath.c
index 7f32248..f43aeef 100644
--- a/src/backend/utils/adt/jsonpath.c
+++ b/src/backend/utils/adt/jsonpath.c
@@ -494,9 +494,14 @@ printJsonPathItem(StringInfo buf, JsonPathItem *v, bool inKey,
 			escape_json(buf, jspGetString(v, NULL));
 			break;
 		case jpiVariable:
-			appendStringInfoChar(buf, '$');
-			escape_json(buf, jspGetString(v, NULL));
-			break;
+			{
+				int32		len;
+				char	   *name = jspGetString(v, &len);
+
+				appendStringInfoChar(buf, '$');
+				appendBinaryStringInfo(buf, name, len);
+				break;
+			}
 		case jpiNumeric:
 			appendStringInfoString(buf,
 								   DatumGetCString(DirectFunctionCall1(numeric_out,
diff --git a/src/backend/utils/adt/jsonpath_gram.y b/src/backend/utils/adt/jsonpath_gram.y
index 1725502..196a191 100644
--- a/src/backend/utils/adt/jsonpath_gram.y
+++ b/src/backend/utils/adt/jsonpath_gram.y
@@ -334,8 +334,10 @@ makeItemVariable(JsonPathString *s)
 	JsonPathParseItem  *v;
 
 	v = makeItemType(jpiVariable);
-	v->value.string.val = s->val;
-	v->value.string.len = s->len;
+
+	/* skip leading '$' */
+	v->value.string.val = &s->val[1];
+	v->value.string.len = s->len - 1;
 
 	return v;
 }
diff --git a/src/backend/utils/adt/jsonpath_scan.l b/src/backend/utils/adt/jsonpath_scan.l
index 2165ffc..2c79bf0 100644
--- a/src/backend/utils/adt/jsonpath_scan.l
+++ b/src/backend/utils/adt/jsonpath_scan.l
@@ -20,6 +20,8 @@
 #include "mb/pg_wchar.h"
 #include "nodes/pg_list.h"
 
+#define JSONPATH_SPECIAL_CHARS "?%.[]{}()|&!=<>@#,*:-+/~`;\\\"' \b\f\n\r\t\v"
+
 static JsonPathString scanstring;
 
 /* Handles to the buffer that the lexer uses internally */
@@ -63,7 +65,7 @@ fprintf_to_ereport(const char *fmt, const char *msg)
  * quoted variable names and C-tyle comments.
  * Exclusive states:
  *  <xq> - quoted strings
- *  <xnq> - non-quoted strings
+ *  <xnq> - non-quoted identifiers
  *  <xvq> - quoted variable names
  *  <xsq> - single-quoted strings
  *  <xc> - C-style comment
@@ -75,9 +77,12 @@ fprintf_to_ereport(const char *fmt, const char *msg)
 %x xsq
 %x xc
 
-special		 [\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/]
-any			[^\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\\\"\' \t\n\r\f]
-blank		[ \t\n\r\f]
+special		 [\?\%\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\~\`\;]
+id_start	[^\?\%\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\~\`\;\\\"\' \b\f\n\r\t\v(0-9)]
+blank		[ \b\f\n\r\t\v]
+
+id_cont		({id_start}|[0-9])
+id			{id_start}{id_cont}*
 
 digit		[0-9]
 integer		(0|[1-9]{digit}*)
@@ -95,68 +100,60 @@ hex_fail	\\x{hex_dig}{0,1}
 
 %%
 
-<xnq>{any}+						{
-									addstring(false, yytext, yyleng);
-								}
+<xq,xvq,xsq>\\[\"\'\\]			{ addchar(false, yytext[1]); }
 
-<xnq>{blank}+					{
-									yylval->str = scanstring;
-									BEGIN INITIAL;
-									return checkKeyword();
-								}
+<xq,xvq,xsq>\\b					{ addchar(false, '\b'); }
 
+<xq,xvq,xsq>\\f					{ addchar(false, '\f'); }
 
-<xnq>\/\*						{
-									yylval->str = scanstring;
-									BEGIN xc;
-								}
+<xq,xvq,xsq>\\n					{ addchar(false, '\n'); }
 
-<xnq>({special}|\"|\')			{
-									yylval->str = scanstring;
-									yyless(0);
-									BEGIN INITIAL;
-									return checkKeyword();
-								}
+<xq,xvq,xsq>\\r					{ addchar(false, '\r'); }
 
-<xnq><<EOF>>					{
-									yylval->str = scanstring;
-									BEGIN INITIAL;
-									return checkKeyword();
-								}
+<xq,xvq,xsq>\\t					{ addchar(false, '\t'); }
 
-<xnq,xq,xvq,xsq>\\[\"\'\\]		{ addchar(false, yytext[1]); }
+<xq,xvq,xsq>\\v					{ addchar(false, '\v'); }
 
-<xnq,xq,xvq,xsq>\\b				{ addchar(false, '\b'); }
+<xq,xnq,xvq,xsq>{unicode}+		{ parseUnicode(yytext, yyleng); }
 
-<xnq,xq,xvq,xsq>\\f				{ addchar(false, '\f'); }
+<xq,xvq,xsq>{hex_char}			{ parseHexChar(yytext); }
 
-<xnq,xq,xvq,xsq>\\n				{ addchar(false, '\n'); }
+<xq,xvq,xsq>{hex_fail}			{ yyerror(NULL, "invalid hex character sequence"); }
 
-<xnq,xq,xvq,xsq>\\r				{ addchar(false, '\r'); }
+<INITIAL,xq,xnq,xvq,xsq>{unicode}*{unicodefail}	{
+									yyerror(NULL, "invalid unicode sequence");
+								}
 
-<xnq,xq,xvq,xsq>\\t				{ addchar(false, '\t'); }
+<INITIAL,xq,xnq,xvq,xsq>{unicode}+\\ {
+									/* throw back the \\, and treat as unicode */
+									yyless(yyleng - 1);
+									parseUnicode(yytext, yyleng);
+								}
 
-<xnq,xq,xvq,xsq>\\v				{ addchar(false, '\v'); }
+<xq,xnq,xvq,xsq>\\.				{ yyerror(NULL, "escape sequence is invalid"); }
 
-<xnq,xq,xvq,xsq>{unicode}+		{ parseUnicode(yytext, yyleng); }
+<xq,xnq,xvq,xsq>\\				{ yyerror(NULL, "unexpected end after backslash"); }
 
-<xnq,xq,xvq,xsq>{hex_char}		{ parseHexChar(yytext); }
+<xq,xvq,xsq><<EOF>>				{ yyerror(NULL, "unexpected end of quoted string"); }
 
-<xnq,xq,xvq,xsq>{unicode}*{unicodefail}	{ yyerror(NULL, "invalid unicode sequence"); }
 
-<xnq,xq,xvq,xsq>{hex_fail}		{ yyerror(NULL, "invalid hex character sequence"); }
+<xnq>{id_cont}+					{ addstring(false, yytext, yyleng); }
 
-<xnq,xq,xvq,xsq>{unicode}+\\	{
-									/* throw back the \\, and treat as unicode */
+<xnq>({special}|{blank}|[\'\"])	{
+									/* throw back the special symbol and return id */
 									yyless(yyleng - 1);
-									parseUnicode(yytext, yyleng);
+									addchar(false, '\0');
+									yylval->str = scanstring;
+									BEGIN INITIAL;
+									return checkKeyword();
 								}
 
-<xnq,xq,xvq,xsq>\\.				{ yyerror(NULL, "escape sequence is invalid"); }
-
-<xnq,xq,xvq,xsq>\\				{ yyerror(NULL, "unexpected end after backslash"); }
-
-<xq,xvq,xsq><<EOF>>				{ yyerror(NULL, "unexpected end of quoted string"); }
+<xnq><<EOF>>					{
+									addchar(false, '\0');
+									yylval->str = scanstring;
+									BEGIN INITIAL;
+									return checkKeyword();
+								}
 
 <xq>\"							{
 									yylval->str = scanstring;
@@ -210,18 +207,6 @@ hex_fail	\\x{hex_dig}{0,1}
 
 \>								{ return GREATER_P; }
 
-\${any}+						{
-									addstring(true, yytext + 1, yyleng - 1);
-									addchar(false, '\0');
-									yylval->str = scanstring;
-									return VARIABLE_P;
-								}
-
-\$\"							{
-									addchar(true, '\0');
-									BEGIN xvq;
-								}
-
 {special}						{ return *yytext; }
 
 {blank}+						{ /* ignore */ }
@@ -263,11 +248,18 @@ hex_fail	\\x{hex_dig}{0,1}
 
 ({realfail1}|{realfail2})		{ yyerror(NULL, "invalid floating point number"); }
 
-{any}+							{
+
+{id_start}{id_cont}*			{
 									addstring(true, yytext, yyleng);
 									BEGIN xnq;
 								}
 
+{unicode}+						{
+									addstring(true, "", 0);
+									parseUnicode(yytext, yyleng);
+									BEGIN xnq;
+								}
+
 \"								{
 									addchar(true, '\0');
 									BEGIN xq;
@@ -278,11 +270,7 @@ hex_fail	\\x{hex_dig}{0,1}
 									BEGIN xsq;
 								}
 
-\\								{
-									yyless(0);
-									addchar(true, '\0');
-									BEGIN xnq;
-								}
+\\								{ yyerror(NULL, "invalid escape sequence"); }
 
 <<EOF>>							{ yyterminate(); }
 
@@ -354,6 +342,12 @@ checkKeyword()
 						   *StopHigh = keywords + lengthof(keywords),
 						   *StopMiddle;
 
+	if (strcspn(scanstring.val, JSONPATH_SPECIAL_CHARS) < scanstring.len)
+		jsonpath_yyerror(NULL, "invalid characters in identifier");
+
+	if (scanstring.val[0] == '$')
+		return scanstring.len == 1 ? '$' : VARIABLE_P;
+
 	if (scanstring.len > keywords[lengthof(keywords) - 1].len)
 		return res;
 
@@ -604,6 +598,9 @@ parseUnicode(char *s, int l)
 			while (s[++i] != '}' && i < l)
 				ch = (ch << 4) | hexval(s[i]);
 			i++;	/* skip '}' */
+
+			if (ch > 0x10FFFF)
+				jsonpath_yyerror(NULL, "invalid Unicode escape value");
 		}
 		else		/* parse '\uXXXX' */
 		{
diff --git a/src/test/regress/expected/jsonpath.out b/src/test/regress/expected/jsonpath.out
index ea42ae3..14221d4 100644
--- a/src/test/regress/expected/jsonpath.out
+++ b/src/test/regress/expected/jsonpath.out
@@ -190,11 +190,75 @@ select '''\x50\u0067\u{53}\u{051}\u{00004C}'''::jsonpath;
 (1 row)
 
 select '$.foo\x50\u0067\u{53}\u{051}\u{00004C}\t\"bar'::jsonpath;
+ERROR:  escape sequence is invalid at or near "\x" of jsonpath input
+LINE 1: select '$.foo\x50\u0067\u{53}\u{051}\u{00004C}\t\"bar'::json...
+               ^
+select '$."foo\x50\u0067\u{53}\u{051}\u{00004C}\t\"bar"'::jsonpath;
       jsonpath       
 ---------------------
  $."fooPgSQL\t\"bar"
 (1 row)
 
+select '$.\u12345.a\u1234.b\u{12}34'::jsonpath;
+        jsonpath         
+-------------------------
+ $."á´5"."aá´"."b\u001234"
+(1 row)
+
+select '$.\u123'::jsonpath;
+ERROR:  invalid unicode sequence at or near "\u123" of jsonpath input
+LINE 1: select '$.\u123'::jsonpath;
+               ^
+select '$.\u{}'::jsonpath;
+ERROR:  invalid unicode sequence at or near "\u{" of jsonpath input
+LINE 1: select '$.\u{}'::jsonpath;
+               ^
+select '$.\u{1}'::jsonpath;
+  jsonpath  
+------------
+ $."\u0001"
+(1 row)
+
+select '$.\u{20}'::jsonpath;
+ERROR:  invalid characters in identifier at end of jsonpath input
+LINE 1: select '$.\u{20}'::jsonpath;
+               ^
+select '$."\u{20}"'::jsonpath;
+ jsonpath 
+----------
+ $." "
+(1 row)
+
+select '$.\u{21}'::jsonpath;
+ERROR:  invalid characters in identifier at end of jsonpath input
+LINE 1: select '$.\u{21}'::jsonpath;
+               ^
+select '$."\u{21}"'::jsonpath;
+ jsonpath 
+----------
+ $."!"
+(1 row)
+
+select '$.\u{12345}'::jsonpath;
+ jsonpath 
+----------
+ $."ð"
+(1 row)
+
+select '$.\u{10FFFF}'::jsonpath;
+ jsonpath 
+----------
+ $.""
+(1 row)
+
+select '$.\u{110000}'::jsonpath;
+ERROR:  invalid Unicode escape value at or near "\u{110000}" of jsonpath input
+LINE 1: select '$.\u{110000}'::jsonpath;
+               ^
+select '$.\x123'::jsonpath;
+ERROR:  invalid escape sequence at or near "\" of jsonpath input
+LINE 1: select '$.\x123'::jsonpath;
+               ^
 select '$.g ? ($.a == 1)'::jsonpath;
       jsonpath      
 --------------------
@@ -282,25 +346,59 @@ select '$.g ? (+@.x >= +-(+@.a + 2))'::jsonpath;
 select '$a'::jsonpath;
  jsonpath 
 ----------
- $"a"
+ $a
+(1 row)
+
+select '$_'::jsonpath;
+ jsonpath 
+----------
+ $_
+(1 row)
+
+select '$123'::jsonpath;
+ jsonpath 
+----------
+ $123
+(1 row)
+
+select '$$$'::jsonpath;
+ jsonpath 
+----------
+ $$$
+(1 row)
+
+select '$_$$1_3a'::jsonpath;
+ jsonpath 
+----------
+ $_$$1_3a
+(1 row)
+
+select '$\u12345'::jsonpath;
+ jsonpath 
+----------
+ $á´5
 (1 row)
 
+select '$\u{20}'::jsonpath;
+ERROR:  invalid characters in identifier at end of jsonpath input
+LINE 1: select '$\u{20}'::jsonpath;
+               ^
 select '$a.b'::jsonpath;
  jsonpath 
 ----------
- $"a"."b"
+ $a."b"
 (1 row)
 
 select '$a[*]'::jsonpath;
  jsonpath 
 ----------
- $"a"[*]
+ $a[*]
 (1 row)
 
 select '$.g ? (@.zip == $zip)'::jsonpath;
-         jsonpath          
----------------------------
- $."g"?(@."zip" == $"zip")
+        jsonpath         
+-------------------------
+ $."g"?(@."zip" == $zip)
 (1 row)
 
 select '$.a[1,2, 3 to 16]'::jsonpath;
@@ -310,9 +408,9 @@ select '$.a[1,2, 3 to 16]'::jsonpath;
 (1 row)
 
 select '$.a[$a + 1, ($b[*]) to -($[0] * 2)]'::jsonpath;
-                jsonpath                
-----------------------------------------
- $."a"[$"a" + 1,$"b"[*] to -($[0] * 2)]
+              jsonpath              
+------------------------------------
+ $."a"[$a + 1,$b[*] to -($[0] * 2)]
 (1 row)
 
 select '$.a[$.a.size() - 3]'::jsonpath;
@@ -408,9 +506,9 @@ select '$ ? (@ starts with "abc")'::jsonpath;
 (1 row)
 
 select '$ ? (@ starts with $var)'::jsonpath;
-         jsonpath         
---------------------------
- $?(@ starts with $"var")
+        jsonpath        
+------------------------
+ $?(@ starts with $var)
 (1 row)
 
 select '$ ? (@ like_regex "(invalid pattern")'::jsonpath;
@@ -481,9 +579,9 @@ select '$ < 1'::jsonpath;
 (1 row)
 
 select '($ < 1) || $.a.b <= $x'::jsonpath;
-           jsonpath           
-------------------------------
- ($ < 1 || $."a"."b" <= $"x")
+          jsonpath          
+----------------------------
+ ($ < 1 || $."a"."b" <= $x)
 (1 row)
 
 select '@ + 1'::jsonpath;
@@ -815,9 +913,17 @@ select '0'::jsonpath;
 (1 row)
 
 select '00'::jsonpath;
-ERROR:  syntax error, unexpected IDENT_P at end of jsonpath input
+ERROR:  syntax error, unexpected INT_P, expecting $end at or near "0" of jsonpath input
 LINE 1: select '00'::jsonpath;
                ^
+select '$.00'::jsonpath;
+ERROR:  syntax error, unexpected INT_P at or near "0" of jsonpath input
+LINE 1: select '$.00'::jsonpath;
+               ^
+select '$.0a'::jsonpath;
+ERROR:  syntax error, unexpected INT_P at or near "0" of jsonpath input
+LINE 1: select '$.0a'::jsonpath;
+               ^
 select '0.0'::jsonpath;
  jsonpath 
 ----------
diff --git a/src/test/regress/sql/jsonpath.sql b/src/test/regress/sql/jsonpath.sql
index 29ea77a..9e44f1d 100644
--- a/src/test/regress/sql/jsonpath.sql
+++ b/src/test/regress/sql/jsonpath.sql
@@ -34,6 +34,19 @@ select '''\b\f\r\n\t\v\"\''\\'''::jsonpath;
 select '"\x50\u0067\u{53}\u{051}\u{00004C}"'::jsonpath;
 select '''\x50\u0067\u{53}\u{051}\u{00004C}'''::jsonpath;
 select '$.foo\x50\u0067\u{53}\u{051}\u{00004C}\t\"bar'::jsonpath;
+select '$."foo\x50\u0067\u{53}\u{051}\u{00004C}\t\"bar"'::jsonpath;
+select '$.\u12345.a\u1234.b\u{12}34'::jsonpath;
+select '$.\u123'::jsonpath;
+select '$.\u{}'::jsonpath;
+select '$.\u{1}'::jsonpath;
+select '$.\u{20}'::jsonpath;
+select '$."\u{20}"'::jsonpath;
+select '$.\u{21}'::jsonpath;
+select '$."\u{21}"'::jsonpath;
+select '$.\u{12345}'::jsonpath;
+select '$.\u{10FFFF}'::jsonpath;
+select '$.\u{110000}'::jsonpath;
+select '$.\x123'::jsonpath;
 
 select '$.g ? ($.a == 1)'::jsonpath;
 select '$.g ? (@ == 1)'::jsonpath;
@@ -51,6 +64,12 @@ select '$.g ? ((@.x >= 123 || @.a == 4) && exists (@.x ? (@ == 14)))'::jsonpath;
 select '$.g ? (+@.x >= +-(+@.a + 2))'::jsonpath;
 
 select '$a'::jsonpath;
+select '$_'::jsonpath;
+select '$123'::jsonpath;
+select '$$$'::jsonpath;
+select '$_$$1_3a'::jsonpath;
+select '$\u12345'::jsonpath;
+select '$\u{20}'::jsonpath;
 select '$a.b'::jsonpath;
 select '$a[*]'::jsonpath;
 select '$.g ? (@.zip == $zip)'::jsonpath;
@@ -153,6 +172,8 @@ select '$ ? (@.a < +10.1e+1)'::jsonpath;
 
 select '0'::jsonpath;
 select '00'::jsonpath;
+select '$.00'::jsonpath;
+select '$.0a'::jsonpath;
 select '0.0'::jsonpath;
 select '0.000'::jsonpath;
 select '0.000e1'::jsonpath;
-- 
2.7.4

Fix parsing of identifiers in jsonpath

Reply via email to