On Thu, Sep 5, 2024 at 6:07 PM Karina Litskevich
<litskevichkar...@gmail.com> wrote:
> In v3 of the patch I grouped all the *_junk rules together and included
> the suggested comment with a little added something.

Oops, I forgot to attach the patch, here it is.


Best regards,
Karina Litskevich
Postgres Professional: http://postgrespro.com/
From b993d45d34c0bff676bf439e75dfd8f58244f8ed Mon Sep 17 00:00:00 2001
From: Karina Litskevich <litskevichkar...@gmail.com>
Date: Wed, 28 Aug 2024 11:52:25 +0300
Subject: [PATCH v3] Improve error message for rejecting trailing junk after
 numeric literals

Rejecting trailing junk after numeric literals was introduced in
commit 2549f066 to prevent scanning a number immediately followed
by an identifier without whitespace as number and identifier.
Unfortunately, all the tokens made to catch such numeric literals
followed by non-digits match a numeric literal and only the next byte.
The lexemes found by these tokens are broken when the next character
after a numeric literal is represented by several bytes, as only the
first byte of the character gets into the lexeme. When this lexeme is
then printed as part of an error message, that message becomes broken
too, along with the whole log file it is written to.

This commit fixes the problem by using tokens that match a numeric
literal immediately followed by a whole identifier, not just one byte.
This also improves error messages in cases with English letters. For
123abc the error message will now say that the error appeared at or
near "123abc" instead of "123a".

Since anything that matches hexinteger, octinteger or bininteger
followed by an identifier also matches decinteger followed by an
identifier, use one common rule for that.
---
 src/backend/parser/scan.l                | 43 ++++++++++++------------
 src/fe_utils/psqlscan.l                  | 40 ++++++++++++----------
 src/interfaces/ecpg/preproc/pgc.l        | 40 ++++++++++++----------
 src/test/regress/expected/numerology.out | 14 +++++---
 src/test/regress/sql/numerology.sql      |  1 +
 5 files changed, 76 insertions(+), 62 deletions(-)

diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index f74059e7b0..1a7c9b9d8f 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -412,16 +412,29 @@ numericfail		{decinteger}\.\.
 real			({decinteger}|{numeric})[Ee][-+]?{decinteger}
 realfail		({decinteger}|{numeric})[Ee][-+]
 
-decinteger_junk	{decinteger}{ident_start}
-hexinteger_junk	{hexinteger}{ident_start}
-octinteger_junk	{octinteger}{ident_start}
-bininteger_junk	{bininteger}{ident_start}
-numeric_junk	{numeric}{ident_start}
-real_junk		{real}{ident_start}
-
 /* Positional parameters don't accept underscores. */
 param			\${decdigit}+
-param_junk		\${decdigit}+{ident_start}
+
+/*
+ * An identifier immediately following a numeric literal is disallowed because
+ * in some cases it's ambiguous what is meant: for example, 0x1234 could be
+ * either a hexinteger or a decinteger "0" and an identifier "x1234".  We can
+ * detect such problems by seeing if integer_junk matches a longer substring
+ * than any of the XXXinteger patterns (decinteger, hexinteger, octinteger,
+ * and bininteger). One "junk" pattern is sufficient because
+ * {decinteger}{identifier} will match all the same strings we'd match with
+ * {hexinteger}{identifier} etc.
+ *
+ * Note that the rule for integer_junk must appear after the ones for
+ * XXXinteger to make this work correctly.
+ *
+ * Also disallow strings matched by numeric_junk, real_junk and param_junk for
+ * consistency.
+ */
+integer_junk	{decinteger}{identifier}
+numeric_junk	{numeric}{identifier}
+real_junk		{real}{identifier}
+param_junk		\${decdigit}+{identifier}
 
 other			.
 
@@ -1055,19 +1068,7 @@ other			.
 					SET_YYLLOC();
 					yyerror("trailing junk after numeric literal");
 				}
-{decinteger_junk}	{
-					SET_YYLLOC();
-					yyerror("trailing junk after numeric literal");
-				}
-{hexinteger_junk}	{
-					SET_YYLLOC();
-					yyerror("trailing junk after numeric literal");
-				}
-{octinteger_junk}	{
-					SET_YYLLOC();
-					yyerror("trailing junk after numeric literal");
-				}
-{bininteger_junk}	{
+{integer_junk}	{
 					SET_YYLLOC();
 					yyerror("trailing junk after numeric literal");
 				}
diff --git a/src/fe_utils/psqlscan.l b/src/fe_utils/psqlscan.l
index ddc4658b92..f1c24d505b 100644
--- a/src/fe_utils/psqlscan.l
+++ b/src/fe_utils/psqlscan.l
@@ -348,16 +348,29 @@ numericfail		{decinteger}\.\.
 real			({decinteger}|{numeric})[Ee][-+]?{decinteger}
 realfail		({decinteger}|{numeric})[Ee][-+]
 
-decinteger_junk	{decinteger}{ident_start}
-hexinteger_junk	{hexinteger}{ident_start}
-octinteger_junk	{octinteger}{ident_start}
-bininteger_junk	{bininteger}{ident_start}
-numeric_junk	{numeric}{ident_start}
-real_junk		{real}{ident_start}
-
 /* Positional parameters don't accept underscores. */
 param			\${decdigit}+
-param_junk		\${decdigit}+{ident_start}
+
+/*
+ * An identifier immediately following a numeric literal is disallowed because
+ * in some cases it's ambiguous what is meant: for example, 0x1234 could be
+ * either a hexinteger or a decinteger "0" and an identifier "x1234".  We can
+ * detect such problems by seeing if integer_junk matches a longer substring
+ * than any of the XXXinteger patterns (decinteger, hexinteger, octinteger,
+ * and bininteger). One "junk" pattern is sufficient because
+ * {decinteger}{identifier} will match all the same strings we'd match with
+ * {hexinteger}{identifier} etc.
+ *
+ * Note that the rule for integer_junk must appear after the ones for
+ * XXXinteger to make this work correctly.
+ *
+ * Also disallow strings matched by numeric_junk, real_junk and param_junk for
+ * consistency.
+ */
+integer_junk	{decinteger}{identifier}
+numeric_junk	{numeric}{identifier}
+real_junk		{real}{identifier}
+param_junk		\${decdigit}+{identifier}
 
 /* psql-specific: characters allowed in variable names */
 variable_char	[A-Za-z\200-\377_0-9]
@@ -898,16 +911,7 @@ other			.
 {realfail}		{
 					ECHO;
 				}
-{decinteger_junk}	{
-					ECHO;
-				}
-{hexinteger_junk}	{
-					ECHO;
-				}
-{octinteger_junk}	{
-					ECHO;
-				}
-{bininteger_junk}	{
+{integer_junk}	{
 					ECHO;
 				}
 {numeric_junk}	{
diff --git a/src/interfaces/ecpg/preproc/pgc.l b/src/interfaces/ecpg/preproc/pgc.l
index f363a34659..54e7571ef2 100644
--- a/src/interfaces/ecpg/preproc/pgc.l
+++ b/src/interfaces/ecpg/preproc/pgc.l
@@ -381,16 +381,29 @@ numericfail		{decinteger}\.\.
 real			({decinteger}|{numeric})[Ee][-+]?{decinteger}
 realfail		({decinteger}|{numeric})[Ee][-+]
 
-decinteger_junk	{decinteger}{ident_start}
-hexinteger_junk	{hexinteger}{ident_start}
-octinteger_junk	{octinteger}{ident_start}
-bininteger_junk	{bininteger}{ident_start}
-numeric_junk	{numeric}{ident_start}
-real_junk		{real}{ident_start}
-
 /* Positional parameters don't accept underscores. */
 param			\${decdigit}+
-param_junk		\${decdigit}+{ident_start}
+
+/*
+ * An identifier immediately following a numeric literal is disallowed because
+ * in some cases it's ambiguous what is meant: for example, 0x1234 could be
+ * either a hexinteger or a decinteger "0" and an identifier "x1234".  We can
+ * detect such problems by seeing if integer_junk matches a longer substring
+ * than any of the XXXinteger patterns (decinteger, hexinteger, octinteger,
+ * and bininteger). One "junk" pattern is sufficient because
+ * {decinteger}{identifier} will match all the same strings we'd match with
+ * {hexinteger}{identifier} etc.
+ *
+ * Note that the rule for integer_junk must appear after the ones for
+ * XXXinteger to make this work correctly.
+ *
+ * Also disallow strings matched by numeric_junk, real_junk and param_junk for
+ * consistency.
+ */
+integer_junk	{decinteger}{identifier}
+numeric_junk	{numeric}{identifier}
+real_junk		{real}{identifier}
+param_junk		\${decdigit}+{identifier}
 
 /* special characters for other dbms */
 /* we have to react differently in compat mode */
@@ -1023,16 +1036,7 @@ cppline			{space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
 	 * Note that some trailing junk is valid in C (such as 100LL), so we
 	 * contain this to SQL mode.
 	 */
-{decinteger_junk} {
-					mmfatal(PARSE_ERROR, "trailing junk after numeric literal");
-				}
-{hexinteger_junk} {
-					mmfatal(PARSE_ERROR, "trailing junk after numeric literal");
-				}
-{octinteger_junk} {
-					mmfatal(PARSE_ERROR, "trailing junk after numeric literal");
-				}
-{bininteger_junk} {
+{integer_junk}	{
 					mmfatal(PARSE_ERROR, "trailing junk after numeric literal");
 				}
 {numeric_junk}	{
diff --git a/src/test/regress/expected/numerology.out b/src/test/regress/expected/numerology.out
index 717a237df9..f4bd6fbfee 100644
--- a/src/test/regress/expected/numerology.out
+++ b/src/test/regress/expected/numerology.out
@@ -171,9 +171,13 @@ SELECT -0x8000000000000001;
 
 -- error cases
 SELECT 123abc;
-ERROR:  trailing junk after numeric literal at or near "123a"
+ERROR:  trailing junk after numeric literal at or near "123abc"
 LINE 1: SELECT 123abc;
                ^
+SELECT 1ä;
+ERROR:  trailing junk after numeric literal at or near "1ä"
+LINE 1: SELECT 1ä;
+               ^
 SELECT 0x0o;
 ERROR:  trailing junk after numeric literal at or near "0x0o"
 LINE 1: SELECT 0x0o;
@@ -322,7 +326,7 @@ ERROR:  trailing junk after numeric literal at or near "100_"
 LINE 1: SELECT 100_;
                ^
 SELECT 100__000;
-ERROR:  trailing junk after numeric literal at or near "100_"
+ERROR:  trailing junk after numeric literal at or near "100__000"
 LINE 1: SELECT 100__000;
                ^
 SELECT _1_000.5;
@@ -334,7 +338,7 @@ ERROR:  trailing junk after numeric literal at or near "1_000_"
 LINE 1: SELECT 1_000_.5;
                ^
 SELECT 1_000._5;
-ERROR:  trailing junk after numeric literal at or near "1_000._"
+ERROR:  trailing junk after numeric literal at or near "1_000._5"
 LINE 1: SELECT 1_000._5;
                ^
 SELECT 1_000.5_;
@@ -342,11 +346,11 @@ ERROR:  trailing junk after numeric literal at or near "1_000.5_"
 LINE 1: SELECT 1_000.5_;
                ^
 SELECT 1_000.5e_1;
-ERROR:  trailing junk after numeric literal at or near "1_000.5e"
+ERROR:  trailing junk after numeric literal at or near "1_000.5e_1"
 LINE 1: SELECT 1_000.5e_1;
                ^
 PREPARE p1 AS SELECT $0_1;
-ERROR:  trailing junk after parameter at or near "$0_"
+ERROR:  trailing junk after parameter at or near "$0_1"
 LINE 1: PREPARE p1 AS SELECT $0_1;
                              ^
 --
diff --git a/src/test/regress/sql/numerology.sql b/src/test/regress/sql/numerology.sql
index 3ae491cc98..d1a42b046c 100644
--- a/src/test/regress/sql/numerology.sql
+++ b/src/test/regress/sql/numerology.sql
@@ -44,6 +44,7 @@ SELECT -0x8000000000000001;
 
 -- error cases
 SELECT 123abc;
+SELECT 1ä;
 SELECT 0x0o;
 SELECT 0.a;
 SELECT 0.0a;
-- 
2.34.1

Reply via email to