On Thu, Sep 5, 2024 at 6:07 PM Karina Litskevich <litskevichkar...@gmail.com> wrote: > In v3 of the patch I grouped all the *_junk rules together and included > the suggested comment with a little added something.
Oops, I forgot to attach the patch, here it is. Best regards, Karina Litskevich Postgres Professional: http://postgrespro.com/
From b993d45d34c0bff676bf439e75dfd8f58244f8ed Mon Sep 17 00:00:00 2001 From: Karina Litskevich <litskevichkar...@gmail.com> Date: Wed, 28 Aug 2024 11:52:25 +0300 Subject: [PATCH v3] Improve error message for rejecting trailing junk after numeric literals Rejecting trailing junk after numeric literals was introduced in commit 2549f066 to prevent scanning a number immediately followed by an identifier without whitespace as number and identifier. Unfortunately, all the tokens made to catch such numeric literals followed by non-digits match a numeric literal and the next byte. The lexemes found by these tokens are broken in case the next symbol after a numeric literal is presented by several bytes as only the first byte of the symbol gets to the lexeme. When this lexeme is then printed as a part of an error message that message became broken too along with the whole log file where it goes. This commit fixes the problem by using tokens that match a numeric literal immediately followed by an identifier, not only one byte. This also improves error messages in cases with English letters. For 123abc the error message now will say that the error appeared at or near "123abc" instead of "123a". Since anything that matches hexinteger, octinteger or bininteger followed by identifier also matches decinteger followed by identifier, use one common rule for that. --- src/backend/parser/scan.l | 43 ++++++++++++------------ src/fe_utils/psqlscan.l | 40 ++++++++++++---------- src/interfaces/ecpg/preproc/pgc.l | 40 ++++++++++++---------- src/test/regress/expected/numerology.out | 14 +++++--- src/test/regress/sql/numerology.sql | 1 + 5 files changed, 76 insertions(+), 62 deletions(-) diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index f74059e7b0..1a7c9b9d8f 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -412,16 +412,29 @@ numericfail {decinteger}\.\. real ({decinteger}|{numeric})[Ee][-+]?{decinteger} realfail ({decinteger}|{numeric})[Ee][-+] -decinteger_junk {decinteger}{ident_start} -hexinteger_junk {hexinteger}{ident_start} -octinteger_junk {octinteger}{ident_start} -bininteger_junk {bininteger}{ident_start} -numeric_junk {numeric}{ident_start} -real_junk {real}{ident_start} - /* Positional parameters don't accept underscores. */ param \${decdigit}+ -param_junk \${decdigit}+{ident_start} + +/* + * An identifier immediately following a numeric literal is disallowed because + * in some cases it's ambiguous what is meant: for example, 0x1234 could be + * either a hexinteger or a decinteger "0" and an identifier "x1234". We can + * detect such problems by seeing if integer_junk matches a longer substring + * than any of the XXXinteger patterns (decinteger, hexinteger, octinteger, + * and bininteger). One "junk" pattern is sufficient because + * {decinteger}{identifier} will match all the same strings we'd match with + * {hexinteger}{identifier} etc. + * + * Note that the rule for integer_junk must appear after the ones for + * XXXinteger to make this work correctly. + * + * Also disallow strings matched by numeric_junk, real_junk and param_junk for + * consistency. + */ +integer_junk {decinteger}{identifier} +numeric_junk {numeric}{identifier} +real_junk {real}{identifier} +param_junk \${decdigit}+{identifier} other . @@ -1055,19 +1068,7 @@ other . SET_YYLLOC(); yyerror("trailing junk after numeric literal"); } -{decinteger_junk} { - SET_YYLLOC(); - yyerror("trailing junk after numeric literal"); - } -{hexinteger_junk} { - SET_YYLLOC(); - yyerror("trailing junk after numeric literal"); - } -{octinteger_junk} { - SET_YYLLOC(); - yyerror("trailing junk after numeric literal"); - } -{bininteger_junk} { +{integer_junk} { SET_YYLLOC(); yyerror("trailing junk after numeric literal"); } diff --git a/src/fe_utils/psqlscan.l b/src/fe_utils/psqlscan.l index ddc4658b92..f1c24d505b 100644 --- a/src/fe_utils/psqlscan.l +++ b/src/fe_utils/psqlscan.l @@ -348,16 +348,29 @@ numericfail {decinteger}\.\. real ({decinteger}|{numeric})[Ee][-+]?{decinteger} realfail ({decinteger}|{numeric})[Ee][-+] -decinteger_junk {decinteger}{ident_start} -hexinteger_junk {hexinteger}{ident_start} -octinteger_junk {octinteger}{ident_start} -bininteger_junk {bininteger}{ident_start} -numeric_junk {numeric}{ident_start} -real_junk {real}{ident_start} - /* Positional parameters don't accept underscores. */ param \${decdigit}+ -param_junk \${decdigit}+{ident_start} + +/* + * An identifier immediately following a numeric literal is disallowed because + * in some cases it's ambiguous what is meant: for example, 0x1234 could be + * either a hexinteger or a decinteger "0" and an identifier "x1234". We can + * detect such problems by seeing if integer_junk matches a longer substring + * than any of the XXXinteger patterns (decinteger, hexinteger, octinteger, + * and bininteger). One "junk" pattern is sufficient because + * {decinteger}{identifier} will match all the same strings we'd match with + * {hexinteger}{identifier} etc. + * + * Note that the rule for integer_junk must appear after the ones for + * XXXinteger to make this work correctly. + * + * Also disallow strings matched by numeric_junk, real_junk and param_junk for + * consistency. + */ +integer_junk {decinteger}{identifier} +numeric_junk {numeric}{identifier} +real_junk {real}{identifier} +param_junk \${decdigit}+{identifier} /* psql-specific: characters allowed in variable names */ variable_char [A-Za-z\200-\377_0-9] @@ -898,16 +911,7 @@ other . {realfail} { ECHO; } -{decinteger_junk} { - ECHO; - } -{hexinteger_junk} { - ECHO; - } -{octinteger_junk} { - ECHO; - } -{bininteger_junk} { +{integer_junk} { ECHO; } {numeric_junk} { diff --git a/src/interfaces/ecpg/preproc/pgc.l b/src/interfaces/ecpg/preproc/pgc.l index f363a34659..54e7571ef2 100644 --- a/src/interfaces/ecpg/preproc/pgc.l +++ b/src/interfaces/ecpg/preproc/pgc.l @@ -381,16 +381,29 @@ numericfail {decinteger}\.\. real ({decinteger}|{numeric})[Ee][-+]?{decinteger} realfail ({decinteger}|{numeric})[Ee][-+] -decinteger_junk {decinteger}{ident_start} -hexinteger_junk {hexinteger}{ident_start} -octinteger_junk {octinteger}{ident_start} -bininteger_junk {bininteger}{ident_start} -numeric_junk {numeric}{ident_start} -real_junk {real}{ident_start} - /* Positional parameters don't accept underscores. */ param \${decdigit}+ -param_junk \${decdigit}+{ident_start} + +/* + * An identifier immediately following a numeric literal is disallowed because + * in some cases it's ambiguous what is meant: for example, 0x1234 could be + * either a hexinteger or a decinteger "0" and an identifier "x1234". We can + * detect such problems by seeing if integer_junk matches a longer substring + * than any of the XXXinteger patterns (decinteger, hexinteger, octinteger, + * and bininteger). One "junk" pattern is sufficient because + * {decinteger}{identifier} will match all the same strings we'd match with + * {hexinteger}{identifier} etc. + * + * Note that the rule for integer_junk must appear after the ones for + * XXXinteger to make this work correctly. + * + * Also disallow strings matched by numeric_junk, real_junk and param_junk for + * consistency. + */ +integer_junk {decinteger}{identifier} +numeric_junk {numeric}{identifier} +real_junk {real}{identifier} +param_junk \${decdigit}+{identifier} /* special characters for other dbms */ /* we have to react differently in compat mode */ @@ -1023,16 +1036,7 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ * Note that some trailing junk is valid in C (such as 100LL), so we * contain this to SQL mode. */ -{decinteger_junk} { - mmfatal(PARSE_ERROR, "trailing junk after numeric literal"); - } -{hexinteger_junk} { - mmfatal(PARSE_ERROR, "trailing junk after numeric literal"); - } -{octinteger_junk} { - mmfatal(PARSE_ERROR, "trailing junk after numeric literal"); - } -{bininteger_junk} { +{integer_junk} { mmfatal(PARSE_ERROR, "trailing junk after numeric literal"); } {numeric_junk} { diff --git a/src/test/regress/expected/numerology.out b/src/test/regress/expected/numerology.out index 717a237df9..f4bd6fbfee 100644 --- a/src/test/regress/expected/numerology.out +++ b/src/test/regress/expected/numerology.out @@ -171,9 +171,13 @@ SELECT -0x8000000000000001; -- error cases SELECT 123abc; -ERROR: trailing junk after numeric literal at or near "123a" +ERROR: trailing junk after numeric literal at or near "123abc" LINE 1: SELECT 123abc; ^ +SELECT 1ä; +ERROR: trailing junk after numeric literal at or near "1ä" +LINE 1: SELECT 1ä; + ^ SELECT 0x0o; ERROR: trailing junk after numeric literal at or near "0x0o" LINE 1: SELECT 0x0o; @@ -322,7 +326,7 @@ ERROR: trailing junk after numeric literal at or near "100_" LINE 1: SELECT 100_; ^ SELECT 100__000; -ERROR: trailing junk after numeric literal at or near "100_" +ERROR: trailing junk after numeric literal at or near "100__000" LINE 1: SELECT 100__000; ^ SELECT _1_000.5; @@ -334,7 +338,7 @@ ERROR: trailing junk after numeric literal at or near "1_000_" LINE 1: SELECT 1_000_.5; ^ SELECT 1_000._5; -ERROR: trailing junk after numeric literal at or near "1_000._" +ERROR: trailing junk after numeric literal at or near "1_000._5" LINE 1: SELECT 1_000._5; ^ SELECT 1_000.5_; @@ -342,11 +346,11 @@ ERROR: trailing junk after numeric literal at or near "1_000.5_" LINE 1: SELECT 1_000.5_; ^ SELECT 1_000.5e_1; -ERROR: trailing junk after numeric literal at or near "1_000.5e" +ERROR: trailing junk after numeric literal at or near "1_000.5e_1" LINE 1: SELECT 1_000.5e_1; ^ PREPARE p1 AS SELECT $0_1; -ERROR: trailing junk after parameter at or near "$0_" +ERROR: trailing junk after parameter at or near "$0_1" LINE 1: PREPARE p1 AS SELECT $0_1; ^ -- diff --git a/src/test/regress/sql/numerology.sql b/src/test/regress/sql/numerology.sql index 3ae491cc98..d1a42b046c 100644 --- a/src/test/regress/sql/numerology.sql +++ b/src/test/regress/sql/numerology.sql @@ -44,6 +44,7 @@ SELECT -0x8000000000000001; -- error cases SELECT 123abc; +SELECT 1ä; SELECT 0x0o; SELECT 0.a; SELECT 0.0a; -- 2.34.1