Erik Wienhold <e...@ewie.name> writes:
> On 2024-04-07 06:33 +0200, Tom Lane wrote:
>> I suspect it'd be much more robust if we could remove the comment from
>> the expr->query string.  No idea how hard that is.

> I slept on it and I think this can be fixed by tracking the end of the
> last token before THEN and use that instead of yylloc in the call to
> plpgsql_append_source_text.  We already already track the token length
> in plpgsql_yyleng but don't make it available outside pl_scanner.c yet.
> Attached v2 tries to do that.  But it breaks other test cases, probably
> because the calculation of endlocation is off.  I'm missing something
> here.

I poked at this and found that the failures occur when the patched
code decides to trim an expression like "_r.v" to just "_r", naturally
breaking the semantics completely.  That happens because when
plpgsql_yylex recognizes a compound token, it doesn't bother to
adjust the token length to include the additional word(s).  I vaguely
remember having thought about that when writing the lookahead logic,
and deciding that it wasn't worth the trouble -- but now it is.
Up to now, the only thing we did with plpgsql_yyleng was to set the
cutoff point for text reported by plpgsql_yyerror.  Extending the
token length changes reports like this:

regression=# do $$ declare r record; r.x$$;
ERROR:  syntax error at or near "r"
LINE 1: do $$ declare r record; r.x$$;
                                ^

to this:

regression=# do $$ declare r record; r.x$$;
ERROR:  syntax error at or near "r.x"
LINE 1: do $$ declare r record; r.x$$;
                                ^

which seems like strictly an improvement to me (the syntax error is
premature EOF, which is after the "x"); but in any case it's minor
enough to not be worth worrying about.

Looking around, I noticed that we *have* had a similar case in the
past, which 4adead1d2 noticed and worked around by suppressing the
whitespace-trimming action in read_sql_construct.  We could probably
reach a near-one-line fix for the current problem by passing
trim=false in the CASE calls, but TBH that discovery just reinforces
my feeling that we need a cleaner fix.  The attached v3 reverts
the make-trim-optional hack that 4adead1d2 added, since we don't
need or want the manual trimming anymore.

With this in mind, I find the other manual whitespace trimming logic,
in make_execsql_stmt(), quite scary; but it looks somewhat nontrivial
to get rid of it.  (The problem is that parsing of an INTO clause
will leave us with a pushed-back token as next, and then we don't
know where the end of the token before that is.)  Since we don't
currently do anything as crazy as combining execsql statements,
I think the problem is only latent, but still...

Anyway, the attached works for me.

                        regards, tom lane

diff --git a/src/pl/plpgsql/src/expected/plpgsql_control.out b/src/pl/plpgsql/src/expected/plpgsql_control.out
index 328bd48586..ccd4f54704 100644
--- a/src/pl/plpgsql/src/expected/plpgsql_control.out
+++ b/src/pl/plpgsql/src/expected/plpgsql_control.out
@@ -681,3 +681,20 @@ select case_test(13);
  other
 (1 row)
 
+-- test line comment between WHEN and THEN
+create or replace function case_comment(int) returns text as $$
+begin
+  case $1
+    when 1 -- comment before THEN
+      then return 'one';
+    else
+      return 'other';
+  end case;
+end;
+$$ language plpgsql immutable;
+select case_comment(1);
+ case_comment 
+--------------
+ one
+(1 row)
+
diff --git a/src/pl/plpgsql/src/pl_gram.y b/src/pl/plpgsql/src/pl_gram.y
index bef33d58a2..a29d2dfacd 100644
--- a/src/pl/plpgsql/src/pl_gram.y
+++ b/src/pl/plpgsql/src/pl_gram.y
@@ -66,7 +66,6 @@ static	PLpgSQL_expr	*read_sql_construct(int until,
 											RawParseMode parsemode,
 											bool isexpression,
 											bool valid_sql,
-											bool trim,
 											int *startloc,
 											int *endtoken);
 static	PLpgSQL_expr	*read_sql_expression(int until,
@@ -895,7 +894,7 @@ stmt_perform	: K_PERFORM
 						 */
 						new->expr = read_sql_construct(';', 0, 0, ";",
 													   RAW_PARSE_DEFAULT,
-													   false, false, true,
+													   false, false,
 													   &startloc, NULL);
 						/* overwrite "perform" ... */
 						memcpy(new->expr->query, " SELECT", 7);
@@ -981,7 +980,7 @@ stmt_assign		: T_DATUM
 						plpgsql_push_back_token(T_DATUM);
 						new->expr = read_sql_construct(';', 0, 0, ";",
 													   pmode,
-													   false, true, true,
+													   false, true,
 													   NULL, NULL);
 
 						$$ = (PLpgSQL_stmt *) new;
@@ -1474,7 +1473,6 @@ for_control		: for_variable K_IN
 													   RAW_PARSE_DEFAULT,
 													   true,
 													   false,
-													   true,
 													   &expr1loc,
 													   &tok);
 
@@ -1879,7 +1877,7 @@ stmt_raise		: K_RAISE
 									expr = read_sql_construct(',', ';', K_USING,
 															  ", or ; or USING",
 															  RAW_PARSE_PLPGSQL_EXPR,
-															  true, true, true,
+															  true, true,
 															  NULL, &tok);
 									new->params = lappend(new->params, expr);
 								}
@@ -2016,7 +2014,7 @@ stmt_dynexecute : K_EXECUTE
 						expr = read_sql_construct(K_INTO, K_USING, ';',
 												  "INTO or USING or ;",
 												  RAW_PARSE_PLPGSQL_EXPR,
-												  true, true, true,
+												  true, true,
 												  NULL, &endtoken);
 
 						new = palloc(sizeof(PLpgSQL_stmt_dynexecute));
@@ -2055,7 +2053,7 @@ stmt_dynexecute : K_EXECUTE
 									expr = read_sql_construct(',', ';', K_INTO,
 															  ", or ; or INTO",
 															  RAW_PARSE_PLPGSQL_EXPR,
-															  true, true, true,
+															  true, true,
 															  NULL, &endtoken);
 									new->params = lappend(new->params, expr);
 								} while (endtoken == ',');
@@ -2640,7 +2638,7 @@ read_sql_expression(int until, const char *expected)
 {
 	return read_sql_construct(until, 0, 0, expected,
 							  RAW_PARSE_PLPGSQL_EXPR,
-							  true, true, true, NULL, NULL);
+							  true, true, NULL, NULL);
 }
 
 /* Convenience routine to read an expression with two possible terminators */
@@ -2650,7 +2648,7 @@ read_sql_expression2(int until, int until2, const char *expected,
 {
 	return read_sql_construct(until, until2, 0, expected,
 							  RAW_PARSE_PLPGSQL_EXPR,
-							  true, true, true, NULL, endtoken);
+							  true, true, NULL, endtoken);
 }
 
 /* Convenience routine to read a SQL statement that must end with ';' */
@@ -2659,7 +2657,7 @@ read_sql_stmt(void)
 {
 	return read_sql_construct(';', 0, 0, ";",
 							  RAW_PARSE_DEFAULT,
-							  false, true, true, NULL, NULL);
+							  false, true, NULL, NULL);
 }
 
 /*
@@ -2672,7 +2670,6 @@ read_sql_stmt(void)
  * parsemode:	raw_parser() mode to use
  * isexpression: whether to say we're reading an "expression" or a "statement"
  * valid_sql:   whether to check the syntax of the expr
- * trim:		trim trailing whitespace
  * startloc:	if not NULL, location of first token is stored at *startloc
  * endtoken:	if not NULL, ending token is stored at *endtoken
  *				(this is only interesting if until2 or until3 isn't zero)
@@ -2685,7 +2682,6 @@ read_sql_construct(int until,
 				   RawParseMode parsemode,
 				   bool isexpression,
 				   bool valid_sql,
-				   bool trim,
 				   int *startloc,
 				   int *endtoken)
 {
@@ -2693,6 +2689,7 @@ read_sql_construct(int until,
 	StringInfoData ds;
 	IdentifierLookup save_IdentifierLookup;
 	int			startlocation = -1;
+	int			endlocation = -1;
 	int			parenlevel = 0;
 	PLpgSQL_expr *expr;
 
@@ -2743,6 +2740,8 @@ read_sql_construct(int until,
 								expected),
 						 parser_errposition(yylloc)));
 		}
+		/* Remember end+1 location of last accepted token */
+		endlocation = yylloc + plpgsql_token_length();
 	}
 
 	plpgsql_IdentifierLookup = save_IdentifierLookup;
@@ -2753,7 +2752,7 @@ read_sql_construct(int until,
 		*endtoken = tok;
 
 	/* give helpful complaint about empty input */
-	if (startlocation >= yylloc)
+	if (startlocation >= endlocation)
 	{
 		if (isexpression)
 			yyerror("missing expression");
@@ -2761,14 +2760,14 @@ read_sql_construct(int until,
 			yyerror("missing SQL statement");
 	}
 
-	plpgsql_append_source_text(&ds, startlocation, yylloc);
-
-	/* trim any trailing whitespace, for neatness */
-	if (trim)
-	{
-		while (ds.len > 0 && scanner_isspace(ds.data[ds.len - 1]))
-			ds.data[--ds.len] = '\0';
-	}
+	/*
+	 * We save only the text from startlocation to endlocation-1.  This
+	 * suppresses the "until" token as well as any whitespace or comments
+	 * following the last accepted token.  (We used to strip such trailing
+	 * whitespace by hand, but that causes problems if there's a "-- comment"
+	 * in front of said whitespace.)
+	 */
+	plpgsql_append_source_text(&ds, startlocation, endlocation);
 
 	expr = palloc0(sizeof(PLpgSQL_expr));
 	expr->query = pstrdup(ds.data);
@@ -3933,16 +3932,12 @@ read_cursor_args(PLpgSQL_var *cursor, int until)
 		 * Read the value expression. To provide the user with meaningful
 		 * parse error positions, we check the syntax immediately, instead of
 		 * checking the final expression that may have the arguments
-		 * reordered. Trailing whitespace must not be trimmed, because
-		 * otherwise input of the form (param -- comment\n, param) would be
-		 * translated into a form where the second parameter is commented
-		 * out.
+		 * reordered.
 		 */
 		item = read_sql_construct(',', ')', 0,
 								  ",\" or \")",
 								  RAW_PARSE_PLPGSQL_EXPR,
 								  true, true,
-								  false, /* do not trim */
 								  NULL, &endtoken);
 
 		argv[argpos] = item->query;
diff --git a/src/pl/plpgsql/src/pl_scanner.c b/src/pl/plpgsql/src/pl_scanner.c
index 101804d506..9407da51ef 100644
--- a/src/pl/plpgsql/src/pl_scanner.c
+++ b/src/pl/plpgsql/src/pl_scanner.c
@@ -184,6 +184,8 @@ plpgsql_yylex(void)
 							tok1 = T_DATUM;
 						else
 							tok1 = T_CWORD;
+						/* Adjust token length to include A.B.C */
+						aux1.leng = aux5.lloc - aux1.lloc + aux5.leng;
 					}
 					else
 					{
@@ -197,6 +199,8 @@ plpgsql_yylex(void)
 							tok1 = T_DATUM;
 						else
 							tok1 = T_CWORD;
+						/* Adjust token length to include A.B */
+						aux1.leng = aux3.lloc - aux1.lloc + aux3.leng;
 					}
 				}
 				else
@@ -210,6 +214,8 @@ plpgsql_yylex(void)
 						tok1 = T_DATUM;
 					else
 						tok1 = T_CWORD;
+					/* Adjust token length to include A.B */
+					aux1.leng = aux3.lloc - aux1.lloc + aux3.leng;
 				}
 			}
 			else
@@ -298,6 +304,17 @@ plpgsql_yylex(void)
 	return tok1;
 }
 
+/*
+ * Return the length of the token last returned by plpgsql_yylex().
+ *
+ * In the case of compound tokens, the length includes all the parts.
+ */
+int
+plpgsql_token_length(void)
+{
+	return plpgsql_yyleng;
+}
+
 /*
  * Internal yylex function.  This wraps the core lexer and adds one feature:
  * a token pushback stack.  We also make a couple of trivial single-token
diff --git a/src/pl/plpgsql/src/plpgsql.h b/src/pl/plpgsql/src/plpgsql.h
index 40056bb851..50c3b28472 100644
--- a/src/pl/plpgsql/src/plpgsql.h
+++ b/src/pl/plpgsql/src/plpgsql.h
@@ -1317,6 +1317,7 @@ extern void plpgsql_dumptree(PLpgSQL_function *func);
  */
 extern int	plpgsql_base_yylex(void);
 extern int	plpgsql_yylex(void);
+extern int	plpgsql_token_length(void);
 extern void plpgsql_push_back_token(int token);
 extern bool plpgsql_token_is_unreserved_keyword(int token);
 extern void plpgsql_append_source_text(StringInfo buf,
diff --git a/src/pl/plpgsql/src/sql/plpgsql_control.sql b/src/pl/plpgsql/src/sql/plpgsql_control.sql
index ed7231134f..8e007c51dc 100644
--- a/src/pl/plpgsql/src/sql/plpgsql_control.sql
+++ b/src/pl/plpgsql/src/sql/plpgsql_control.sql
@@ -486,3 +486,17 @@ select case_test(1);
 select case_test(2);
 select case_test(12);
 select case_test(13);
+
+-- test line comment between WHEN and THEN
+create or replace function case_comment(int) returns text as $$
+begin
+  case $1
+    when 1 -- comment before THEN
+      then return 'one';
+    else
+      return 'other';
+  end case;
+end;
+$$ language plpgsql immutable;
+
+select case_comment(1);
diff --git a/src/test/regress/expected/plpgsql.out b/src/test/regress/expected/plpgsql.out
index 0637256676..074af8f33a 100644
--- a/src/test/regress/expected/plpgsql.out
+++ b/src/test/regress/expected/plpgsql.out
@@ -2390,11 +2390,9 @@ select namedparmcursor_test7();
 ERROR:  division by zero
 CONTEXT:  SQL expression "42/0 AS p1, 77 AS p2"
 PL/pgSQL function namedparmcursor_test7() line 6 at OPEN
--- check that line comments work correctly within the argument list (there
--- is some special handling of this case in the code: the newline after the
--- comment must be preserved when the argument-evaluating query is
--- constructed, otherwise the comment effectively comments out the next
--- argument, too)
+-- check that line comments work correctly within the argument list
+-- (this used to require a special hack in the code; it no longer does,
+-- but let's keep the test anyway)
 create function namedparmcursor_test8() returns int4 as $$
 declare
   c1 cursor (p1 int, p2 int) for
diff --git a/src/test/regress/sql/plpgsql.sql b/src/test/regress/sql/plpgsql.sql
index 9ca9449a50..18c91572ae 100644
--- a/src/test/regress/sql/plpgsql.sql
+++ b/src/test/regress/sql/plpgsql.sql
@@ -2047,11 +2047,9 @@ begin
 end $$ language plpgsql;
 select namedparmcursor_test7();
 
--- check that line comments work correctly within the argument list (there
--- is some special handling of this case in the code: the newline after the
--- comment must be preserved when the argument-evaluating query is
--- constructed, otherwise the comment effectively comments out the next
--- argument, too)
+-- check that line comments work correctly within the argument list
+-- (this used to require a special hack in the code; it no longer does,
+-- but let's keep the test anyway)
 create function namedparmcursor_test8() returns int4 as $$
 declare
   c1 cursor (p1 int, p2 int) for

Reply via email to