At Sat, 04 Sep 2021 10:18:24 -0400, Tom Lane <t...@sss.pgh.pa.us> wrote in 
> I kind of wonder if it isn't time to enlist the help of psqlscan.l
> instead of doubling down on the idea that tab-complete.c should have
> its own half-baked SQL lexer.

So, I played with this idea and came up with the attached WIP.  With
it, the test added by the original patch succeeds without tweaking the
"Matches" part in psql_completion.

While checking this, I found several dubious parts in the TAP test.

=== 010_tab_completion.pl
 # COPY requires quoting
 # note: broken versions of libedit want to backslash the closing quote;
 # not much we can do about that
 check_completion(
-       "COPY foo FROM tmp_check/some\t",
+       "COPY foo FROM \'tmp_check/some\t",

The original command syntax is just wrong. This patch makes the
completion code tokenize the command line correctly (it breaks the
unquoted "filename" into "tmp_check", "/" and "some"), so the test item fails.

check_completion(
-       "COPY foo FROM tmp_check/af\t",
-       qr|'tmp_check/afile|,
+       "COPY foo FROM \'tmp_check/af\t",
+       qr|'tmp_check/af\a?ile|,        # \a is BEL

This test fails for the same reason, but after fixing it the result
contains \a (BEL) in the output on my CentOS 8 machine. I'm not sure
what is happening here.


regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
diff --git a/src/bin/psql/psqlscanslash.l b/src/bin/psql/psqlscanslash.l
index 51aa33e161..5d0d625809 100644
--- a/src/bin/psql/psqlscanslash.l
+++ b/src/bin/psql/psqlscanslash.l
@@ -149,11 +149,15 @@ other			.
 
 {space}|"\\"	{
 					yyless(0);
+					psqlscan_mark_whitespace(cur_state, true);
 					cur_state->start_state = YY_START;
 					return LEXRES_OK;
 				}
 
-{other}			{ ECHO; }
+{other}			{
+					psqlscan_mark_whitespace(cur_state, false);
+					ECHO;
+				}
 
 }
 
@@ -164,9 +168,10 @@ other			.
 	 * check for it here.
 	 */
 
-{space}+		{ }
+{space}+		{  psqlscan_mark_whitespace(cur_state, true); }
 
 "|"				{
+					psqlscan_mark_whitespace(cur_state, false);
 					if (option_type == OT_FILEPIPE)
 					{
 						/* treat like whole-string case */
@@ -182,6 +187,7 @@ other			.
 				}
 
 {other}			{
+					psqlscan_mark_whitespace(cur_state, false);
 					yyless(0);
 					BEGIN(xslasharg);
 				}
@@ -207,18 +213,21 @@ other			.
 					 * processing that encourages use of backslashes is rather
 					 * broken.
 					 */
+					psqlscan_mark_whitespace(cur_state, true);
 					yyless(0);
 					cur_state->start_state = YY_START;
 					return LEXRES_OK;
 				}
 
 {quote}			{
+					psqlscan_mark_whitespace(cur_state, false);
 					*option_quote = '\'';
 					unquoted_option_chars = 0;
 					BEGIN(xslashquote);
 				}
 
 "`"				{
+					psqlscan_mark_whitespace(cur_state, false);
 					backtick_start_offset = output_buf->len;
 					*option_quote = '`';
 					unquoted_option_chars = 0;
@@ -226,6 +235,7 @@ other			.
 				}
 
 {dquote}		{
+					psqlscan_mark_whitespace(cur_state, false);
 					ECHO;
 					*option_quote = '"';
 					unquoted_option_chars = 0;
@@ -233,6 +243,7 @@ other			.
 				}
 
 :{variable_char}+	{
+					psqlscan_mark_whitespace(cur_state, false);
 					/* Possible psql variable substitution */
 					if (cur_state->callbacks->get_variable == NULL)
 						ECHO;
@@ -278,6 +289,7 @@ other			.
 
 
 :\"{variable_char}+\"	{
+					psqlscan_mark_whitespace(cur_state, false);
 					psqlscan_escape_variable(cur_state, yytext, yyleng,
 											 PQUOTE_SQL_IDENT);
 					*option_quote = ':';
@@ -285,6 +297,7 @@ other			.
 				}
 
 :\{\?{variable_char}+\}	{
+					psqlscan_mark_whitespace(cur_state, false);
 					psqlscan_test_variable(cur_state, yytext, yyleng);
 				}
 
@@ -317,6 +330,7 @@ other			.
 				}
 
 {other}			{
+					psqlscan_mark_whitespace(cur_state, false);
 					unquoted_option_chars++;
 					ECHO;
 				}
diff --git a/src/bin/psql/t/010_tab_completion.pl b/src/bin/psql/t/010_tab_completion.pl
index 8695d22545..80181d1355 100644
--- a/src/bin/psql/t/010_tab_completion.pl
+++ b/src/bin/psql/t/010_tab_completion.pl
@@ -194,19 +194,27 @@ check_completion(
 
 clear_query();
 
+# check tab-completion for CONNECTION string with equal sign.
+check_completion(
+	"CREATE SUBSCRIPTION my_sub CONNECTION 'host=localhost port=5432 dbname=postgres' \t",
+	qr|PUBLICATION|,
+	"tab-completion for CONNECTION string with equal sign");
+
+clear_query();
+
 # COPY requires quoting
 # note: broken versions of libedit want to backslash the closing quote;
 # not much we can do about that
 check_completion(
-	"COPY foo FROM tmp_check/some\t",
+	"COPY foo FROM \'tmp_check/some\t",
 	qr|'tmp_check/somefile\\?' |,
 	"quoted filename completion with one possibility");
 
 clear_line();
 
 check_completion(
-	"COPY foo FROM tmp_check/af\t",
-	qr|'tmp_check/afile|,
+	"COPY foo FROM \'tmp_check/af\t",
+	qr|'tmp_check/af\a?ile|,	# \a is BEL
 	"quoted filename completion with multiple possibilities");
 
 # some versions of readline/libedit require two tabs here, some only need one
diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c
index 5cd5838668..86871bd0a2 100644
--- a/src/bin/psql/tab-complete.c
+++ b/src/bin/psql/tab-complete.c
@@ -50,6 +50,7 @@
 #include "pqexpbuffer.h"
 #include "settings.h"
 #include "stringutils.h"
+#include "psqlscanslash.h"
 
 /*
  * Ancient versions of libedit provide filename_completion_function()
@@ -4983,15 +4984,24 @@ exec_query(const char *query)
  * Words are returned right to left, that is, previous_words[0] gets the last
  * word before point, previous_words[1] the next-to-last, etc.
  */
+static const PsqlScanCallbacks psqlscan_callbacks = {
+	NULL,
+};
 static char **
 get_previous_words(int point, char **buffer, int *nwords)
 {
 	char	  **previous_words;
 	char	   *buf;
 	char	   *outptr;
+	char	   *endptr;
 	int			words_found = 0;
 	int			i;
+	PsqlScanState scan_state;
+	PsqlScanToken tok;
+	PQExpBufferData query_buf;
 
+	initPQExpBuffer(&query_buf);
+	
 	/*
 	 * If we have anything in tab_completion_query_buf, paste it together with
 	 * rl_line_buffer to construct the full query.  Otherwise we can just use
@@ -5021,81 +5031,61 @@ get_previous_words(int point, char **buffer, int *nwords)
 	 */
 	previous_words = (char **) pg_malloc(point * sizeof(char *));
 	*buffer = outptr = (char *) pg_malloc(point * 2);
+	endptr = outptr + point * 2;	/* limit + 1 */
 
-	/*
-	 * First we look for a non-word char before the current point.  (This is
-	 * probably useless, if readline is on the same page as we are about what
-	 * is a word, but if so it's cheap.)
-	 */
-	for (i = point - 1; i >= 0; i--)
+	scan_state = psql_scan_create(&psqlscan_callbacks);
+	psql_scan_setup(scan_state, buf, strlen(buf),
+					pset.encoding, standard_strings());
+	if (psql_scan(scan_state, NULL,  NULL) != PSCAN_BACKSLASH)
 	{
-		if (strchr(WORD_BREAKS, buf[i]))
-			break;
-	}
-	point = i;
+		words_found = psql_scan_get_ntokens(scan_state);
 
-	/*
-	 * Now parse words, working backwards, until we hit start of line.  The
-	 * backwards scan has some interesting but intentional properties
-	 * concerning parenthesis handling.
-	 */
-	while (point >= 0)
-	{
-		int			start,
-					end;
-		bool		inquotes = false;
-		int			parentheses = 0;
+		if (words_found > 0 && !psql_scan_last_token_is_whitespace(scan_state))
+			words_found--;
 
-		/* now find the first non-space which then constitutes the end */
-		end = -1;
-		for (i = point; i >= 0; i--)
+		tok = NULL;
+		for (i = words_found - 1 ; i >= 0 ; i--)
 		{
-			if (!isspace((unsigned char) buf[i]))
-			{
-				end = i;
-				break;
-			}
+			tok = psql_scan_get_next_token(scan_state, tok);
+			previous_words[i] = outptr;
+			outptr += psql_scan_get_token_string(scan_state, tok, outptr);
 		}
-		/* if no end found, we're done */
-		if (end < 0)
-			break;
+		psql_scan_destroy(scan_state);
+	}
+	else
+	{
+		char *word = psql_scan_slash_command(scan_state);
+		words_found = 0;
 
-		/*
-		 * Otherwise we now look for the start.  The start is either the last
-		 * character before any word-break character going backwards from the
-		 * end, or it's simply character 0.  We also handle open quotes and
-		 * parentheses.
-		 */
-		for (start = end; start > 0; start--)
+		if (word && psql_scan_last_token_is_whitespace(scan_state))
 		{
-			if (buf[start] == '"')
-				inquotes = !inquotes;
-			if (!inquotes)
+			int  n = point - 1;
+
+			previous_words[n--] = outptr;
+			*outptr++ = '\\';
+			strcpy(outptr, word);
+			free(word);
+			outptr += strlen(outptr) + 1;
+			words_found++;
+
+			while ((word = psql_scan_slash_option(scan_state, OT_NORMAL,
+												  NULL, false)) != NULL &&
+				   psql_scan_last_token_is_whitespace(scan_state))
 			{
-				if (buf[start] == ')')
-					parentheses++;
-				else if (buf[start] == '(')
-				{
-					if (--parentheses <= 0)
-						break;
-				}
-				else if (parentheses == 0 &&
-						 strchr(WORD_BREAKS, buf[start - 1]))
-					break;
+				previous_words[n--] = outptr;
+				strcpy(outptr, word);
+				free(word);
+				outptr += strlen(outptr) + 1;
+				words_found++;
 			}
-		}
 
-		/* Return the word located at start to end inclusive */
-		previous_words[words_found++] = outptr;
-		i = end - start + 1;
-		memcpy(outptr, &buf[start], i);
-		outptr += i;
-		*outptr++ = '\0';
-
-		/* Continue searching */
-		point = start - 1;
+			memcpy(previous_words, previous_words + n + 1,
+				   words_found * sizeof(char *));
+		}
 	}
 
+	Assert (point == 0 || outptr < endptr);
+
 	/* Release parsing input workspace, if we made one above */
 	if (buf != rl_line_buffer)
 		free(buf);
@@ -5104,6 +5094,7 @@ get_previous_words(int point, char **buffer, int *nwords)
 	return previous_words;
 }
 
+
 /*
  * Look up the type for the GUC variable with the passed name.
  *
diff --git a/src/fe_utils/psqlscan.l b/src/fe_utils/psqlscan.l
index 0fab48a382..4225c50efd 100644
--- a/src/fe_utils/psqlscan.l
+++ b/src/fe_utils/psqlscan.l
@@ -381,8 +381,10 @@ other			.
 					 * which is pretty dubious but is the historical
 					 * behavior.
 					 */
-					if (!(output_buf->len == 0 || yytext[0] == '-'))
+					if (output_buf &&
+						!(output_buf->len == 0 || yytext[0] == '-'))
 						ECHO;
+					psqlscan_mark_whitespace(cur_state, true);
 				}
 
 {xcstart}		{
@@ -414,6 +416,7 @@ other			.
 				}
 
 {op_chars}		{
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_OP);
 					ECHO;
 				}
 
@@ -424,10 +427,13 @@ other			.
 
 {xbstart}		{
 					BEGIN(xb);
+					cur_state->quote_depth++;
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_STRING);
 					ECHO;
 				}
 <xh>{xhinside}	|
 <xb>{xbinside}	{
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_STRING);
 					ECHO;
 				}
 
@@ -439,11 +445,14 @@ other			.
 					 * to mark it for the input routine as a hex string.
 					 */
 					BEGIN(xh);
+					cur_state->quote_depth++;
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_STRING);
 					ECHO;
 				}
 
 {xnstart}		{
 					yyless(1);	/* eat only 'n' this time */
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_STRING);
 					ECHO;
 				}
 
@@ -452,14 +461,20 @@ other			.
 						BEGIN(xq);
 					else
 						BEGIN(xe);
+					cur_state->quote_depth++;
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_STRING);
 					ECHO;
 				}
 {xestart}		{
 					BEGIN(xe);
+					cur_state->quote_depth++;
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_STRING);
 					ECHO;
 				}
 {xusstart}		{
 					BEGIN(xus);
+					cur_state->quote_depth++;
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_STRING);
 					ECHO;
 				}
 
@@ -474,6 +489,7 @@ other			.
 					 */
 					cur_state->state_before_str_stop = YYSTATE;
 					BEGIN(xqs);
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_CONT);
 					ECHO;
 				}
 <xqs>{quotecontinue} {
@@ -483,6 +499,7 @@ other			.
 					 * added to the literal's contents.
 					 */
 					BEGIN(cur_state->state_before_str_stop);
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_CONT);
 					ECHO;
 				}
 <xqs>{quotecontinuefail} |
@@ -494,46 +511,62 @@ other			.
 					 */
 					yyless(0);
 					BEGIN(INITIAL);
+
+					if (cur_state->quote_depth > 0)
+						cur_state->quote_depth--;
+					/* terminate this string */
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_CONT);
 					/* There's nothing to echo ... */
 				}
 
 <xq,xe,xus>{xqdouble} {
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_STRING);
 					ECHO;
 				}
 <xq,xus>{xqinside}  {
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_CONT);
 					ECHO;
 				}
 <xe>{xeinside}  {
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_CONT);
 					ECHO;
 				}
 <xe>{xeunicode} {
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_CONT);
 					ECHO;
 				}
 <xe>{xeunicodefail}	{
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_CONT);
 					ECHO;
 				}
 <xe>{xeescape}  {
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_CONT);
 					ECHO;
 				}
 <xe>{xeoctesc}  {
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_CONT);
 					ECHO;
 				}
 <xe>{xehexesc}  {
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_CONT);
 					ECHO;
 				}
 <xe>.			{
 					/* This is only needed for \ just before EOF */
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_CONT);
 					ECHO;
 				}
 
 {dolqdelim}		{
 					cur_state->dolqstart = pg_strdup(yytext);
 					BEGIN(xdolq);
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 {dolqfailed}	{
 					/* throw back all but the initial "$" */
 					yyless(1);
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 <xdolq>{dolqdelim} {
@@ -552,77 +585,102 @@ other			.
 						 */
 						yyless(yyleng - 1);
 					}
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 <xdolq>{dolqinside} {
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 <xdolq>{dolqfailed} {
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 <xdolq>.		{
 					/* This is only needed for $ inside the quoted text */
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 
 {xdstart}		{
 					BEGIN(xd);
+					cur_state->quote_depth++;
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_IDENTIFIER);
 					ECHO;
 				}
 {xuistart}		{
 					BEGIN(xui);
+					cur_state->quote_depth++;
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_IDENTIFIER);
 					ECHO;
 				}
 <xd>{xdstop}	{
 					BEGIN(INITIAL);
+					if (cur_state->quote_depth > 0)
+						cur_state->quote_depth--;
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_CONT);
 					ECHO;
 				}
 <xui>{dquote}	{
 					BEGIN(INITIAL);
+					if (cur_state->quote_depth > 0)
+						cur_state->quote_depth--;
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_CONT);
 					ECHO;
 				}
 <xd,xui>{xddouble}	{
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_CONT);
 					ECHO;
 				}
 <xd,xui>{xdinside}	{
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_CONT);
 					ECHO;
 				}
 
 {xufailed}	{
 					/* throw back all but the initial u/U */
 					yyless(1);
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 
 {typecast}		{
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 
 {dot_dot}		{
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 
 {colon_equals}	{
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 
 {equals_greater} {
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 
 {less_equals}	{
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 
 {greater_equals} {
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 
 {less_greater}	{
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 
 {not_equals}	{
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 
@@ -634,16 +692,19 @@ other			.
 
 "("				{
 					cur_state->paren_depth++;
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_LPAREN);
 					ECHO;
 				}
 
 ")"				{
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_RPAREN);
 					if (cur_state->paren_depth > 0)
 						cur_state->paren_depth--;
 					ECHO;
 				}
 
 ";"				{
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 					if (cur_state->paren_depth == 0 && cur_state->begin_depth == 0)
 					{
@@ -765,6 +826,7 @@ other			.
 	 */
 
 {self}			{
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 
@@ -832,25 +894,31 @@ other			.
 						/* Strip the unwanted chars from the token */
 						yyless(nchars);
 					}
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_OP);
 					ECHO;
 				}
 
 {param}			{
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 
 {integer}		{
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 {decimal}		{
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 {decimalfail}	{
 					/* throw back the .., and treat as integer */
 					yyless(yyleng - 2);
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 {real}			{
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 {realfail1}		{
@@ -860,11 +928,13 @@ other			.
 					 * (in psql, we don't actually care...)
 					 */
 					yyless(yyleng - 1);
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 {realfail2}		{
 					/* throw back the [Ee][+-], and proceed as above */
 					yyless(yyleng - 2);
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 
@@ -919,10 +989,12 @@ other			.
 						}
 					}
 
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 
 {other}			{
+					psqlscan_add_token(cur_state, yytext, yyleng, PSCAN_TOKEN_WORD);
 					ECHO;
 				}
 
@@ -1036,6 +1108,134 @@ psql_scan_setup(PsqlScanState state,
 	state->refline = state->scanline;
 }
 
+/*
+ * Add a token to the token list used for syntax completion.
+ *
+ * No-op when state->output_buf is set (the caller wants the cleaned-up query
+ * text instead).  While quote_depth > 0, the pieces of a quoted identifier
+ * or string are merged into one token; "type" is kept only for new tokens.
+ */
+void
+psqlscan_add_token(PsqlScanState state, const char *txt, int len,
+					PsqlScanTokenType type)
+{
+	PsqlScanToken tok;
+	PsqlScanToken prev;
+
+	if (state->output_buf)
+		return;
+
+	/* we have met non-whitespace characters */
+	state->last_is_whitespace = false;
+
+	/* merge this token into the previous one if it is continued */
+	if (state->token_continue)
+	{
+		Assert(state->lasttoken);
+		state->lasttoken->len += len;
+
+		/* still continue to the next token? */
+		state->token_continue = (state->quote_depth > 0);
+
+		/*
+		 * If this token has finished, assume it is followed by whitespace.
+		 * (Maybe this is not needed since readline requires a whitespace.)
+		 */
+		if (!state->token_continue)
+			state->last_is_whitespace = true;
+
+		return;
+	}
+
+	/* Append new token; size the struct itself, not the pointer typedef */
+	prev = state->lasttoken;
+	tok = pg_malloc(sizeof(*tok));
+
+	/*
+	 * This logic could be simplified if we stored the tokens in the reverse
+	 * order and it is convenient for tab-completion code, but we choose to
+	 * store in the saner order here.
+	 */
+	if (prev)
+		prev->next = tok;
+	else
+		state->tokenlist = tok;
+
+	state->lasttoken = tok;
+
+	/* if this token starts quote, it continues to the following tokens */
+	state->token_continue = (state->quote_depth > 0);
+	tok->type = type;
+	tok->startpos = txt - state->scanbuf;
+	tok->len = len;
+	tok->paren_depth = state->paren_depth;
+	tok->next = NULL;
+}
+
+/*
+ * Record whether the most recently scanned input was whitespace.
+ */
+void
+
+psqlscan_mark_whitespace(PsqlScanState state, bool is_ws)
+{
+	state->last_is_whitespace = is_ws;
+}
+
+
+/*
+ * Returns the token following prev; prev=NULL requests the first token.
+ */
+const PsqlScanToken
+psql_scan_get_next_token(PsqlScanState state, const PsqlScanToken prev)
+{
+	if (!prev)
+		return state->tokenlist;
+
+	return prev->next;
+}
+
+/*
+ * Copy the token's text into buf, NUL-terminate it, and return the number of
+ * bytes written (tok->len + 1).  No bounds check: buf must be large enough.
+ */
+int
+psql_scan_get_token_string(PsqlScanState state,
+						   const PsqlScanToken tok, char *buf)
+{
+	memcpy(buf, &state->scanline[tok->startpos], tok->len);
+	buf[tok->len] = 0;
+
+	return tok->len + 1;
+}
+
+/*
+ * Returns the number of tokens, counted by walking the token list.
+ */
+int
+psql_scan_get_ntokens(PsqlScanState state)
+{
+	int i = 0;
+	PsqlScanToken tok = state->tokenlist;
+	
+	while (tok)
+	{
+		i++;
+		tok = tok->next;
+	}
+
+	return i;
+}
+
+/*
+ * Returns the whitespace flag most recently recorded in the scan state.
+ */
+bool
+psql_scan_last_token_is_whitespace(PsqlScanState state)
+{
+	return state->last_is_whitespace;
+}
+
 /*
  * Do lexical analysis of SQL command text.
  *
@@ -1075,7 +1275,8 @@ psql_scan(PsqlScanState state,
 {
 	PsqlScanResult result;
 	int			lexresult;
-
+	promptStatus_t	tprompt;
+	
 	/* Must be scanning already */
 	Assert(state->scanbufhandle != NULL);
 
@@ -1104,60 +1305,60 @@ psql_scan(PsqlScanState state,
 					if (state->paren_depth > 0)
 					{
 						result = PSCAN_INCOMPLETE;
-						*prompt = PROMPT_PAREN;
+						tprompt = PROMPT_PAREN;
 					}
 					else if (state->begin_depth > 0)
 					{
 						result = PSCAN_INCOMPLETE;
-						*prompt = PROMPT_CONTINUE;
+						tprompt = PROMPT_CONTINUE;
 					}
-					else if (query_buf->len > 0)
+					else if (query_buf ? query_buf->len > 0 : state->tokenlist != NULL)
 					{
 						result = PSCAN_EOL;
-						*prompt = PROMPT_CONTINUE;
+						tprompt = PROMPT_CONTINUE;
 					}
 					else
 					{
 						/* never bother to send an empty buffer */
 						result = PSCAN_INCOMPLETE;
-						*prompt = PROMPT_READY;
+						tprompt = PROMPT_READY;
 					}
 					break;
 				case xb:
 					result = PSCAN_INCOMPLETE;
-					*prompt = PROMPT_SINGLEQUOTE;
+					tprompt = PROMPT_SINGLEQUOTE;
 					break;
 				case xc:
 					result = PSCAN_INCOMPLETE;
-					*prompt = PROMPT_COMMENT;
+					tprompt = PROMPT_COMMENT;
 					break;
 				case xd:
 					result = PSCAN_INCOMPLETE;
-					*prompt = PROMPT_DOUBLEQUOTE;
+					tprompt = PROMPT_DOUBLEQUOTE;
 					break;
 				case xh:
 					result = PSCAN_INCOMPLETE;
-					*prompt = PROMPT_SINGLEQUOTE;
+					tprompt = PROMPT_SINGLEQUOTE;
 					break;
 				case xe:
 					result = PSCAN_INCOMPLETE;
-					*prompt = PROMPT_SINGLEQUOTE;
+					tprompt = PROMPT_SINGLEQUOTE;
 					break;
 				case xq:
 					result = PSCAN_INCOMPLETE;
-					*prompt = PROMPT_SINGLEQUOTE;
+					tprompt = PROMPT_SINGLEQUOTE;
 					break;
 				case xdolq:
 					result = PSCAN_INCOMPLETE;
-					*prompt = PROMPT_DOLLARQUOTE;
+					tprompt = PROMPT_DOLLARQUOTE;
 					break;
 				case xui:
 					result = PSCAN_INCOMPLETE;
-					*prompt = PROMPT_DOUBLEQUOTE;
+					tprompt = PROMPT_DOUBLEQUOTE;
 					break;
 				case xus:
 					result = PSCAN_INCOMPLETE;
-					*prompt = PROMPT_SINGLEQUOTE;
+					tprompt = PROMPT_SINGLEQUOTE;
 					break;
 				default:
 					/* can't get here */
@@ -1167,11 +1368,11 @@ psql_scan(PsqlScanState state,
 			break;
 		case LEXRES_SEMI:		/* semicolon */
 			result = PSCAN_SEMICOLON;
-			*prompt = PROMPT_READY;
+			tprompt = PROMPT_READY;
 			break;
 		case LEXRES_BACKSLASH:	/* backslash */
 			result = PSCAN_BACKSLASH;
-			*prompt = PROMPT_READY;
+			tprompt = PROMPT_READY;
 			break;
 		default:
 			/* can't get here */
@@ -1179,6 +1380,8 @@ psql_scan(PsqlScanState state,
 			exit(1);
 	}
 
+	if (prompt)
+		*prompt = tprompt;
 	return result;
 }
 
@@ -1221,14 +1424,28 @@ psql_scan_finish(PsqlScanState state)
 void
 psql_scan_reset(PsqlScanState state)
 {
+	PsqlScanToken tok;
+
 	state->start_state = INITIAL;
 	state->paren_depth = 0;
+	state->quote_depth = 0;
 	state->xcdepth = 0;			/* not really necessary */
 	if (state->dolqstart)
 		free(state->dolqstart);
 	state->dolqstart = NULL;
 	state->identifier_count = 0;
 	state->begin_depth = 0;
+	state->token_continue = false;
+
+	/* free the token list; clear lasttoken too so it cannot dangle */
+	tok = state->tokenlist;
+	while (tok)
+	{
+		PsqlScanToken prev = tok;
+		tok = tok->next;
+		free(prev);
+	}
+	state->tokenlist = state->lasttoken = NULL;
 }
 
 /*
@@ -1419,6 +1636,9 @@ psqlscan_emit(PsqlScanState state, const char *txt, int len)
 {
 	PQExpBuffer output_buf = state->output_buf;
 
+	if (!output_buf)
+		return;
+
 	if (state->safe_encoding)
 		appendBinaryPQExpBuffer(output_buf, txt, len);
 	else
diff --git a/src/include/fe_utils/psqlscan.h b/src/include/fe_utils/psqlscan.h
index e55f1fa213..5f32106f3b 100644
--- a/src/include/fe_utils/psqlscan.h
+++ b/src/include/fe_utils/psqlscan.h
@@ -57,6 +57,20 @@ typedef enum
 	PQUOTE_SHELL_ARG			/* quote if needed to be safe in a shell cmd */
 } PsqlScanQuoteType;
 
+typedef enum PsqlScanTokenType
+{
+	PSCAN_TOKEN_WORD,			/* catch-all: numbers, punctuation, etc. */
+	PSCAN_TOKEN_IDENTIFIER,		/* start of a double-quoted identifier */
+	PSCAN_TOKEN_LPAREN,			/* "(" */
+	PSCAN_TOKEN_RPAREN,			/* ")" */
+	PSCAN_TOKEN_STRING,			/* quoted string literal (start or piece) */
+	PSCAN_TOKEN_OP,				/* operator */
+	PSCAN_TOKEN_CONT			/* piece continuing or ending a quoted token */
+} PsqlScanTokenType;
+
+struct PsqlScanTokenData;
+typedef struct PsqlScanTokenData *PsqlScanToken;
+
 /* Callback functions to be used by the lexer */
 typedef struct PsqlScanCallbacks
 {
@@ -76,7 +90,12 @@ extern void psql_scan_setup(PsqlScanState state,
 							const char *line, int line_len,
 							int encoding, bool std_strings);
 extern void psql_scan_finish(PsqlScanState state);
-
+extern const PsqlScanToken psql_scan_get_next_token(PsqlScanState state,
+													const PsqlScanToken prev);
+extern int psql_scan_get_token_string(PsqlScanState state,
+									  const PsqlScanToken tok, char *buf);
+extern int	psql_scan_get_ntokens(PsqlScanState state);
+extern bool psql_scan_last_token_is_whitespace(PsqlScanState state);
 extern PsqlScanResult psql_scan(PsqlScanState state,
 								PQExpBuffer query_buf,
 								promptStatus_t *prompt);
diff --git a/src/include/fe_utils/psqlscan_int.h b/src/include/fe_utils/psqlscan_int.h
index 8ada977092..3fc6e4d97e 100644
--- a/src/include/fe_utils/psqlscan_int.h
+++ b/src/include/fe_utils/psqlscan_int.h
@@ -75,6 +75,19 @@ typedef struct StackElem
 	struct StackElem *next;
 } StackElem;
 
+/*
+ * psql_scan has another mode that stores tokenized data instead of a
+ * cleaned-up string.  This is one token; tokens are chained through "next".
+ */
+typedef struct PsqlScanTokenData
+{
+	PsqlScanTokenType	type;		/* token type */
+	int					startpos;	/* start position in state.scanline */
+	int					len;		/* token length */
+	int					paren_depth; /* depth of parenthesized region */
+	struct PsqlScanTokenData *next;	/* next token, or NULL for the last one */
+} PsqlScanTokenData;
+
 /*
  * All working state of the lexer must be stored in PsqlScanStateData
  * between calls.  This allows us to have multiple open lexer operations,
@@ -112,6 +125,7 @@ typedef struct PsqlScanStateData
 	int			start_state;	/* yylex's starting/finishing state */
 	int			state_before_str_stop;	/* start cond. before end quote */
 	int			paren_depth;	/* depth of nesting in parentheses */
+	int			quote_depth;
 	int			xcdepth;		/* depth of nesting in slash-star comments */
 	char	   *dolqstart;		/* current $foo$ quote start string */
 
@@ -123,6 +137,11 @@ typedef struct PsqlScanStateData
 	char		identifiers[4]; /* records the first few identifiers */
 	int			begin_depth;	/* depth of begin/end pairs */
 
+	PsqlScanToken	tokenlist;		/* list of tokens*/
+	PsqlScanToken	lasttoken;		/* last token for fast access */
+	bool			token_continue;	/* the last token is continuing */
+	bool		last_is_whitespace;	/* last token was whitespace */
+
 	/*
 	 * Callback functions provided by the program making use of the lexer,
 	 * plus a void* callback passthrough argument.
@@ -146,6 +165,9 @@ extern YY_BUFFER_STATE psqlscan_prepare_buffer(PsqlScanState state,
 											   const char *txt, int len,
 											   char **txtcopy);
 extern void psqlscan_emit(PsqlScanState state, const char *txt, int len);
+extern void psqlscan_add_token(PsqlScanState state, const char *txt, int len,
+	PsqlScanTokenType type);
+extern void psqlscan_mark_whitespace(PsqlScanState state, bool is_ws);
 extern char *psqlscan_extract_substring(PsqlScanState state,
 										const char *txt, int len);
 extern void psqlscan_escape_variable(PsqlScanState state,

Reply via email to