Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)

John Naylor Thu, 27 Dec 2018 08:20:45 -0800

On 12/26/18, John Naylor <jcnay...@gmail.com> wrote:
> On 12/26/18, Robert Haas <robertmh...@gmail.com> wrote:
>> I wonder if we could do something really simple like a lookup based on
>> the first character of the scan keyword. It looks to me like there are
>> 440 keywords right now, and the most common starting letter is 'c',
>> which is the first letter of 51 keywords. So dispatching based on the
>> first letter clips at least 3 steps off the binary search.  I don't
>> know whether that's enough to be worthwhile, but it's probably pretty
>> simple to implement.


> I agree it'd be fairly simple to do, and might raise the bar for doing
> anything more complex.

I went ahead and did this for v4, but split out into a separate patch.
In addition, I used a heuristic to bypass binary search for the most
common keywords. Normally, the middle value is computed
mathematically, but I found that in each range of keywords beginning
with the same letter, there is often 1 or 2 common keywords that are
good first guesses, such as select, from, join, limit, where. I taught
the lookup to try those first, and then compute subsequent steps the
usual way.

Barring additional bikeshedding on 0001, I'll plan on implementing
offset-based lookup for the other keyword types and retire the old
ScanKeyword. Once that's done, we can benchmark and compare with the
optimizations in 0002.

-John Naylor

From ed78318df69bd3fa1543f3dc04d0a6ca9794a81c Mon Sep 17 00:00:00 2001
From: John Naylor <jcnay...@gmail.com>
Date: Wed, 26 Dec 2018 21:28:27 -0500
Subject: [PATCH v4 1/2] WIP Use offset-based keyword lookup.

(For WIP, only for plpgsql unreserved keywords)

Replace binary search over an array of ScanKeyword structs with a binary
search over an array of offsets into a keyword string. Access auxillary
data only after a keyword hit. This has better locality of reference and
a smaller memory footprint.
---
 src/common/keywords.c                     |  55 +++++++++
 src/include/common/keywords.h             |  12 ++
 src/pl/plpgsql/src/.gitignore             |   1 +
 src/pl/plpgsql/src/Makefile               |  13 +-
 src/pl/plpgsql/src/pl_scanner.c           | 128 +++++---------------
 src/pl/plpgsql/src/pl_unreserved_kwlist.h | 108 +++++++++++++++++
 src/tools/gen_keywords.pl                 | 137 ++++++++++++++++++++++
 src/tools/msvc/Solution.pm                |  10 ++
 8 files changed, 361 insertions(+), 103 deletions(-)
 create mode 100644 src/pl/plpgsql/src/pl_unreserved_kwlist.h
 create mode 100644 src/tools/gen_keywords.pl

diff --git a/src/common/keywords.c b/src/common/keywords.c
index 0c0c794c68..b0e5a721b6 100644
--- a/src/common/keywords.c
+++ b/src/common/keywords.c
@@ -112,3 +112,58 @@ ScanKeywordLookup(const char *text,
 
 	return NULL;
 }
+
+/* Like ScanKeywordLookup, but uses offsets into a keyword string. */
+int
+ScanKeywordLookupOffset(const char *string_to_lookup,
+						const char *kw_strings,
+						const uint16 *kw_offsets,
+						int num_keywords)
+{
+	int			len,
+				i;
+	char		word[NAMEDATALEN];
+	const uint16 *low;
+	const uint16 *high;
+
+	len = strlen(string_to_lookup);
+	/* We assume all keywords are shorter than NAMEDATALEN. */
+	if (len >= NAMEDATALEN)
+		return -1;
+
+	/*
+	 * Apply an ASCII-only downcasing.  We must not use tolower() since it may
+	 * produce the wrong translation in some locales (eg, Turkish).
+	 */
+	for (i = 0; i < len; i++)
+	{
+		char		ch = string_to_lookup[i];
+
+		if (ch >= 'A' && ch <= 'Z')
+			ch += 'a' - 'A';
+		word[i] = ch;
+	}
+	word[len] = '\0';
+
+	/*
+	 * Now do a binary search using plain strcmp() comparison.
+	 */
+	low = kw_offsets;
+	high = kw_offsets + (num_keywords - 1);
+	while (low <= high)
+	{
+		const uint16 *middle;
+		int			difference;
+
+		middle = low + (high - low) / 2;
+		difference = strcmp(kw_strings + *middle, word);
+		if (difference == 0)
+			return middle - kw_offsets;
+		else if (difference < 0)
+			low = middle + 1;
+		else
+			high = middle - 1;
+	}
+
+	return -1;
+}
diff --git a/src/include/common/keywords.h b/src/include/common/keywords.h
index 0b31505b66..201d0fcc7a 100644
--- a/src/include/common/keywords.h
+++ b/src/include/common/keywords.h
@@ -28,6 +28,13 @@ typedef struct ScanKeyword
 	int16		category;		/* see codes above */
 } ScanKeyword;
 
+/* Payload data for keywords */
+typedef struct ScanKeywordAux
+{
+	int16		value;			/* grammar's token code */
+	char		category;		/* see codes above */
+} ScanKeywordAux;
+
 #ifndef FRONTEND
 extern PGDLLIMPORT const ScanKeyword ScanKeywords[];
 extern PGDLLIMPORT const int NumScanKeywords;
@@ -41,4 +48,9 @@ extern const ScanKeyword *ScanKeywordLookup(const char *text,
 				  const ScanKeyword *keywords,
 				  int num_keywords);
 
+int ScanKeywordLookupOffset(const char *string_to_lookup,
+						const char *kw_strings,
+						const uint16 *kw_offsets,
+						int num_keywords);
+
 #endif							/* KEYWORDS_H */
diff --git a/src/pl/plpgsql/src/.gitignore b/src/pl/plpgsql/src/.gitignore
index ff6ac965fd..a649302fdb 100644
--- a/src/pl/plpgsql/src/.gitignore
+++ b/src/pl/plpgsql/src/.gitignore
@@ -1,3 +1,4 @@
+/*kwlist_d.h
 /pl_gram.c
 /pl_gram.h
 /plerrcodes.h
diff --git a/src/pl/plpgsql/src/Makefile b/src/pl/plpgsql/src/Makefile
index 25a5a9d448..9112aa7d23 100644
--- a/src/pl/plpgsql/src/Makefile
+++ b/src/pl/plpgsql/src/Makefile
@@ -60,7 +60,7 @@ uninstall-headers:
 
 
 # Force these dependencies to be known even without dependency info built:
-pl_gram.o pl_handler.o pl_comp.o pl_exec.o pl_funcs.o pl_scanner.o: plpgsql.h pl_gram.h plerrcodes.h
+pl_gram.o pl_handler.o pl_comp.o pl_exec.o pl_funcs.o pl_scanner.o: plpgsql.h pl_gram.h plerrcodes.h pl_unreserved_kwlist_d.h
 
 # See notes in src/backend/parser/Makefile about the following two rules
 pl_gram.h: pl_gram.c
@@ -72,6 +72,9 @@ pl_gram.c: BISONFLAGS += -d
 plerrcodes.h: $(top_srcdir)/src/backend/utils/errcodes.txt generate-plerrcodes.pl
 	$(PERL) $(srcdir)/generate-plerrcodes.pl $< > $@
 
+# generate keyword headers for the scanner
+pl_unreserved_kwlist_d.h: pl_unreserved_kwlist.h $(top_srcdir)/src/tools/gen_keywords.pl
+	$(PERL) $(top_srcdir)/src/tools/gen_keywords.pl $<
 
 check: submake
 	$(pg_regress_check) $(REGRESS_OPTS) $(REGRESS)
@@ -84,13 +87,13 @@ submake:
 	$(MAKE) -C $(top_builddir)/src/test/regress pg_regress$(X)
 
 
-distprep: pl_gram.h pl_gram.c plerrcodes.h
+distprep: pl_gram.h pl_gram.c plerrcodes.h pl_unreserved_kwlist_d.h
 
-# pl_gram.c, pl_gram.h and plerrcodes.h are in the distribution tarball,
-# so they are not cleaned here.
+# pl_gram.c, pl_gram.h, plerrcodes.h, the generated keyword headers are
+# in the distribution tarball, so they are not cleaned here.
 clean distclean: clean-lib
 	rm -f $(OBJS)
 	rm -rf $(pg_regress_clean_files)
 
 maintainer-clean: distclean
-	rm -f pl_gram.c pl_gram.h plerrcodes.h
+	rm -f pl_gram.c pl_gram.h plerrcodes.h pl_unreserved_kwlist_d.h
diff --git a/src/pl/plpgsql/src/pl_scanner.c b/src/pl/plpgsql/src/pl_scanner.c
index ab18946847..0b2f331117 100644
--- a/src/pl/plpgsql/src/pl_scanner.c
+++ b/src/pl/plpgsql/src/pl_scanner.c
@@ -21,6 +21,8 @@
 #include "plpgsql.h"
 #include "pl_gram.h"			/* must be after parser/scanner.h */
 
+/* String lookup table for keywords */
+#include "pl_unreserved_kwlist_d.h"
 
 #define PG_KEYWORD(a,b,c) {a,b,c},
 
@@ -96,88 +98,12 @@ static const ScanKeyword reserved_keywords[] = {
 
 static const int num_reserved_keywords = lengthof(reserved_keywords);
 
-static const ScanKeyword unreserved_keywords[] = {
-	PG_KEYWORD("absolute", K_ABSOLUTE, UNRESERVED_KEYWORD)
-	PG_KEYWORD("alias", K_ALIAS, UNRESERVED_KEYWORD)
-	PG_KEYWORD("array", K_ARRAY, UNRESERVED_KEYWORD)
-	PG_KEYWORD("assert", K_ASSERT, UNRESERVED_KEYWORD)
-	PG_KEYWORD("backward", K_BACKWARD, UNRESERVED_KEYWORD)
-	PG_KEYWORD("call", K_CALL, UNRESERVED_KEYWORD)
-	PG_KEYWORD("close", K_CLOSE, UNRESERVED_KEYWORD)
-	PG_KEYWORD("collate", K_COLLATE, UNRESERVED_KEYWORD)
-	PG_KEYWORD("column", K_COLUMN, UNRESERVED_KEYWORD)
-	PG_KEYWORD("column_name", K_COLUMN_NAME, UNRESERVED_KEYWORD)
-	PG_KEYWORD("commit", K_COMMIT, UNRESERVED_KEYWORD)
-	PG_KEYWORD("constant", K_CONSTANT, UNRESERVED_KEYWORD)
-	PG_KEYWORD("constraint", K_CONSTRAINT, UNRESERVED_KEYWORD)
-	PG_KEYWORD("constraint_name", K_CONSTRAINT_NAME, UNRESERVED_KEYWORD)
-	PG_KEYWORD("continue", K_CONTINUE, UNRESERVED_KEYWORD)
-	PG_KEYWORD("current", K_CURRENT, UNRESERVED_KEYWORD)
-	PG_KEYWORD("cursor", K_CURSOR, UNRESERVED_KEYWORD)
-	PG_KEYWORD("datatype", K_DATATYPE, UNRESERVED_KEYWORD)
-	PG_KEYWORD("debug", K_DEBUG, UNRESERVED_KEYWORD)
-	PG_KEYWORD("default", K_DEFAULT, UNRESERVED_KEYWORD)
-	PG_KEYWORD("detail", K_DETAIL, UNRESERVED_KEYWORD)
-	PG_KEYWORD("diagnostics", K_DIAGNOSTICS, UNRESERVED_KEYWORD)
-	PG_KEYWORD("do", K_DO, UNRESERVED_KEYWORD)
-	PG_KEYWORD("dump", K_DUMP, UNRESERVED_KEYWORD)
-	PG_KEYWORD("elseif", K_ELSIF, UNRESERVED_KEYWORD)
-	PG_KEYWORD("elsif", K_ELSIF, UNRESERVED_KEYWORD)
-	PG_KEYWORD("errcode", K_ERRCODE, UNRESERVED_KEYWORD)
-	PG_KEYWORD("error", K_ERROR, UNRESERVED_KEYWORD)
-	PG_KEYWORD("exception", K_EXCEPTION, UNRESERVED_KEYWORD)
-	PG_KEYWORD("exit", K_EXIT, UNRESERVED_KEYWORD)
-	PG_KEYWORD("fetch", K_FETCH, UNRESERVED_KEYWORD)
-	PG_KEYWORD("first", K_FIRST, UNRESERVED_KEYWORD)
-	PG_KEYWORD("forward", K_FORWARD, UNRESERVED_KEYWORD)
-	PG_KEYWORD("get", K_GET, UNRESERVED_KEYWORD)
-	PG_KEYWORD("hint", K_HINT, UNRESERVED_KEYWORD)
-	PG_KEYWORD("import", K_IMPORT, UNRESERVED_KEYWORD)
-	PG_KEYWORD("info", K_INFO, UNRESERVED_KEYWORD)
-	PG_KEYWORD("insert", K_INSERT, UNRESERVED_KEYWORD)
-	PG_KEYWORD("is", K_IS, UNRESERVED_KEYWORD)
-	PG_KEYWORD("last", K_LAST, UNRESERVED_KEYWORD)
-	PG_KEYWORD("log", K_LOG, UNRESERVED_KEYWORD)
-	PG_KEYWORD("message", K_MESSAGE, UNRESERVED_KEYWORD)
-	PG_KEYWORD("message_text", K_MESSAGE_TEXT, UNRESERVED_KEYWORD)
-	PG_KEYWORD("move", K_MOVE, UNRESERVED_KEYWORD)
-	PG_KEYWORD("next", K_NEXT, UNRESERVED_KEYWORD)
-	PG_KEYWORD("no", K_NO, UNRESERVED_KEYWORD)
-	PG_KEYWORD("notice", K_NOTICE, UNRESERVED_KEYWORD)
-	PG_KEYWORD("open", K_OPEN, UNRESERVED_KEYWORD)
-	PG_KEYWORD("option", K_OPTION, UNRESERVED_KEYWORD)
-	PG_KEYWORD("perform", K_PERFORM, UNRESERVED_KEYWORD)
-	PG_KEYWORD("pg_context", K_PG_CONTEXT, UNRESERVED_KEYWORD)
-	PG_KEYWORD("pg_datatype_name", K_PG_DATATYPE_NAME, UNRESERVED_KEYWORD)
-	PG_KEYWORD("pg_exception_context", K_PG_EXCEPTION_CONTEXT, UNRESERVED_KEYWORD)
-	PG_KEYWORD("pg_exception_detail", K_PG_EXCEPTION_DETAIL, UNRESERVED_KEYWORD)
-	PG_KEYWORD("pg_exception_hint", K_PG_EXCEPTION_HINT, UNRESERVED_KEYWORD)
-	PG_KEYWORD("print_strict_params", K_PRINT_STRICT_PARAMS, UNRESERVED_KEYWORD)
-	PG_KEYWORD("prior", K_PRIOR, UNRESERVED_KEYWORD)
-	PG_KEYWORD("query", K_QUERY, UNRESERVED_KEYWORD)
-	PG_KEYWORD("raise", K_RAISE, UNRESERVED_KEYWORD)
-	PG_KEYWORD("relative", K_RELATIVE, UNRESERVED_KEYWORD)
-	PG_KEYWORD("reset", K_RESET, UNRESERVED_KEYWORD)
-	PG_KEYWORD("return", K_RETURN, UNRESERVED_KEYWORD)
-	PG_KEYWORD("returned_sqlstate", K_RETURNED_SQLSTATE, UNRESERVED_KEYWORD)
-	PG_KEYWORD("reverse", K_REVERSE, UNRESERVED_KEYWORD)
-	PG_KEYWORD("rollback", K_ROLLBACK, UNRESERVED_KEYWORD)
-	PG_KEYWORD("row_count", K_ROW_COUNT, UNRESERVED_KEYWORD)
-	PG_KEYWORD("rowtype", K_ROWTYPE, UNRESERVED_KEYWORD)
-	PG_KEYWORD("schema", K_SCHEMA, UNRESERVED_KEYWORD)
-	PG_KEYWORD("schema_name", K_SCHEMA_NAME, UNRESERVED_KEYWORD)
-	PG_KEYWORD("scroll", K_SCROLL, UNRESERVED_KEYWORD)
-	PG_KEYWORD("set", K_SET, UNRESERVED_KEYWORD)
-	PG_KEYWORD("slice", K_SLICE, UNRESERVED_KEYWORD)
-	PG_KEYWORD("sqlstate", K_SQLSTATE, UNRESERVED_KEYWORD)
-	PG_KEYWORD("stacked", K_STACKED, UNRESERVED_KEYWORD)
-	PG_KEYWORD("table", K_TABLE, UNRESERVED_KEYWORD)
-	PG_KEYWORD("table_name", K_TABLE_NAME, UNRESERVED_KEYWORD)
-	PG_KEYWORD("type", K_TYPE, UNRESERVED_KEYWORD)
-	PG_KEYWORD("use_column", K_USE_COLUMN, UNRESERVED_KEYWORD)
-	PG_KEYWORD("use_variable", K_USE_VARIABLE, UNRESERVED_KEYWORD)
-	PG_KEYWORD("variable_conflict", K_VARIABLE_CONFLICT, UNRESERVED_KEYWORD)
-	PG_KEYWORD("warning", K_WARNING, UNRESERVED_KEYWORD)
+/* FIXME: Have to redefine this symbol for the WIP. */
+#undef PG_KEYWORD
+#define PG_KEYWORD(kwname, value, category) {value, category},
+
+static const ScanKeywordAux unreserved_keywords[] = {
+#include "pl_unreserved_kwlist.h"
 };
 
 static const int num_unreserved_keywords = lengthof(unreserved_keywords);
@@ -256,7 +182,7 @@ plpgsql_yylex(void)
 {
 	int			tok1;
 	TokenAuxData aux1;
-	const ScanKeyword *kw;
+	int kwnum;
 
 	tok1 = internal_yylex(&aux1);
 	if (tok1 == IDENT || tok1 == PARAM)
@@ -332,12 +258,14 @@ plpgsql_yylex(void)
 									   &aux1.lval.word))
 					tok1 = T_DATUM;
 				else if (!aux1.lval.word.quoted &&
-						 (kw = ScanKeywordLookup(aux1.lval.word.ident,
-												 unreserved_keywords,
-												 num_unreserved_keywords)))
+						 (kwnum = ScanKeywordLookupOffset(aux1.lval.word.ident,
+												 pl_unreserved_kw_strings,
+												 pl_unreserved_kw_offsets,
+												 num_unreserved_keywords)) >= 0)
 				{
-					aux1.lval.keyword = kw->name;
-					tok1 = kw->value;
+					aux1.lval.keyword = pl_unreserved_kw_strings
+										+ pl_unreserved_kw_offsets[kwnum];
+					tok1 = unreserved_keywords[kwnum].value;
 				}
 				else
 					tok1 = T_WORD;
@@ -362,12 +290,14 @@ plpgsql_yylex(void)
 			{
 				/* try for unreserved keyword, then for variable name */
 				if (core_yy.scanbuf[aux1.lloc] != '"' &&
-					(kw = ScanKeywordLookup(aux1.lval.str,
-											unreserved_keywords,
-											num_unreserved_keywords)))
+					(kwnum = ScanKeywordLookupOffset(aux1.lval.str,
+											 pl_unreserved_kw_strings,
+											 pl_unreserved_kw_offsets,
+											 num_unreserved_keywords)) >= 0)
 				{
-					aux1.lval.keyword = kw->name;
-					tok1 = kw->value;
+					aux1.lval.keyword = pl_unreserved_kw_strings
+										+ pl_unreserved_kw_offsets[kwnum];
+					tok1 = unreserved_keywords[kwnum].value;
 				}
 				else if (plpgsql_parse_word(aux1.lval.str,
 											core_yy.scanbuf + aux1.lloc,
@@ -386,12 +316,14 @@ plpgsql_yylex(void)
 									   &aux1.lval.word))
 					tok1 = T_DATUM;
 				else if (!aux1.lval.word.quoted &&
-						 (kw = ScanKeywordLookup(aux1.lval.word.ident,
-												 unreserved_keywords,
-												 num_unreserved_keywords)))
+						 (kwnum = ScanKeywordLookupOffset(aux1.lval.word.ident,
+												 pl_unreserved_kw_strings,
+												 pl_unreserved_kw_offsets,
+												 num_unreserved_keywords)) >= 0)
 				{
-					aux1.lval.keyword = kw->name;
-					tok1 = kw->value;
+					aux1.lval.keyword = pl_unreserved_kw_strings
+										+ pl_unreserved_kw_offsets[kwnum];
+					tok1 = unreserved_keywords[kwnum].value;
 				}
 				else
 					tok1 = T_WORD;
diff --git a/src/pl/plpgsql/src/pl_unreserved_kwlist.h b/src/pl/plpgsql/src/pl_unreserved_kwlist.h
new file mode 100644
index 0000000000..4fb0dff772
--- /dev/null
+++ b/src/pl/plpgsql/src/pl_unreserved_kwlist.h
@@ -0,0 +1,108 @@
+/*-------------------------------------------------------------------------
+ *
+ * pl_unreserved_kwlist.h
+ *
+ * The keyword lists are kept in their own source files for use by automatic
+ * tools.  The exact representation of a keyword is determined by the
+ * PG_KEYWORD macro, which is not defined in this file; it can be
+ * defined by the caller for special purposes.
+ *
+ * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/pl/plpgsql/src/pl_unreserved_kwlist.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/* There is deliberately not an #ifndef PL_UNRESERVED_KWLIST_H here. */
+
+/*
+ * List of keyword (name, token-value, category) entries.
+ *
+ * !!WARNING!!: This list must be sorted by ASCII name, because binary
+ *		search is used to locate entries.
+ */
+
+/* name, value, category */
+PG_KEYWORD("absolute", K_ABSOLUTE, UNRESERVED_KEYWORD)
+PG_KEYWORD("alias", K_ALIAS, UNRESERVED_KEYWORD)
+PG_KEYWORD("array", K_ARRAY, UNRESERVED_KEYWORD)
+PG_KEYWORD("assert", K_ASSERT, UNRESERVED_KEYWORD)
+PG_KEYWORD("backward", K_BACKWARD, UNRESERVED_KEYWORD)
+PG_KEYWORD("call", K_CALL, UNRESERVED_KEYWORD)
+PG_KEYWORD("close", K_CLOSE, UNRESERVED_KEYWORD)
+PG_KEYWORD("collate", K_COLLATE, UNRESERVED_KEYWORD)
+PG_KEYWORD("column", K_COLUMN, UNRESERVED_KEYWORD)
+PG_KEYWORD("column_name", K_COLUMN_NAME, UNRESERVED_KEYWORD)
+PG_KEYWORD("commit", K_COMMIT, UNRESERVED_KEYWORD)
+PG_KEYWORD("constant", K_CONSTANT, UNRESERVED_KEYWORD)
+PG_KEYWORD("constraint", K_CONSTRAINT, UNRESERVED_KEYWORD)
+PG_KEYWORD("constraint_name", K_CONSTRAINT_NAME, UNRESERVED_KEYWORD)
+PG_KEYWORD("continue", K_CONTINUE, UNRESERVED_KEYWORD)
+PG_KEYWORD("current", K_CURRENT, UNRESERVED_KEYWORD)
+PG_KEYWORD("cursor", K_CURSOR, UNRESERVED_KEYWORD)
+PG_KEYWORD("datatype", K_DATATYPE, UNRESERVED_KEYWORD)
+PG_KEYWORD("debug", K_DEBUG, UNRESERVED_KEYWORD)
+PG_KEYWORD("default", K_DEFAULT, UNRESERVED_KEYWORD)
+PG_KEYWORD("detail", K_DETAIL, UNRESERVED_KEYWORD)
+PG_KEYWORD("diagnostics", K_DIAGNOSTICS, UNRESERVED_KEYWORD)
+PG_KEYWORD("do", K_DO, UNRESERVED_KEYWORD)
+PG_KEYWORD("dump", K_DUMP, UNRESERVED_KEYWORD)
+PG_KEYWORD("elseif", K_ELSIF, UNRESERVED_KEYWORD)
+PG_KEYWORD("elsif", K_ELSIF, UNRESERVED_KEYWORD)
+PG_KEYWORD("errcode", K_ERRCODE, UNRESERVED_KEYWORD)
+PG_KEYWORD("error", K_ERROR, UNRESERVED_KEYWORD)
+PG_KEYWORD("exception", K_EXCEPTION, UNRESERVED_KEYWORD)
+PG_KEYWORD("exit", K_EXIT, UNRESERVED_KEYWORD)
+PG_KEYWORD("fetch", K_FETCH, UNRESERVED_KEYWORD)
+PG_KEYWORD("first", K_FIRST, UNRESERVED_KEYWORD)
+PG_KEYWORD("forward", K_FORWARD, UNRESERVED_KEYWORD)
+PG_KEYWORD("get", K_GET, UNRESERVED_KEYWORD)
+PG_KEYWORD("hint", K_HINT, UNRESERVED_KEYWORD)
+PG_KEYWORD("import", K_IMPORT, UNRESERVED_KEYWORD)
+PG_KEYWORD("info", K_INFO, UNRESERVED_KEYWORD)
+PG_KEYWORD("insert", K_INSERT, UNRESERVED_KEYWORD)
+PG_KEYWORD("is", K_IS, UNRESERVED_KEYWORD)
+PG_KEYWORD("last", K_LAST, UNRESERVED_KEYWORD)
+PG_KEYWORD("log", K_LOG, UNRESERVED_KEYWORD)
+PG_KEYWORD("message", K_MESSAGE, UNRESERVED_KEYWORD)
+PG_KEYWORD("message_text", K_MESSAGE_TEXT, UNRESERVED_KEYWORD)
+PG_KEYWORD("move", K_MOVE, UNRESERVED_KEYWORD)
+PG_KEYWORD("next", K_NEXT, UNRESERVED_KEYWORD)
+PG_KEYWORD("no", K_NO, UNRESERVED_KEYWORD)
+PG_KEYWORD("notice", K_NOTICE, UNRESERVED_KEYWORD)
+PG_KEYWORD("open", K_OPEN, UNRESERVED_KEYWORD)
+PG_KEYWORD("option", K_OPTION, UNRESERVED_KEYWORD)
+PG_KEYWORD("perform", K_PERFORM, UNRESERVED_KEYWORD)
+PG_KEYWORD("pg_context", K_PG_CONTEXT, UNRESERVED_KEYWORD)
+PG_KEYWORD("pg_datatype_name", K_PG_DATATYPE_NAME, UNRESERVED_KEYWORD)
+PG_KEYWORD("pg_exception_context", K_PG_EXCEPTION_CONTEXT, UNRESERVED_KEYWORD)
+PG_KEYWORD("pg_exception_detail", K_PG_EXCEPTION_DETAIL, UNRESERVED_KEYWORD)
+PG_KEYWORD("pg_exception_hint", K_PG_EXCEPTION_HINT, UNRESERVED_KEYWORD)
+PG_KEYWORD("print_strict_params", K_PRINT_STRICT_PARAMS, UNRESERVED_KEYWORD)
+PG_KEYWORD("prior", K_PRIOR, UNRESERVED_KEYWORD)
+PG_KEYWORD("query", K_QUERY, UNRESERVED_KEYWORD)
+PG_KEYWORD("raise", K_RAISE, UNRESERVED_KEYWORD)
+PG_KEYWORD("relative", K_RELATIVE, UNRESERVED_KEYWORD)
+PG_KEYWORD("reset", K_RESET, UNRESERVED_KEYWORD)
+PG_KEYWORD("return", K_RETURN, UNRESERVED_KEYWORD)
+PG_KEYWORD("returned_sqlstate", K_RETURNED_SQLSTATE, UNRESERVED_KEYWORD)
+PG_KEYWORD("reverse", K_REVERSE, UNRESERVED_KEYWORD)
+PG_KEYWORD("rollback", K_ROLLBACK, UNRESERVED_KEYWORD)
+PG_KEYWORD("row_count", K_ROW_COUNT, UNRESERVED_KEYWORD)
+PG_KEYWORD("rowtype", K_ROWTYPE, UNRESERVED_KEYWORD)
+PG_KEYWORD("schema", K_SCHEMA, UNRESERVED_KEYWORD)
+PG_KEYWORD("schema_name", K_SCHEMA_NAME, UNRESERVED_KEYWORD)
+PG_KEYWORD("scroll", K_SCROLL, UNRESERVED_KEYWORD)
+PG_KEYWORD("set", K_SET, UNRESERVED_KEYWORD)
+PG_KEYWORD("slice", K_SLICE, UNRESERVED_KEYWORD)
+PG_KEYWORD("sqlstate", K_SQLSTATE, UNRESERVED_KEYWORD)
+PG_KEYWORD("stacked", K_STACKED, UNRESERVED_KEYWORD)
+PG_KEYWORD("table", K_TABLE, UNRESERVED_KEYWORD)
+PG_KEYWORD("table_name", K_TABLE_NAME, UNRESERVED_KEYWORD)
+PG_KEYWORD("type", K_TYPE, UNRESERVED_KEYWORD)
+PG_KEYWORD("use_column", K_USE_COLUMN, UNRESERVED_KEYWORD)
+PG_KEYWORD("use_variable", K_USE_VARIABLE, UNRESERVED_KEYWORD)
+PG_KEYWORD("variable_conflict", K_VARIABLE_CONFLICT, UNRESERVED_KEYWORD)
+PG_KEYWORD("warning", K_WARNING, UNRESERVED_KEYWORD)
diff --git a/src/tools/gen_keywords.pl b/src/tools/gen_keywords.pl
new file mode 100644
index 0000000000..1faa14ffca
--- /dev/null
+++ b/src/tools/gen_keywords.pl
@@ -0,0 +1,137 @@
+#----------------------------------------------------------------------
+#
+# gen_keywords.pl
+#	Perl script that generates *kwlist_d.h from a given *kwlist.h
+#	keyword list file.  These headers are then included into files that
+#	call ScanKeywordLookup() on that keyword list.  The keyword name is
+#	is represented as an offset into a single string.
+#
+# Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
+# Portions Copyright (c) 1994, Regents of the University of California
+#
+# src/tools/gen_keywords.pl
+#
+#----------------------------------------------------------------------
+
+
+my $kw_input_file;
+my $output_path = '';
+my $prefix;
+
+# Process command line switches.
+while (@ARGV)
+{
+	my $arg = shift @ARGV;
+	if ($arg !~ /^-/)
+	{
+		$kw_input_file = $arg;
+	}
+	elsif ($arg =~ /^-o/)
+	{
+		$output_path = length($arg) > 2 ? substr($arg, 2) : shift @ARGV;
+	}
+	elsif ($arg =~ /^-p/)
+	{
+		$prefix = length($arg) > 2 ? substr($arg, 2) : shift @ARGV;
+	}
+	else
+	{
+		usage();
+	}
+}
+
+# Sanity check arguments.
+die "No input file.\n" if !$kw_input_file;
+
+# Make sure output_path ends in a slash.
+if ($output_path ne '' && substr($output_path, -1) ne '/')
+{
+	$output_path .= '/';
+}
+
+$kw_input_file =~ /((\w*)kwlist)\.h/;
+my $base_filename = $1;
+$prefix = $2 if !defined $prefix;
+my $kw_def_file = $output_path . $base_filename . '_d.h';
+
+open(my $kif, '<', $kw_input_file) || die "$kw_input_file: $!";
+open(my $kwdef, '>', $kw_def_file) || die "$kw_def_file: $!";
+
+# Opening boilerplate for keyword definition header.
+printf $kwdef <<EOM, $base_filename, uc $base_filename, uc $base_filename;
+/*-------------------------------------------------------------------------
+ *
+ * %s_d.h
+ *    List of keywords represented as a keyword string and offsets into it.
+ *
+ * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * NOTES
+ *  ******************************
+ *  *** DO NOT EDIT THIS FILE! ***
+ *  ******************************
+ *
+ *  It has been GENERATED by src/tools/gen_keywords.pl
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef %s_D_H
+#define %s_D_H
+
+EOM
+
+# Parse keyword header for names.
+my @keywords;
+while (<$kif>)
+{
+	if (/^PG_KEYWORD\("(\w+)",\s*\w+,\s*\w+\)/)
+	{
+		push @keywords, $1;
+	}
+}
+
+# Error out if the keyword names are not in ASCII order.
+for my $i (0..$#keywords - 1)
+{
+	die qq|The keyword "$keywords[$i + 1]" is out of order in $kw_input_file|
+	  if ($keywords[$i] cmp $keywords[$i + 1]) >= 0;
+}
+
+# Emit an array of numerical offsets which will be used to index into the
+# keyword string.
+printf $kwdef "static const uint16 %skw_offsets[] = {\n\t", $prefix;
+my $offset = 0;
+foreach my $name (@keywords)
+{
+	print $kwdef "$offset,\n\t";
+
+	# Calculate the cumulative offset of the next keyword,
+	# taking into account the null terminator.
+	$offset += length($name) + 1;
+}
+
+print $kwdef "};\n\n";
+
+# Now generate the keyword string.
+printf $kwdef qq|static const char %skw_strings[] =\n\t"|, $prefix;
+print $kwdef join qq|\\0"\n\t"|, @keywords;
+print $kwdef qq|";\n\n|;
+printf $kwdef "#endif\t\t\t\t\t\t\t/* %s_D_H */\n", uc $base_filename;
+
+
+sub usage
+{
+	die <<EOM;
+Usage: gen_keywords.pl [options] header...
+
+Options:
+    -o               output path
+    -p               optional prefix for generated data structures
+
+gen_keywords.pl transforms a list of keywords into a array of offsets
+into a single string.
+
+EOM
+}
diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm
index 0b7cdf8dd5..d0a9505b07 100644
--- a/src/tools/msvc/Solution.pm
+++ b/src/tools/msvc/Solution.pm
@@ -411,6 +411,16 @@ sub GenerateFiles
 		chdir('../../..');
 	}
 
+	if (IsNewer(
+			'src/pl/plpgsql/src/pl_unreserved_kwlist_d.h',
+			'src/pl/plpgsql/src/pl_unreserved_kwlist.h'))
+	{
+		print "Generating pl_unreserved_kwlist_d.h...\n";
+		chdir('src/pl/plpgsql/src');
+		system('perl ../../../tools/gen_keywords.pl pl_unreserved_kwlist.h');
+		chdir('../../../..');
+	}
+
 	if (IsNewer(
 			'src/interfaces/ecpg/preproc/preproc.y',
 			'src/backend/parser/gram.y'))
-- 
2.17.1

From 741afead540b219a2a3e82f0db6840e8ff93b0eb Mon Sep 17 00:00:00 2001
From: John Naylor <jcnay...@gmail.com>
Date: Wed, 26 Dec 2018 21:34:34 -0500
Subject: [PATCH v4 2/2] Dispatch keyword lookup on the first character.

In addition, use heuristics to improve chances of skipping binary search.
For each legal keyword character where it makes sense, choose a common
keyword and use its index for the first middle index of binary search.
---
 src/common/keywords.c           | 30 ++++++++++---
 src/include/common/keywords.h   | 10 ++++-
 src/pl/plpgsql/src/pl_scanner.c |  6 +--
 src/tools/gen_keywords.pl       | 78 +++++++++++++++++++++++++++++++++
 4 files changed, 114 insertions(+), 10 deletions(-)

diff --git a/src/common/keywords.c b/src/common/keywords.c
index b0e5a721b6..207bc4274f 100644
--- a/src/common/keywords.c
+++ b/src/common/keywords.c
@@ -118,12 +118,14 @@ int
 ScanKeywordLookupOffset(const char *string_to_lookup,
 						const char *kw_strings,
 						const uint16 *kw_offsets,
-						int num_keywords)
+						const ScanKeywordRange *kw_ranges)
 {
 	int			len,
-				i;
+				i,
+				range_idx;
 	char		word[NAMEDATALEN];
 	const uint16 *low;
+	const uint16 *middle;
 	const uint16 *high;
 
 	len = strlen(string_to_lookup);
@@ -145,17 +147,32 @@ ScanKeywordLookupOffset(const char *string_to_lookup,
 	}
 	word[len] = '\0';
 
+	/* XXX assumes keywords can't start with '_' */
+	range_idx = (int) word[0] - 'a';
+
+	if (word[0] < 'a' || word[0] > 'z'
+		|| kw_ranges[range_idx].lower == PG_UINT16_MAX)
+		return -1;
+
 	/*
 	 * Now do a binary search using plain strcmp() comparison.
 	 */
-	low = kw_offsets;
-	high = kw_offsets + (num_keywords - 1);
+	low = kw_offsets + kw_ranges[range_idx].lower;
+	high = kw_offsets + kw_ranges[range_idx].upper;
+
+	if (kw_ranges[range_idx].middle != PG_UINT16_MAX)
+	{
+		middle = kw_offsets + kw_ranges[range_idx].middle;
+	}
+	else
+	{
+		middle = low + (high - low) / 2;
+	}
+
 	while (low <= high)
 	{
-		const uint16 *middle;
 		int			difference;
 
-		middle = low + (high - low) / 2;
 		difference = strcmp(kw_strings + *middle, word);
 		if (difference == 0)
 			return middle - kw_offsets;
@@ -163,6 +180,7 @@ ScanKeywordLookupOffset(const char *string_to_lookup,
 			low = middle + 1;
 		else
 			high = middle - 1;
+		middle = low + (high - low) / 2;
 	}
 
 	return -1;
diff --git a/src/include/common/keywords.h b/src/include/common/keywords.h
index 201d0fcc7a..348773c682 100644
--- a/src/include/common/keywords.h
+++ b/src/include/common/keywords.h
@@ -35,6 +35,14 @@ typedef struct ScanKeywordAux
 	char		category;		/* see codes above */
 } ScanKeywordAux;
 
+/* Lower and upper indexes, and first guess, for a range of keywords. */
+typedef struct ScanKeywordRange
+{
+	uint16		lower;
+	uint16		middle;
+	uint16		upper;
+} ScanKeywordRange;
+
 #ifndef FRONTEND
 extern PGDLLIMPORT const ScanKeyword ScanKeywords[];
 extern PGDLLIMPORT const int NumScanKeywords;
@@ -51,6 +59,6 @@ extern const ScanKeyword *ScanKeywordLookup(const char *text,
 int ScanKeywordLookupOffset(const char *string_to_lookup,
 						const char *kw_strings,
 						const uint16 *kw_offsets,
-						int num_keywords);
+						const ScanKeywordRange *kw_ranges);
 
 #endif							/* KEYWORDS_H */
diff --git a/src/pl/plpgsql/src/pl_scanner.c b/src/pl/plpgsql/src/pl_scanner.c
index 0b2f331117..808bf465a0 100644
--- a/src/pl/plpgsql/src/pl_scanner.c
+++ b/src/pl/plpgsql/src/pl_scanner.c
@@ -261,7 +261,7 @@ plpgsql_yylex(void)
 						 (kwnum = ScanKeywordLookupOffset(aux1.lval.word.ident,
 												 pl_unreserved_kw_strings,
 												 pl_unreserved_kw_offsets,
-												 num_unreserved_keywords)) >= 0)
+												 pl_unreserved_kw_ranges)) >= 0)
 				{
 					aux1.lval.keyword = pl_unreserved_kw_strings
 										+ pl_unreserved_kw_offsets[kwnum];
@@ -293,7 +293,7 @@ plpgsql_yylex(void)
 					(kwnum = ScanKeywordLookupOffset(aux1.lval.str,
 											 pl_unreserved_kw_strings,
 											 pl_unreserved_kw_offsets,
-											 num_unreserved_keywords)) >= 0)
+											 pl_unreserved_kw_ranges)) >= 0)
 				{
 					aux1.lval.keyword = pl_unreserved_kw_strings
 										+ pl_unreserved_kw_offsets[kwnum];
@@ -319,7 +319,7 @@ plpgsql_yylex(void)
 						 (kwnum = ScanKeywordLookupOffset(aux1.lval.word.ident,
 												 pl_unreserved_kw_strings,
 												 pl_unreserved_kw_offsets,
-												 num_unreserved_keywords)) >= 0)
+												 pl_unreserved_kw_ranges)) >= 0)
 				{
 					aux1.lval.keyword = pl_unreserved_kw_strings
 										+ pl_unreserved_kw_offsets[kwnum];
diff --git a/src/tools/gen_keywords.pl b/src/tools/gen_keywords.pl
index 1faa14ffca..6678bfc72f 100644
--- a/src/tools/gen_keywords.pl
+++ b/src/tools/gen_keywords.pl
@@ -99,6 +99,84 @@ for my $i (0..$#keywords - 1)
 	  if ($keywords[$i] cmp $keywords[$i + 1]) >= 0;
 }
 
+# These are the most common core keywords for each letter of the alphabet,
+# if there is a clear winner, preferably near the middle of the range.
+my %FIRST_GUESS = (
+	and => 1,
+	commit => 1, # demo for WIP, since it's a plpgsql unreserved keyword
+	delete => 1,
+	exists => 1,
+	from => 1,
+	group => 1,
+	having => 1,
+	in => 1,
+	join => 1,
+	limit => 1,
+	not => 1,
+	or => 1,
+	select => 1,
+	then => 1,
+	update => 1,
+	values => 1,
+	where => 1,
+);
+
+# Save the min and max indexes and a first guess middle index for each range
+# of keywords starting with the same first character.
+my $index = 0;
+my $lower = 0;
+my $middle;
+my $first = 1;
+my $curr_char = substr($keywords[0], 0, 1);
+my %range;
+foreach my $name (@keywords)
+{
+	if (substr($name, 0, 1) ne $curr_char)
+	{
+		$range{$curr_char} = {lower => $lower, middle => $middle, upper => $index};
+
+		# Set values for next range.
+		$curr_char = substr($name, 0, 1);
+		$index++;
+		$lower = $index;
+		$middle = undef;
+	}
+	elsif ($first == 1)
+	{
+		$first = 0;
+	}
+	else
+	{
+		$index++;
+	}
+
+	# Save the index if the keyword is our first guess for the current
+	# first character.
+	if (exists $FIRST_GUESS{$name})
+	{
+		$middle = $index;
+	}
+}
+
+# Save last member of the list.
+$range{$curr_char} = {lower => $lower, middle => $middle, upper => $index};
+
+# Emit array of the upper, middle and lower indexes that we saved earlier.
+# To keep the array small, we count from 'a', the lowest character that can
+# start a keyword.
+# XXX assumes keywords can't start with '_'
+printf $kwdef "static const ScanKeywordRange %skw_ranges[] = {\n\t", $prefix;
+foreach my $c ('a'..'z')
+{
+	my $char = $range{$c};
+
+	printf $kwdef "{%s, %s, %s},\n\t",
+	  defined($char->{lower}) ? $char->{lower} : 'PG_UINT16_MAX',
+	  defined($char->{middle}) ? $char->{middle} : 'PG_UINT16_MAX',
+	  defined($char->{upper}) ? $char->{upper} : 'PG_UINT16_MAX',
+}
+print $kwdef "};\n\n";
+
 # Emit an array of numerical offsets which will be used to index into the
 # keyword string.
 printf $kwdef "static const uint16 %skw_offsets[] = {\n\t", $prefix;
-- 
2.17.1

Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)

Reply via email to