On 12/26/18, John Naylor <jcnay...@gmail.com> wrote: > On 12/26/18, Robert Haas <robertmh...@gmail.com> wrote: >> I wonder if we could do something really simple like a lookup based on >> the first character of the scan keyword. It looks to me like there are >> 440 keywords right now, and the most common starting letter is 'c', >> which is the first letter of 51 keywords. So dispatching based on the >> first letter clips at least 3 steps off the binary search. I don't >> know whether that's enough to be worthwhile, but it's probably pretty >> simple to implement.
> I agree it'd be fairly simple to do, and might raise the bar for doing > anything more complex. I went ahead and did this for v4, but split out into a separate patch. In addition, I used a heuristic to bypass binary search for the most common keywords. Normally, the middle value is computed mathematically, but I found that in each range of keywords beginning with the same letter, there is often 1 or 2 common keywords that are good first guesses, such as select, from, join, limit, where. I taught the lookup to try those first, and then compute subsequent steps the usual way. Barring additional bikeshedding on 0001, I'll plan on implementing offset-based lookup for the other keyword types and retire the old ScanKeyword. Once that's done, we can benchmark and compare with the optimizations in 0002. -John Naylor
From ed78318df69bd3fa1543f3dc04d0a6ca9794a81c Mon Sep 17 00:00:00 2001 From: John Naylor <jcnay...@gmail.com> Date: Wed, 26 Dec 2018 21:28:27 -0500 Subject: [PATCH v4 1/2] WIP Use offset-based keyword lookup. (For WIP, only for plpgsql unreserved keywords) Replace binary search over an array of ScanKeyword structs with a binary search over an array of offsets into a keyword string. Access auxillary data only after a keyword hit. This has better locality of reference and a smaller memory footprint. --- src/common/keywords.c | 55 +++++++++ src/include/common/keywords.h | 12 ++ src/pl/plpgsql/src/.gitignore | 1 + src/pl/plpgsql/src/Makefile | 13 +- src/pl/plpgsql/src/pl_scanner.c | 128 +++++--------------- src/pl/plpgsql/src/pl_unreserved_kwlist.h | 108 +++++++++++++++++ src/tools/gen_keywords.pl | 137 ++++++++++++++++++++++ src/tools/msvc/Solution.pm | 10 ++ 8 files changed, 361 insertions(+), 103 deletions(-) create mode 100644 src/pl/plpgsql/src/pl_unreserved_kwlist.h create mode 100644 src/tools/gen_keywords.pl diff --git a/src/common/keywords.c b/src/common/keywords.c index 0c0c794c68..b0e5a721b6 100644 --- a/src/common/keywords.c +++ b/src/common/keywords.c @@ -112,3 +112,58 @@ ScanKeywordLookup(const char *text, return NULL; } + +/* Like ScanKeywordLookup, but uses offsets into a keyword string. */ +int +ScanKeywordLookupOffset(const char *string_to_lookup, + const char *kw_strings, + const uint16 *kw_offsets, + int num_keywords) +{ + int len, + i; + char word[NAMEDATALEN]; + const uint16 *low; + const uint16 *high; + + len = strlen(string_to_lookup); + /* We assume all keywords are shorter than NAMEDATALEN. */ + if (len >= NAMEDATALEN) + return -1; + + /* + * Apply an ASCII-only downcasing. We must not use tolower() since it may + * produce the wrong translation in some locales (eg, Turkish). + */ + for (i = 0; i < len; i++) + { + char ch = string_to_lookup[i]; + + if (ch >= 'A' && ch <= 'Z') + ch += 'a' - 'A'; + word[i] = ch; + } + word[len] = '\0'; + + /* + * Now do a binary search using plain strcmp() comparison. + */ + low = kw_offsets; + high = kw_offsets + (num_keywords - 1); + while (low <= high) + { + const uint16 *middle; + int difference; + + middle = low + (high - low) / 2; + difference = strcmp(kw_strings + *middle, word); + if (difference == 0) + return middle - kw_offsets; + else if (difference < 0) + low = middle + 1; + else + high = middle - 1; + } + + return -1; +} diff --git a/src/include/common/keywords.h b/src/include/common/keywords.h index 0b31505b66..201d0fcc7a 100644 --- a/src/include/common/keywords.h +++ b/src/include/common/keywords.h @@ -28,6 +28,13 @@ typedef struct ScanKeyword int16 category; /* see codes above */ } ScanKeyword; +/* Payload data for keywords */ +typedef struct ScanKeywordAux +{ + int16 value; /* grammar's token code */ + char category; /* see codes above */ +} ScanKeywordAux; + #ifndef FRONTEND extern PGDLLIMPORT const ScanKeyword ScanKeywords[]; extern PGDLLIMPORT const int NumScanKeywords; @@ -41,4 +48,9 @@ extern const ScanKeyword *ScanKeywordLookup(const char *text, const ScanKeyword *keywords, int num_keywords); +int ScanKeywordLookupOffset(const char *string_to_lookup, + const char *kw_strings, + const uint16 *kw_offsets, + int num_keywords); + #endif /* KEYWORDS_H */ diff --git a/src/pl/plpgsql/src/.gitignore b/src/pl/plpgsql/src/.gitignore index ff6ac965fd..a649302fdb 100644 --- a/src/pl/plpgsql/src/.gitignore +++ b/src/pl/plpgsql/src/.gitignore @@ -1,3 +1,4 @@ +/*kwlist_d.h /pl_gram.c /pl_gram.h /plerrcodes.h diff --git a/src/pl/plpgsql/src/Makefile b/src/pl/plpgsql/src/Makefile index 25a5a9d448..9112aa7d23 100644 --- a/src/pl/plpgsql/src/Makefile +++ b/src/pl/plpgsql/src/Makefile @@ -60,7 +60,7 @@ uninstall-headers: # Force these dependencies to be known even without dependency info built: -pl_gram.o pl_handler.o pl_comp.o pl_exec.o pl_funcs.o pl_scanner.o: plpgsql.h pl_gram.h plerrcodes.h +pl_gram.o pl_handler.o pl_comp.o pl_exec.o pl_funcs.o pl_scanner.o: plpgsql.h pl_gram.h plerrcodes.h pl_unreserved_kwlist_d.h # See notes in src/backend/parser/Makefile about the following two rules pl_gram.h: pl_gram.c @@ -72,6 +72,9 @@ pl_gram.c: BISONFLAGS += -d plerrcodes.h: $(top_srcdir)/src/backend/utils/errcodes.txt generate-plerrcodes.pl $(PERL) $(srcdir)/generate-plerrcodes.pl $< > $@ +# generate keyword headers for the scanner +pl_unreserved_kwlist_d.h: pl_unreserved_kwlist.h $(top_srcdir)/src/tools/gen_keywords.pl + $(PERL) $(top_srcdir)/src/tools/gen_keywords.pl $< check: submake $(pg_regress_check) $(REGRESS_OPTS) $(REGRESS) @@ -84,13 +87,13 @@ submake: $(MAKE) -C $(top_builddir)/src/test/regress pg_regress$(X) -distprep: pl_gram.h pl_gram.c plerrcodes.h +distprep: pl_gram.h pl_gram.c plerrcodes.h pl_unreserved_kwlist_d.h -# pl_gram.c, pl_gram.h and plerrcodes.h are in the distribution tarball, -# so they are not cleaned here. +# pl_gram.c, pl_gram.h, plerrcodes.h, the generated keyword headers are +# in the distribution tarball, so they are not cleaned here. clean distclean: clean-lib rm -f $(OBJS) rm -rf $(pg_regress_clean_files) maintainer-clean: distclean - rm -f pl_gram.c pl_gram.h plerrcodes.h + rm -f pl_gram.c pl_gram.h plerrcodes.h pl_unreserved_kwlist_d.h diff --git a/src/pl/plpgsql/src/pl_scanner.c b/src/pl/plpgsql/src/pl_scanner.c index ab18946847..0b2f331117 100644 --- a/src/pl/plpgsql/src/pl_scanner.c +++ b/src/pl/plpgsql/src/pl_scanner.c @@ -21,6 +21,8 @@ #include "plpgsql.h" #include "pl_gram.h" /* must be after parser/scanner.h */ +/* String lookup table for keywords */ +#include "pl_unreserved_kwlist_d.h" #define PG_KEYWORD(a,b,c) {a,b,c}, @@ -96,88 +98,12 @@ static const ScanKeyword reserved_keywords[] = { static const int num_reserved_keywords = lengthof(reserved_keywords); -static const ScanKeyword unreserved_keywords[] = { - PG_KEYWORD("absolute", K_ABSOLUTE, UNRESERVED_KEYWORD) - PG_KEYWORD("alias", K_ALIAS, UNRESERVED_KEYWORD) - PG_KEYWORD("array", K_ARRAY, UNRESERVED_KEYWORD) - PG_KEYWORD("assert", K_ASSERT, UNRESERVED_KEYWORD) - PG_KEYWORD("backward", K_BACKWARD, UNRESERVED_KEYWORD) - PG_KEYWORD("call", K_CALL, UNRESERVED_KEYWORD) - PG_KEYWORD("close", K_CLOSE, UNRESERVED_KEYWORD) - PG_KEYWORD("collate", K_COLLATE, UNRESERVED_KEYWORD) - PG_KEYWORD("column", K_COLUMN, UNRESERVED_KEYWORD) - PG_KEYWORD("column_name", K_COLUMN_NAME, UNRESERVED_KEYWORD) - PG_KEYWORD("commit", K_COMMIT, UNRESERVED_KEYWORD) - PG_KEYWORD("constant", K_CONSTANT, UNRESERVED_KEYWORD) - PG_KEYWORD("constraint", K_CONSTRAINT, UNRESERVED_KEYWORD) - PG_KEYWORD("constraint_name", K_CONSTRAINT_NAME, UNRESERVED_KEYWORD) - PG_KEYWORD("continue", K_CONTINUE, UNRESERVED_KEYWORD) - PG_KEYWORD("current", K_CURRENT, UNRESERVED_KEYWORD) - PG_KEYWORD("cursor", K_CURSOR, UNRESERVED_KEYWORD) - PG_KEYWORD("datatype", K_DATATYPE, UNRESERVED_KEYWORD) - PG_KEYWORD("debug", K_DEBUG, UNRESERVED_KEYWORD) - PG_KEYWORD("default", K_DEFAULT, UNRESERVED_KEYWORD) - PG_KEYWORD("detail", K_DETAIL, UNRESERVED_KEYWORD) - PG_KEYWORD("diagnostics", K_DIAGNOSTICS, UNRESERVED_KEYWORD) - PG_KEYWORD("do", K_DO, UNRESERVED_KEYWORD) - PG_KEYWORD("dump", K_DUMP, UNRESERVED_KEYWORD) - PG_KEYWORD("elseif", K_ELSIF, UNRESERVED_KEYWORD) - PG_KEYWORD("elsif", K_ELSIF, UNRESERVED_KEYWORD) - PG_KEYWORD("errcode", K_ERRCODE, UNRESERVED_KEYWORD) - PG_KEYWORD("error", K_ERROR, UNRESERVED_KEYWORD) - PG_KEYWORD("exception", K_EXCEPTION, UNRESERVED_KEYWORD) - PG_KEYWORD("exit", K_EXIT, UNRESERVED_KEYWORD) - PG_KEYWORD("fetch", K_FETCH, UNRESERVED_KEYWORD) - PG_KEYWORD("first", K_FIRST, UNRESERVED_KEYWORD) - PG_KEYWORD("forward", K_FORWARD, UNRESERVED_KEYWORD) - PG_KEYWORD("get", K_GET, UNRESERVED_KEYWORD) - PG_KEYWORD("hint", K_HINT, UNRESERVED_KEYWORD) - PG_KEYWORD("import", K_IMPORT, UNRESERVED_KEYWORD) - PG_KEYWORD("info", K_INFO, UNRESERVED_KEYWORD) - PG_KEYWORD("insert", K_INSERT, UNRESERVED_KEYWORD) - PG_KEYWORD("is", K_IS, UNRESERVED_KEYWORD) - PG_KEYWORD("last", K_LAST, UNRESERVED_KEYWORD) - PG_KEYWORD("log", K_LOG, UNRESERVED_KEYWORD) - PG_KEYWORD("message", K_MESSAGE, UNRESERVED_KEYWORD) - PG_KEYWORD("message_text", K_MESSAGE_TEXT, UNRESERVED_KEYWORD) - PG_KEYWORD("move", K_MOVE, UNRESERVED_KEYWORD) - PG_KEYWORD("next", K_NEXT, UNRESERVED_KEYWORD) - PG_KEYWORD("no", K_NO, UNRESERVED_KEYWORD) - PG_KEYWORD("notice", K_NOTICE, UNRESERVED_KEYWORD) - PG_KEYWORD("open", K_OPEN, UNRESERVED_KEYWORD) - PG_KEYWORD("option", K_OPTION, UNRESERVED_KEYWORD) - PG_KEYWORD("perform", K_PERFORM, UNRESERVED_KEYWORD) - PG_KEYWORD("pg_context", K_PG_CONTEXT, UNRESERVED_KEYWORD) - PG_KEYWORD("pg_datatype_name", K_PG_DATATYPE_NAME, UNRESERVED_KEYWORD) - PG_KEYWORD("pg_exception_context", K_PG_EXCEPTION_CONTEXT, UNRESERVED_KEYWORD) - PG_KEYWORD("pg_exception_detail", K_PG_EXCEPTION_DETAIL, UNRESERVED_KEYWORD) - PG_KEYWORD("pg_exception_hint", K_PG_EXCEPTION_HINT, UNRESERVED_KEYWORD) - PG_KEYWORD("print_strict_params", K_PRINT_STRICT_PARAMS, UNRESERVED_KEYWORD) - PG_KEYWORD("prior", K_PRIOR, UNRESERVED_KEYWORD) - PG_KEYWORD("query", K_QUERY, UNRESERVED_KEYWORD) - PG_KEYWORD("raise", K_RAISE, UNRESERVED_KEYWORD) - PG_KEYWORD("relative", K_RELATIVE, UNRESERVED_KEYWORD) - PG_KEYWORD("reset", K_RESET, UNRESERVED_KEYWORD) - PG_KEYWORD("return", K_RETURN, UNRESERVED_KEYWORD) - PG_KEYWORD("returned_sqlstate", K_RETURNED_SQLSTATE, UNRESERVED_KEYWORD) - PG_KEYWORD("reverse", K_REVERSE, UNRESERVED_KEYWORD) - PG_KEYWORD("rollback", K_ROLLBACK, UNRESERVED_KEYWORD) - PG_KEYWORD("row_count", K_ROW_COUNT, UNRESERVED_KEYWORD) - PG_KEYWORD("rowtype", K_ROWTYPE, UNRESERVED_KEYWORD) - PG_KEYWORD("schema", K_SCHEMA, UNRESERVED_KEYWORD) - PG_KEYWORD("schema_name", K_SCHEMA_NAME, UNRESERVED_KEYWORD) - PG_KEYWORD("scroll", K_SCROLL, UNRESERVED_KEYWORD) - PG_KEYWORD("set", K_SET, UNRESERVED_KEYWORD) - PG_KEYWORD("slice", K_SLICE, UNRESERVED_KEYWORD) - PG_KEYWORD("sqlstate", K_SQLSTATE, UNRESERVED_KEYWORD) - PG_KEYWORD("stacked", K_STACKED, UNRESERVED_KEYWORD) - PG_KEYWORD("table", K_TABLE, UNRESERVED_KEYWORD) - PG_KEYWORD("table_name", K_TABLE_NAME, UNRESERVED_KEYWORD) - PG_KEYWORD("type", K_TYPE, UNRESERVED_KEYWORD) - PG_KEYWORD("use_column", K_USE_COLUMN, UNRESERVED_KEYWORD) - PG_KEYWORD("use_variable", K_USE_VARIABLE, UNRESERVED_KEYWORD) - PG_KEYWORD("variable_conflict", K_VARIABLE_CONFLICT, UNRESERVED_KEYWORD) - PG_KEYWORD("warning", K_WARNING, UNRESERVED_KEYWORD) +/* FIXME: Have to redefine this symbol for the WIP. */ +#undef PG_KEYWORD +#define PG_KEYWORD(kwname, value, category) {value, category}, + +static const ScanKeywordAux unreserved_keywords[] = { +#include "pl_unreserved_kwlist.h" }; static const int num_unreserved_keywords = lengthof(unreserved_keywords); @@ -256,7 +182,7 @@ plpgsql_yylex(void) { int tok1; TokenAuxData aux1; - const ScanKeyword *kw; + int kwnum; tok1 = internal_yylex(&aux1); if (tok1 == IDENT || tok1 == PARAM) @@ -332,12 +258,14 @@ plpgsql_yylex(void) &aux1.lval.word)) tok1 = T_DATUM; else if (!aux1.lval.word.quoted && - (kw = ScanKeywordLookup(aux1.lval.word.ident, - unreserved_keywords, - num_unreserved_keywords))) + (kwnum = ScanKeywordLookupOffset(aux1.lval.word.ident, + pl_unreserved_kw_strings, + pl_unreserved_kw_offsets, + num_unreserved_keywords)) >= 0) { - aux1.lval.keyword = kw->name; - tok1 = kw->value; + aux1.lval.keyword = pl_unreserved_kw_strings + + pl_unreserved_kw_offsets[kwnum]; + tok1 = unreserved_keywords[kwnum].value; } else tok1 = T_WORD; @@ -362,12 +290,14 @@ plpgsql_yylex(void) { /* try for unreserved keyword, then for variable name */ if (core_yy.scanbuf[aux1.lloc] != '"' && - (kw = ScanKeywordLookup(aux1.lval.str, - unreserved_keywords, - num_unreserved_keywords))) + (kwnum = ScanKeywordLookupOffset(aux1.lval.str, + pl_unreserved_kw_strings, + pl_unreserved_kw_offsets, + num_unreserved_keywords)) >= 0) { - aux1.lval.keyword = kw->name; - tok1 = kw->value; + aux1.lval.keyword = pl_unreserved_kw_strings + + pl_unreserved_kw_offsets[kwnum]; + tok1 = unreserved_keywords[kwnum].value; } else if (plpgsql_parse_word(aux1.lval.str, core_yy.scanbuf + aux1.lloc, @@ -386,12 +316,14 @@ plpgsql_yylex(void) &aux1.lval.word)) tok1 = T_DATUM; else if (!aux1.lval.word.quoted && - (kw = ScanKeywordLookup(aux1.lval.word.ident, - unreserved_keywords, - num_unreserved_keywords))) + (kwnum = ScanKeywordLookupOffset(aux1.lval.word.ident, + pl_unreserved_kw_strings, + pl_unreserved_kw_offsets, + num_unreserved_keywords)) >= 0) { - aux1.lval.keyword = kw->name; - tok1 = kw->value; + aux1.lval.keyword = pl_unreserved_kw_strings + + pl_unreserved_kw_offsets[kwnum]; + tok1 = unreserved_keywords[kwnum].value; } else tok1 = T_WORD; diff --git a/src/pl/plpgsql/src/pl_unreserved_kwlist.h b/src/pl/plpgsql/src/pl_unreserved_kwlist.h new file mode 100644 index 0000000000..4fb0dff772 --- /dev/null +++ b/src/pl/plpgsql/src/pl_unreserved_kwlist.h @@ -0,0 +1,108 @@ +/*------------------------------------------------------------------------- + * + * pl_unreserved_kwlist.h + * + * The keyword lists are kept in their own source files for use by automatic + * tools. The exact representation of a keyword is determined by the + * PG_KEYWORD macro, which is not defined in this file; it can be + * defined by the caller for special purposes. + * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/pl/plpgsql/src/pl_unreserved_kwlist.h + * + *------------------------------------------------------------------------- + */ + +/* There is deliberately not an #ifndef PL_UNRESERVED_KWLIST_H here. */ + +/* + * List of keyword (name, token-value, category) entries. + * + * !!WARNING!!: This list must be sorted by ASCII name, because binary + * search is used to locate entries. + */ + +/* name, value, category */ +PG_KEYWORD("absolute", K_ABSOLUTE, UNRESERVED_KEYWORD) +PG_KEYWORD("alias", K_ALIAS, UNRESERVED_KEYWORD) +PG_KEYWORD("array", K_ARRAY, UNRESERVED_KEYWORD) +PG_KEYWORD("assert", K_ASSERT, UNRESERVED_KEYWORD) +PG_KEYWORD("backward", K_BACKWARD, UNRESERVED_KEYWORD) +PG_KEYWORD("call", K_CALL, UNRESERVED_KEYWORD) +PG_KEYWORD("close", K_CLOSE, UNRESERVED_KEYWORD) +PG_KEYWORD("collate", K_COLLATE, UNRESERVED_KEYWORD) +PG_KEYWORD("column", K_COLUMN, UNRESERVED_KEYWORD) +PG_KEYWORD("column_name", K_COLUMN_NAME, UNRESERVED_KEYWORD) +PG_KEYWORD("commit", K_COMMIT, UNRESERVED_KEYWORD) +PG_KEYWORD("constant", K_CONSTANT, UNRESERVED_KEYWORD) +PG_KEYWORD("constraint", K_CONSTRAINT, UNRESERVED_KEYWORD) +PG_KEYWORD("constraint_name", K_CONSTRAINT_NAME, UNRESERVED_KEYWORD) +PG_KEYWORD("continue", K_CONTINUE, UNRESERVED_KEYWORD) +PG_KEYWORD("current", K_CURRENT, UNRESERVED_KEYWORD) +PG_KEYWORD("cursor", K_CURSOR, UNRESERVED_KEYWORD) +PG_KEYWORD("datatype", K_DATATYPE, UNRESERVED_KEYWORD) +PG_KEYWORD("debug", K_DEBUG, UNRESERVED_KEYWORD) +PG_KEYWORD("default", K_DEFAULT, UNRESERVED_KEYWORD) +PG_KEYWORD("detail", K_DETAIL, UNRESERVED_KEYWORD) +PG_KEYWORD("diagnostics", K_DIAGNOSTICS, UNRESERVED_KEYWORD) +PG_KEYWORD("do", K_DO, UNRESERVED_KEYWORD) +PG_KEYWORD("dump", K_DUMP, UNRESERVED_KEYWORD) +PG_KEYWORD("elseif", K_ELSIF, UNRESERVED_KEYWORD) +PG_KEYWORD("elsif", K_ELSIF, UNRESERVED_KEYWORD) +PG_KEYWORD("errcode", K_ERRCODE, UNRESERVED_KEYWORD) +PG_KEYWORD("error", K_ERROR, UNRESERVED_KEYWORD) +PG_KEYWORD("exception", K_EXCEPTION, UNRESERVED_KEYWORD) +PG_KEYWORD("exit", K_EXIT, UNRESERVED_KEYWORD) +PG_KEYWORD("fetch", K_FETCH, UNRESERVED_KEYWORD) +PG_KEYWORD("first", K_FIRST, UNRESERVED_KEYWORD) +PG_KEYWORD("forward", K_FORWARD, UNRESERVED_KEYWORD) +PG_KEYWORD("get", K_GET, UNRESERVED_KEYWORD) +PG_KEYWORD("hint", K_HINT, UNRESERVED_KEYWORD) +PG_KEYWORD("import", K_IMPORT, UNRESERVED_KEYWORD) +PG_KEYWORD("info", K_INFO, UNRESERVED_KEYWORD) +PG_KEYWORD("insert", K_INSERT, UNRESERVED_KEYWORD) +PG_KEYWORD("is", K_IS, UNRESERVED_KEYWORD) +PG_KEYWORD("last", K_LAST, UNRESERVED_KEYWORD) +PG_KEYWORD("log", K_LOG, UNRESERVED_KEYWORD) +PG_KEYWORD("message", K_MESSAGE, UNRESERVED_KEYWORD) +PG_KEYWORD("message_text", K_MESSAGE_TEXT, UNRESERVED_KEYWORD) +PG_KEYWORD("move", K_MOVE, UNRESERVED_KEYWORD) +PG_KEYWORD("next", K_NEXT, UNRESERVED_KEYWORD) +PG_KEYWORD("no", K_NO, UNRESERVED_KEYWORD) +PG_KEYWORD("notice", K_NOTICE, UNRESERVED_KEYWORD) +PG_KEYWORD("open", K_OPEN, UNRESERVED_KEYWORD) +PG_KEYWORD("option", K_OPTION, UNRESERVED_KEYWORD) +PG_KEYWORD("perform", K_PERFORM, UNRESERVED_KEYWORD) +PG_KEYWORD("pg_context", K_PG_CONTEXT, UNRESERVED_KEYWORD) +PG_KEYWORD("pg_datatype_name", K_PG_DATATYPE_NAME, UNRESERVED_KEYWORD) +PG_KEYWORD("pg_exception_context", K_PG_EXCEPTION_CONTEXT, UNRESERVED_KEYWORD) +PG_KEYWORD("pg_exception_detail", K_PG_EXCEPTION_DETAIL, UNRESERVED_KEYWORD) +PG_KEYWORD("pg_exception_hint", K_PG_EXCEPTION_HINT, UNRESERVED_KEYWORD) +PG_KEYWORD("print_strict_params", K_PRINT_STRICT_PARAMS, UNRESERVED_KEYWORD) +PG_KEYWORD("prior", K_PRIOR, UNRESERVED_KEYWORD) +PG_KEYWORD("query", K_QUERY, UNRESERVED_KEYWORD) +PG_KEYWORD("raise", K_RAISE, UNRESERVED_KEYWORD) +PG_KEYWORD("relative", K_RELATIVE, UNRESERVED_KEYWORD) +PG_KEYWORD("reset", K_RESET, UNRESERVED_KEYWORD) +PG_KEYWORD("return", K_RETURN, UNRESERVED_KEYWORD) +PG_KEYWORD("returned_sqlstate", K_RETURNED_SQLSTATE, UNRESERVED_KEYWORD) +PG_KEYWORD("reverse", K_REVERSE, UNRESERVED_KEYWORD) +PG_KEYWORD("rollback", K_ROLLBACK, UNRESERVED_KEYWORD) +PG_KEYWORD("row_count", K_ROW_COUNT, UNRESERVED_KEYWORD) +PG_KEYWORD("rowtype", K_ROWTYPE, UNRESERVED_KEYWORD) +PG_KEYWORD("schema", K_SCHEMA, UNRESERVED_KEYWORD) +PG_KEYWORD("schema_name", K_SCHEMA_NAME, UNRESERVED_KEYWORD) +PG_KEYWORD("scroll", K_SCROLL, UNRESERVED_KEYWORD) +PG_KEYWORD("set", K_SET, UNRESERVED_KEYWORD) +PG_KEYWORD("slice", K_SLICE, UNRESERVED_KEYWORD) +PG_KEYWORD("sqlstate", K_SQLSTATE, UNRESERVED_KEYWORD) +PG_KEYWORD("stacked", K_STACKED, UNRESERVED_KEYWORD) +PG_KEYWORD("table", K_TABLE, UNRESERVED_KEYWORD) +PG_KEYWORD("table_name", K_TABLE_NAME, UNRESERVED_KEYWORD) +PG_KEYWORD("type", K_TYPE, UNRESERVED_KEYWORD) +PG_KEYWORD("use_column", K_USE_COLUMN, UNRESERVED_KEYWORD) +PG_KEYWORD("use_variable", K_USE_VARIABLE, UNRESERVED_KEYWORD) +PG_KEYWORD("variable_conflict", K_VARIABLE_CONFLICT, UNRESERVED_KEYWORD) +PG_KEYWORD("warning", K_WARNING, UNRESERVED_KEYWORD) diff --git a/src/tools/gen_keywords.pl b/src/tools/gen_keywords.pl new file mode 100644 index 0000000000..1faa14ffca --- /dev/null +++ b/src/tools/gen_keywords.pl @@ -0,0 +1,137 @@ +#---------------------------------------------------------------------- +# +# gen_keywords.pl +# Perl script that generates *kwlist_d.h from a given *kwlist.h +# keyword list file. These headers are then included into files that +# call ScanKeywordLookup() on that keyword list. The keyword name is +# is represented as an offset into a single string. +# +# Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group +# Portions Copyright (c) 1994, Regents of the University of California +# +# src/tools/gen_keywords.pl +# +#---------------------------------------------------------------------- + + +my $kw_input_file; +my $output_path = ''; +my $prefix; + +# Process command line switches. +while (@ARGV) +{ + my $arg = shift @ARGV; + if ($arg !~ /^-/) + { + $kw_input_file = $arg; + } + elsif ($arg =~ /^-o/) + { + $output_path = length($arg) > 2 ? substr($arg, 2) : shift @ARGV; + } + elsif ($arg =~ /^-p/) + { + $prefix = length($arg) > 2 ? substr($arg, 2) : shift @ARGV; + } + else + { + usage(); + } +} + +# Sanity check arguments. +die "No input file.\n" if !$kw_input_file; + +# Make sure output_path ends in a slash. +if ($output_path ne '' && substr($output_path, -1) ne '/') +{ + $output_path .= '/'; +} + +$kw_input_file =~ /((\w*)kwlist)\.h/; +my $base_filename = $1; +$prefix = $2 if !defined $prefix; +my $kw_def_file = $output_path . $base_filename . '_d.h'; + +open(my $kif, '<', $kw_input_file) || die "$kw_input_file: $!"; +open(my $kwdef, '>', $kw_def_file) || die "$kw_def_file: $!"; + +# Opening boilerplate for keyword definition header. +printf $kwdef <<EOM, $base_filename, uc $base_filename, uc $base_filename; +/*------------------------------------------------------------------------- + * + * %s_d.h + * List of keywords represented as a keyword string and offsets into it. + * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * NOTES + * ****************************** + * *** DO NOT EDIT THIS FILE! *** + * ****************************** + * + * It has been GENERATED by src/tools/gen_keywords.pl + * + *------------------------------------------------------------------------- + */ + +#ifndef %s_D_H +#define %s_D_H + +EOM + +# Parse keyword header for names. +my @keywords; +while (<$kif>) +{ + if (/^PG_KEYWORD\("(\w+)",\s*\w+,\s*\w+\)/) + { + push @keywords, $1; + } +} + +# Error out if the keyword names are not in ASCII order. +for my $i (0..$#keywords - 1) +{ + die qq|The keyword "$keywords[$i + 1]" is out of order in $kw_input_file| + if ($keywords[$i] cmp $keywords[$i + 1]) >= 0; +} + +# Emit an array of numerical offsets which will be used to index into the +# keyword string. +printf $kwdef "static const uint16 %skw_offsets[] = {\n\t", $prefix; +my $offset = 0; +foreach my $name (@keywords) +{ + print $kwdef "$offset,\n\t"; + + # Calculate the cumulative offset of the next keyword, + # taking into account the null terminator. + $offset += length($name) + 1; +} + +print $kwdef "};\n\n"; + +# Now generate the keyword string. +printf $kwdef qq|static const char %skw_strings[] =\n\t"|, $prefix; +print $kwdef join qq|\\0"\n\t"|, @keywords; +print $kwdef qq|";\n\n|; +printf $kwdef "#endif\t\t\t\t\t\t\t/* %s_D_H */\n", uc $base_filename; + + +sub usage +{ + die <<EOM; +Usage: gen_keywords.pl [options] header... + +Options: + -o output path + -p optional prefix for generated data structures + +gen_keywords.pl transforms a list of keywords into a array of offsets +into a single string. + +EOM +} diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm index 0b7cdf8dd5..d0a9505b07 100644 --- a/src/tools/msvc/Solution.pm +++ b/src/tools/msvc/Solution.pm @@ -411,6 +411,16 @@ sub GenerateFiles chdir('../../..'); } + if (IsNewer( + 'src/pl/plpgsql/src/pl_unreserved_kwlist_d.h', + 'src/pl/plpgsql/src/pl_unreserved_kwlist.h')) + { + print "Generating pl_unreserved_kwlist_d.h...\n"; + chdir('src/pl/plpgsql/src'); + system('perl ../../../tools/gen_keywords.pl pl_unreserved_kwlist.h'); + chdir('../../../..'); + } + if (IsNewer( 'src/interfaces/ecpg/preproc/preproc.y', 'src/backend/parser/gram.y')) -- 2.17.1
From 741afead540b219a2a3e82f0db6840e8ff93b0eb Mon Sep 17 00:00:00 2001 From: John Naylor <jcnay...@gmail.com> Date: Wed, 26 Dec 2018 21:34:34 -0500 Subject: [PATCH v4 2/2] Dispatch keyword lookup on the first character. In addition, use heuristics to improve chances of skipping binary search. For each legal keyword character where it makes sense, choose a common keyword and use its index for the first middle index of binary search. --- src/common/keywords.c | 30 ++++++++++--- src/include/common/keywords.h | 10 ++++- src/pl/plpgsql/src/pl_scanner.c | 6 +-- src/tools/gen_keywords.pl | 78 +++++++++++++++++++++++++++++++++ 4 files changed, 114 insertions(+), 10 deletions(-) diff --git a/src/common/keywords.c b/src/common/keywords.c index b0e5a721b6..207bc4274f 100644 --- a/src/common/keywords.c +++ b/src/common/keywords.c @@ -118,12 +118,14 @@ int ScanKeywordLookupOffset(const char *string_to_lookup, const char *kw_strings, const uint16 *kw_offsets, - int num_keywords) + const ScanKeywordRange *kw_ranges) { int len, - i; + i, + range_idx; char word[NAMEDATALEN]; const uint16 *low; + const uint16 *middle; const uint16 *high; len = strlen(string_to_lookup); @@ -145,17 +147,32 @@ ScanKeywordLookupOffset(const char *string_to_lookup, } word[len] = '\0'; + /* XXX assumes keywords can't start with '_' */ + range_idx = (int) word[0] - 'a'; + + if (word[0] < 'a' || word[0] > 'z' + || kw_ranges[range_idx].lower == PG_UINT16_MAX) + return -1; + /* * Now do a binary search using plain strcmp() comparison. */ - low = kw_offsets; - high = kw_offsets + (num_keywords - 1); + low = kw_offsets + kw_ranges[range_idx].lower; + high = kw_offsets + kw_ranges[range_idx].upper; + + if (kw_ranges[range_idx].middle != PG_UINT16_MAX) + { + middle = kw_offsets + kw_ranges[range_idx].middle; + } + else + { + middle = low + (high - low) / 2; + } + while (low <= high) { - const uint16 *middle; int difference; - middle = low + (high - low) / 2; difference = strcmp(kw_strings + *middle, word); if (difference == 0) return middle - kw_offsets; @@ -163,6 +180,7 @@ ScanKeywordLookupOffset(const char *string_to_lookup, low = middle + 1; else high = middle - 1; + middle = low + (high - low) / 2; } return -1; diff --git a/src/include/common/keywords.h b/src/include/common/keywords.h index 201d0fcc7a..348773c682 100644 --- a/src/include/common/keywords.h +++ b/src/include/common/keywords.h @@ -35,6 +35,14 @@ typedef struct ScanKeywordAux char category; /* see codes above */ } ScanKeywordAux; +/* Lower and upper indexes, and first guess, for a range of keywords. */ +typedef struct ScanKeywordRange +{ + uint16 lower; + uint16 middle; + uint16 upper; +} ScanKeywordRange; + #ifndef FRONTEND extern PGDLLIMPORT const ScanKeyword ScanKeywords[]; extern PGDLLIMPORT const int NumScanKeywords; @@ -51,6 +59,6 @@ extern const ScanKeyword *ScanKeywordLookup(const char *text, int ScanKeywordLookupOffset(const char *string_to_lookup, const char *kw_strings, const uint16 *kw_offsets, - int num_keywords); + const ScanKeywordRange *kw_ranges); #endif /* KEYWORDS_H */ diff --git a/src/pl/plpgsql/src/pl_scanner.c b/src/pl/plpgsql/src/pl_scanner.c index 0b2f331117..808bf465a0 100644 --- a/src/pl/plpgsql/src/pl_scanner.c +++ b/src/pl/plpgsql/src/pl_scanner.c @@ -261,7 +261,7 @@ plpgsql_yylex(void) (kwnum = ScanKeywordLookupOffset(aux1.lval.word.ident, pl_unreserved_kw_strings, pl_unreserved_kw_offsets, - num_unreserved_keywords)) >= 0) + pl_unreserved_kw_ranges)) >= 0) { aux1.lval.keyword = pl_unreserved_kw_strings + pl_unreserved_kw_offsets[kwnum]; @@ -293,7 +293,7 @@ plpgsql_yylex(void) (kwnum = ScanKeywordLookupOffset(aux1.lval.str, pl_unreserved_kw_strings, pl_unreserved_kw_offsets, - num_unreserved_keywords)) >= 0) + pl_unreserved_kw_ranges)) >= 0) { aux1.lval.keyword = pl_unreserved_kw_strings + pl_unreserved_kw_offsets[kwnum]; @@ -319,7 +319,7 @@ plpgsql_yylex(void) (kwnum = ScanKeywordLookupOffset(aux1.lval.word.ident, pl_unreserved_kw_strings, pl_unreserved_kw_offsets, - num_unreserved_keywords)) >= 0) + pl_unreserved_kw_ranges)) >= 0) { aux1.lval.keyword = pl_unreserved_kw_strings + pl_unreserved_kw_offsets[kwnum]; diff --git a/src/tools/gen_keywords.pl b/src/tools/gen_keywords.pl index 1faa14ffca..6678bfc72f 100644 --- a/src/tools/gen_keywords.pl +++ b/src/tools/gen_keywords.pl @@ -99,6 +99,84 @@ for my $i (0..$#keywords - 1) if ($keywords[$i] cmp $keywords[$i + 1]) >= 0; } +# These are the most common core keywords for each letter of the alphabet, +# if there is a clear winner, preferably near the middle of the range. +my %FIRST_GUESS = ( + and => 1, + commit => 1, # demo for WIP, since it's a plpgsql unreserved keyword + delete => 1, + exists => 1, + from => 1, + group => 1, + having => 1, + in => 1, + join => 1, + limit => 1, + not => 1, + or => 1, + select => 1, + then => 1, + update => 1, + values => 1, + where => 1, +); + +# Save the min and max indexes and a first guess middle index for each range +# of keywords starting with the same first character. +my $index = 0; +my $lower = 0; +my $middle; +my $first = 1; +my $curr_char = substr($keywords[0], 0, 1); +my %range; +foreach my $name (@keywords) +{ + if (substr($name, 0, 1) ne $curr_char) + { + $range{$curr_char} = {lower => $lower, middle => $middle, upper => $index}; + + # Set values for next range. + $curr_char = substr($name, 0, 1); + $index++; + $lower = $index; + $middle = undef; + } + elsif ($first == 1) + { + $first = 0; + } + else + { + $index++; + } + + # Save the index if the keyword is our first guess for the current + # first character. + if (exists $FIRST_GUESS{$name}) + { + $middle = $index; + } +} + +# Save last member of the list. +$range{$curr_char} = {lower => $lower, middle => $middle, upper => $index}; + +# Emit array of the upper, middle and lower indexes that we saved earlier. +# To keep the array small, we count from 'a', the lowest character that can +# start a keyword. +# XXX assumes keywords can't start with '_' +printf $kwdef "static const ScanKeywordRange %skw_ranges[] = {\n\t", $prefix; +foreach my $c ('a'..'z') +{ + my $char = $range{$c}; + + printf $kwdef "{%s, %s, %s},\n\t", + defined($char->{lower}) ? $char->{lower} : 'PG_UINT16_MAX', + defined($char->{middle}) ? $char->{middle} : 'PG_UINT16_MAX', + defined($char->{upper}) ? $char->{upper} : 'PG_UINT16_MAX', +} +print $kwdef "};\n\n"; + # Emit an array of numerical offsets which will be used to index into the # keyword string. printf $kwdef "static const uint16 %skw_offsets[] = {\n\t", $prefix; -- 2.17.1