John Naylor <john.nay...@2ndquadrant.com> writes: > On Tue, Jan 8, 2019 at 12:06 PM Andrew Dunstan > <andrew.duns...@2ndquadrant.com> wrote: >> If he doesn't I will.
> I'll take a crack at separating into a module. I'll wait a bit in > case there are any stylistic suggestions on the patch as it stands. I had a go at that myself. I'm sure there's plenty to criticize in the result, but at least it passes make check-world ;-) I resolved the worry I had last night about the range of table values by putting in logic to check the range and choose a suitable table element type. There are a couple of existing calls where we manage to fit the hashtable elements into int8 that way; of course, by definition that doesn't save a whole lot of space since such tables couldn't have many elements, but it seems cleaner anyway. regards, tom lane
diff --git a/src/common/Makefile b/src/common/Makefile index 317b071..d0c2b97 100644 *** a/src/common/Makefile --- b/src/common/Makefile *************** OBJS_FRONTEND = $(OBJS_COMMON) fe_memuti *** 63,68 **** --- 63,73 ---- OBJS_SHLIB = $(OBJS_FRONTEND:%.o=%_shlib.o) OBJS_SRV = $(OBJS_COMMON:%.o=%_srv.o) + # where to find gen_keywordlist.pl and subsidiary files + TOOLSDIR = $(top_srcdir)/src/tools + GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl + GEN_KEYWORDLIST_DEPS = $(TOOLSDIR)/gen_keywordlist.pl $(TOOLSDIR)/PerfectHash.pm + all: libpgcommon.a libpgcommon_shlib.a libpgcommon_srv.a distprep: kwlist_d.h *************** libpgcommon_srv.a: $(OBJS_SRV) *** 118,125 **** $(CC) $(CFLAGS) $(subst -DFRONTEND,, $(CPPFLAGS)) -c $< -o $@ # generate SQL keyword lookup table to be included into keywords*.o. ! kwlist_d.h: $(top_srcdir)/src/include/parser/kwlist.h $(top_srcdir)/src/tools/gen_keywordlist.pl ! $(PERL) $(top_srcdir)/src/tools/gen_keywordlist.pl --extern $< # Dependencies of keywords*.o need to be managed explicitly to make sure # that you don't get broken parsing code, even in a non-enable-depend build. --- 123,130 ---- $(CC) $(CFLAGS) $(subst -DFRONTEND,, $(CPPFLAGS)) -c $< -o $@ # generate SQL keyword lookup table to be included into keywords*.o. ! kwlist_d.h: $(top_srcdir)/src/include/parser/kwlist.h $(GEN_KEYWORDLIST_DEPS) ! $(GEN_KEYWORDLIST) --extern $< # Dependencies of keywords*.o need to be managed explicitly to make sure # that you don't get broken parsing code, even in a non-enable-depend build. diff --git a/src/common/kwlookup.c b/src/common/kwlookup.c index d72842e..9dc1fee 100644 *** a/src/common/kwlookup.c --- b/src/common/kwlookup.c *************** *** 35,94 **** * receive a different case-normalization mapping. */ int ! ScanKeywordLookup(const char *text, const ScanKeywordList *keywords) { ! int len, ! i; ! char word[NAMEDATALEN]; ! const char *kw_string; ! const uint16 *kw_offsets; ! const uint16 *low; ! const uint16 *high; ! ! len = strlen(text); if (len > keywords->max_kw_len) ! return -1; /* too long to be any keyword */ ! ! /* We assume all keywords are shorter than NAMEDATALEN. */ ! Assert(len < NAMEDATALEN); /* ! * Apply an ASCII-only downcasing. We must not use tolower() since it may ! * produce the wrong translation in some locales (eg, Turkish). */ ! for (i = 0; i < len; i++) ! { ! char ch = text[i]; ! if (ch >= 'A' && ch <= 'Z') ! ch += 'a' - 'A'; ! word[i] = ch; ! } ! word[len] = '\0'; /* ! * Now do a binary search using plain strcmp() comparison. */ ! kw_string = keywords->kw_string; ! kw_offsets = keywords->kw_offsets; ! low = kw_offsets; ! high = kw_offsets + (keywords->num_keywords - 1); ! while (low <= high) { ! const uint16 *middle; ! int difference; ! middle = low + (high - low) / 2; ! difference = strcmp(kw_string + *middle, word); ! if (difference == 0) ! return middle - kw_offsets; ! else if (difference < 0) ! low = middle + 1; ! else ! high = middle - 1; } ! return -1; } --- 35,89 ---- * receive a different case-normalization mapping. */ int ! ScanKeywordLookup(const char *str, const ScanKeywordList *keywords) { ! size_t len; ! int h; ! const char *kw; + /* + * Reject immediately if too long to be any keyword. This saves useless + * hashing and downcasing work on long strings. + */ + len = strlen(str); if (len > keywords->max_kw_len) ! return -1; /* ! * Compute the hash function. We assume it was generated to produce ! * case-insensitive results. Since it's a perfect hash, we need only ! * match to the specific keyword it identifies. */ ! h = keywords->hash(str, len); ! /* ! * An out-of-range result implies no match. (This can happen for ! * non-keyword inputs because the hash function will sum two unrelated ! * hashtable entries.) ! */ ! if (h < 0 || h >= keywords->num_keywords) ! return -1; /* ! * Compare character-by-character to see if we have a match, applying an ! * ASCII-only downcasing to the input characters. We must not use ! * tolower() since it may produce the wrong translation in some locales ! * (eg, Turkish). */ ! kw = GetScanKeyword(h, keywords); ! while (*str != '\0') { ! char ch = *str++; ! if (ch >= 'A' && ch <= 'Z') ! ch += 'a' - 'A'; ! if (ch != *kw++) ! return -1; } + if (*kw != '\0') + return -1; ! /* Success! */ ! return h; } diff --git a/src/include/common/kwlookup.h b/src/include/common/kwlookup.h index 39efb35..dbff367 100644 *** a/src/include/common/kwlookup.h --- b/src/include/common/kwlookup.h *************** *** 14,19 **** --- 14,22 ---- #ifndef KWLOOKUP_H #define KWLOOKUP_H + /* Hash function used by ScanKeywordLookup */ + typedef int (*ScanKeywordHashFunc) (const void *key, size_t keylen); + /* * This struct contains the data needed by ScanKeywordLookup to perform a * search within a set of keywords. The contents are typically generated by *************** typedef struct ScanKeywordList *** 23,28 **** --- 26,32 ---- { const char *kw_string; /* all keywords in order, separated by \0 */ const uint16 *kw_offsets; /* offsets to the start of each keyword */ + ScanKeywordHashFunc hash; /* perfect hash function for keywords */ int num_keywords; /* number of keywords */ int max_kw_len; /* length of longest keyword */ } ScanKeywordList; diff --git a/src/interfaces/ecpg/preproc/Makefile b/src/interfaces/ecpg/preproc/Makefile index b5b74a3..6c02f97 100644 *** a/src/interfaces/ecpg/preproc/Makefile --- b/src/interfaces/ecpg/preproc/Makefile *************** OBJS= preproc.o pgc.o type.o ecpg.o outp *** 28,34 **** keywords.o c_keywords.o ecpg_keywords.o typename.o descriptor.o variable.o \ $(WIN32RES) ! GEN_KEYWORDLIST = $(top_srcdir)/src/tools/gen_keywordlist.pl # Suppress parallel build to avoid a bug in GNU make 3.82 # (see comments in ../Makefile) --- 28,37 ---- keywords.o c_keywords.o ecpg_keywords.o typename.o descriptor.o variable.o \ $(WIN32RES) ! # where to find gen_keywordlist.pl and subsidiary files ! TOOLSDIR = $(top_srcdir)/src/tools ! GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl ! GEN_KEYWORDLIST_DEPS = $(TOOLSDIR)/gen_keywordlist.pl $(TOOLSDIR)/PerfectHash.pm # Suppress parallel build to avoid a bug in GNU make 3.82 # (see comments in ../Makefile) *************** preproc.y: ../../../backend/parser/gram. *** 56,66 **** $(PERL) $(srcdir)/check_rules.pl $(srcdir) $< # generate keyword headers ! c_kwlist_d.h: c_kwlist.h $(GEN_KEYWORDLIST) ! $(PERL) $(GEN_KEYWORDLIST) --varname ScanCKeywords $< ! ecpg_kwlist_d.h: ecpg_kwlist.h $(GEN_KEYWORDLIST) ! $(PERL) $(GEN_KEYWORDLIST) --varname ScanECPGKeywords $< # Force these dependencies to be known even without dependency info built: ecpg_keywords.o c_keywords.o keywords.o preproc.o pgc.o parser.o: preproc.h --- 59,69 ---- $(PERL) $(srcdir)/check_rules.pl $(srcdir) $< # generate keyword headers ! c_kwlist_d.h: c_kwlist.h $(GEN_KEYWORDLIST_DEPS) ! $(GEN_KEYWORDLIST) --varname ScanCKeywords --case $< ! ecpg_kwlist_d.h: ecpg_kwlist.h $(GEN_KEYWORDLIST_DEPS) ! $(GEN_KEYWORDLIST) --varname ScanECPGKeywords $< # Force these dependencies to be known even without dependency info built: ecpg_keywords.o c_keywords.o keywords.o preproc.o pgc.o parser.o: preproc.h diff --git a/src/interfaces/ecpg/preproc/c_keywords.c b/src/interfaces/ecpg/preproc/c_keywords.c index 38ddf6f..387298b 100644 *** a/src/interfaces/ecpg/preproc/c_keywords.c --- b/src/interfaces/ecpg/preproc/c_keywords.c *************** *** 9,16 **** */ #include "postgres_fe.h" - #include <ctype.h> - #include "preproc_extern.h" #include "preproc.h" --- 9,14 ---- *************** static const uint16 ScanCKeywordTokens[] *** 32,70 **** * * Returns the token value of the keyword, or -1 if no match. * ! * Do a binary search using plain strcmp() comparison. This is much like * ScanKeywordLookup(), except we want case-sensitive matching. */ int ! ScanCKeywordLookup(const char *text) { ! const char *kw_string; ! const uint16 *kw_offsets; ! const uint16 *low; ! const uint16 *high; ! if (strlen(text) > ScanCKeywords.max_kw_len) ! return -1; /* too long to be any keyword */ ! kw_string = ScanCKeywords.kw_string; ! kw_offsets = ScanCKeywords.kw_offsets; ! low = kw_offsets; ! high = kw_offsets + (ScanCKeywords.num_keywords - 1); ! while (low <= high) ! { ! const uint16 *middle; ! int difference; ! middle = low + (high - low) / 2; ! difference = strcmp(kw_string + *middle, text); ! if (difference == 0) ! return ScanCKeywordTokens[middle - kw_offsets]; ! else if (difference < 0) ! low = middle + 1; ! else ! high = middle - 1; ! } return -1; } --- 30,71 ---- * * Returns the token value of the keyword, or -1 if no match. * ! * Do a hash search using plain strcmp() comparison. This is much like * ScanKeywordLookup(), except we want case-sensitive matching. */ int ! ScanCKeywordLookup(const char *str) { ! size_t len; ! int h; ! const char *kw; ! /* ! * Reject immediately if too long to be any keyword. This saves useless ! * hashing work on long strings. ! */ ! len = strlen(str); ! if (len > ScanCKeywords.max_kw_len) ! return -1; ! /* ! * Compute the hash function. Since it's a perfect hash, we need only ! * match to the specific keyword it identifies. ! */ ! h = ScanCKeywords_hash_func(str, len); ! /* ! * An out-of-range result implies no match. (This can happen for ! * non-keyword inputs because the hash function will sum two unrelated ! * hashtable entries.) ! */ ! if (h < 0 || h >= ScanCKeywords.num_keywords) ! return -1; ! kw = GetScanKeyword(h, &ScanCKeywords); ! ! if (strcmp(kw, str) == 0) ! return ScanCKeywordTokens[h]; return -1; } diff --git a/src/pl/plpgsql/src/Makefile b/src/pl/plpgsql/src/Makefile index 9dd4a74..8a0f294 100644 *** a/src/pl/plpgsql/src/Makefile --- b/src/pl/plpgsql/src/Makefile *************** REGRESS_OPTS = --dbname=$(PL_TESTDB) *** 29,35 **** REGRESS = plpgsql_call plpgsql_control plpgsql_domain plpgsql_record \ plpgsql_cache plpgsql_transaction plpgsql_varprops ! GEN_KEYWORDLIST = $(top_srcdir)/src/tools/gen_keywordlist.pl all: all-lib --- 29,38 ---- REGRESS = plpgsql_call plpgsql_control plpgsql_domain plpgsql_record \ plpgsql_cache plpgsql_transaction plpgsql_varprops ! # where to find gen_keywordlist.pl and subsidiary files ! TOOLSDIR = $(top_srcdir)/src/tools ! GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl ! GEN_KEYWORDLIST_DEPS = $(TOOLSDIR)/gen_keywordlist.pl $(TOOLSDIR)/PerfectHash.pm all: all-lib *************** plerrcodes.h: $(top_srcdir)/src/backend/ *** 76,86 **** $(PERL) $(srcdir)/generate-plerrcodes.pl $< > $@ # generate keyword headers for the scanner ! pl_reserved_kwlist_d.h: pl_reserved_kwlist.h $(GEN_KEYWORDLIST) ! $(PERL) $(GEN_KEYWORDLIST) --varname ReservedPLKeywords $< ! pl_unreserved_kwlist_d.h: pl_unreserved_kwlist.h $(GEN_KEYWORDLIST) ! $(PERL) $(GEN_KEYWORDLIST) --varname UnreservedPLKeywords $< check: submake --- 79,89 ---- $(PERL) $(srcdir)/generate-plerrcodes.pl $< > $@ # generate keyword headers for the scanner ! pl_reserved_kwlist_d.h: pl_reserved_kwlist.h $(GEN_KEYWORDLIST_DEPS) ! $(GEN_KEYWORDLIST) --varname ReservedPLKeywords $< ! pl_unreserved_kwlist_d.h: pl_unreserved_kwlist.h $(GEN_KEYWORDLIST_DEPS) ! $(GEN_KEYWORDLIST) --varname UnreservedPLKeywords $< check: submake diff --git a/src/tools/PerfectHash.pm b/src/tools/PerfectHash.pm index ...34d55cf . *** a/src/tools/PerfectHash.pm --- b/src/tools/PerfectHash.pm *************** *** 0 **** --- 1,336 ---- + #---------------------------------------------------------------------- + # + # PerfectHash.pm + # Perl module that constructs minimal perfect hash functions + # + # This code constructs a minimal perfect hash function for the given + # set of keys, using an algorithm described in + # "An optimal algorithm for generating minimal perfect hash functions" + # by Czech, Havas and Majewski in Information Processing Letters, + # 43(5):256-264, October 1992. + # This implementation is loosely based on NetBSD's "nbperf", + # which was written by Joerg Sonnenberger. + # + # The resulting hash function is perfect in the sense that if the presented + # key is one of the original set, it will return the key's index in the set + # (in range 0..N-1). However, the caller must still verify the match, + # as false positives are possible. Also, the hash function may return + # values that are out of range (negative, or >= N). This indicates that + # the presented key is definitely not in the set. + # + # + # Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + # Portions Copyright (c) 1994, Regents of the University of California + # + # src/tools/PerfectHash.pm + # + #---------------------------------------------------------------------- + + package PerfectHash; + + use strict; + use warnings; + use Exporter 'import'; + + our @EXPORT_OK = qw( + generate_hash_function + ); + + + # At runtime, we'll compute two simple hash functions of the input key, + # and use them to index into a mapping table. The hash functions are just + # multiply-and-add in uint32 arithmetic, with different multipliers but + # the same initial seed. All the complexity in this module is concerned + # with selecting hash parameters that will work and building the mapping + # table. + + # We support making case-insensitive hash functions, though this only + # works for a strict-ASCII interpretation of case insensitivity, + # ie, A-Z maps onto a-z and nothing else. + my $case_insensitive = 0; + + + # + # Construct a C function implementing a perfect hash for the given keys. + # The C function definition is returned as a string. + # + # The keys can be any set of Perl strings; it is caller's responsibility + # that there not be any duplicates. (Note that the "strings" can be + # binary data, but endianness is the caller's problem.) + # + # The name to use for the function is caller-specified, but its signature + # is always "int f(const void *key, size_t keylen)". The caller may + # prepend "static " to the result string if it wants a static function. + # + # If $ci is true, the function is case-insensitive, for the limited idea + # of case-insensitivity explained above. + # + sub generate_hash_function + { + my ($keys_ref, $funcname, $ci) = @_; + + # It's not worth passing this around as a parameter; just use a global. + $case_insensitive = $ci; + + # Try different hash function parameters until we find a set that works + # for these keys. In principle we might need to change multipliers, + # but these two multipliers are chosen to be cheap to calculate via + # shift-and-add, so don't change them except at great need. + my $hash_mult1 = 31; + my $hash_mult2 = 37; + + # We just try successive hash seed values until we find one that works. + # (Commonly, random seeds are tried, but we want reproducible results + # from this program so we don't do that.) + my $hash_seed; + my @subresult; + for ($hash_seed = 0;; $hash_seed++) + { + @subresult = + _construct_hash_table($keys_ref, $hash_mult1, $hash_mult2, + $hash_seed); + last if @subresult; + } + + # Extract info from the function result array. + my $elemtype = $subresult[0]; + my @hashtab = @{ $subresult[1] }; + my $nhash = scalar(@hashtab); + + # OK, construct the hash function definition including the hash table. + my $f = ''; + $f .= sprintf "int\n"; + $f .= sprintf "%s(const void *key, size_t keylen)\n{\n", $funcname; + $f .= sprintf "\tstatic const %s h[%d] = {\n", $elemtype, $nhash; + for (my $i = 0; $i < $nhash; $i++) + { + $f .= sprintf "%s%6d,%s", + ($i % 8 == 0 ? "\t\t" : " "), + $hashtab[$i], + ($i % 8 == 7 ? "\n" : ""); + } + $f .= sprintf "\n" if ($nhash % 8 != 0); + $f .= sprintf "\t};\n\n"; + $f .= sprintf "\tconst unsigned char *k = key;\n"; + $f .= sprintf "\tuint32\t\ta = %d;\n", $hash_seed; + $f .= sprintf "\tuint32\t\tb = %d;\n\n", $hash_seed; + $f .= sprintf "\twhile (keylen--)\n\t{\n"; + $f .= sprintf "\t\tunsigned char c = *k++"; + $f .= sprintf " | 0x20" if $case_insensitive; # see comment below + $f .= sprintf ";\n\n"; + $f .= sprintf "\t\ta = a * %d + c;\n", $hash_mult1; + $f .= sprintf "\t\tb = b * %d + c;\n", $hash_mult2; + $f .= sprintf "\t}\n"; + $f .= sprintf "\treturn h[a %% %d] + h[b %% %d];\n", $nhash, $nhash; + $f .= sprintf "}\n"; + + return $f; + } + + + # Calculate a hash function as the run-time code will do. + # + # If we are making a case-insensitive hash function, we implement that + # by OR'ing 0x20 into each byte of the key. This correctly transforms + # upper-case ASCII into lower-case ASCII, while not changing digits or + # dollar signs. (It does change '_', else we could just skip adjusting + # $cn here at all, for typical keyword strings.) + sub _calc_hash + { + my ($key, $mult, $seed) = @_; + + my $result = $seed; + for my $c (split //, $key) + { + my $cn = ord($c); + $cn |= 0x20 if $case_insensitive; + $result = ($result * $mult + $cn) % 4294967296; + } + return $result; + } + + + # Attempt to construct a mapping table for a minimal perfect hash function + # for the given keys, using the specified hash parameters. + # + # Returns an array containing the mapping table element type name as the + # first element, and a ref to an array of the table values as the second. + # + # Returns an empty array on failure; then caller should choose different + # hash parameter(s) and try again. + sub _construct_hash_table + { + my ($keys_ref, $hash_mult1, $hash_mult2, $hash_seed) = @_; + my @keys = @{$keys_ref}; + + # This algorithm is based on a graph whose edges correspond to the + # keys and whose vertices correspond to entries of the mapping table. + # A key edge links the two vertices whose indexes are the outputs of + # the two hash functions for that key. For K keys, the mapping + # table must have at least 2*K+1 entries, guaranteeing that there's at + # least one unused entry. (In principle, larger mapping tables make it + # easier to find a workable hash and increase the number of inputs that + # can be rejected due to touching unused hashtable entries. In practice, + # neither effect seems strong enough to justify using a larger table.) + my $nedges = scalar @keys; # number of edges + my $nverts = 2 * $nedges + 1; # number of vertices + + # Initialize the array of edges. + my @E = (); + foreach my $kw (@keys) + { + # Calculate hashes for this key. + # The hashes are immediately reduced modulo the mapping table size. + my $hash1 = _calc_hash($kw, $hash_mult1, $hash_seed) % $nverts; + my $hash2 = _calc_hash($kw, $hash_mult2, $hash_seed) % $nverts; + + # If the two hashes are the same for any key, we have to fail + # since this edge would itself form a cycle in the graph. + return () if $hash1 == $hash2; + + # Add the edge for this key. + push @E, { left => $hash1, right => $hash2 }; + } + + # Initialize the array of vertices, giving them all empty lists + # of associated edges. (The lists will be hashes of edge numbers.) + my @V = (); + for (my $v = 0; $v < $nverts; $v++) + { + push @V, { edges => {} }; + } + + # Insert each edge in the lists of edges using its vertices. + for (my $e = 0; $e < $nedges; $e++) + { + my $v = $E[$e]{left}; + $V[$v]{edges}->{$e} = 1; + + $v = $E[$e]{right}; + $V[$v]{edges}->{$e} = 1; + } + + # Now we attempt to prove the graph acyclic. + # A cycle-free graph is either empty or has some vertex of degree 1. + # Removing the edge attached to that vertex doesn't change this property, + # so doing that repeatedly will reduce the size of the graph. + # If the graph is empty at the end of the process, it was acyclic. + # We track the order of edge removal so that the next phase can process + # them in reverse order of removal. + my @output_order = (); + + # Consider each vertex as a possible starting point for edge-removal. + for (my $startv = 0; $startv < $nverts; $startv++) + { + my $v = $startv; + + # If vertex v is of degree 1 (i.e. exactly 1 edge connects to it), + # remove that edge, and then consider the edge's other vertex to see + # if it is now of degree 1. The inner loop repeats until reaching a + # vertex not of degree 1. + while (scalar(keys(%{ $V[$v]{edges} })) == 1) + { + # Unlink its only edge. + my $e = (keys(%{ $V[$v]{edges} }))[0]; + delete($V[$v]{edges}->{$e}); + + # Unlink the edge from its other vertex, too. + my $v2 = $E[$e]{left}; + $v2 = $E[$e]{right} if ($v2 == $v); + delete($V[$v2]{edges}->{$e}); + + # Push e onto the front of the output-order list. + unshift @output_order, $e; + + # Consider v2 on next iteration of inner loop. + $v = $v2; + } + } + + # We succeeded only if all edges were removed from the graph. + return () if (scalar(@output_order) != $nedges); + + # OK, build the hash table of size $nverts. + my @hashtab = (0) x $nverts; + # We need a "visited" flag array in this step, too. + my @visited = (0) x $nverts; + + # The idea is that for any key, the sum of the hash table entries + # for its first and second hash values is the desired output (i.e., the + # key number). By assigning hash table values in the selected edge + # order, we can guarantee that that's true. + foreach my $e (@output_order) + { + my $l = $E[$e]{left}; + my $r = $E[$e]{right}; + if (!$visited[$l]) + { + # $hashtab[$r] might be zero, or some previously assigned value. + $hashtab[$l] = $e - $hashtab[$r]; + } + else + { + die "oops, doubly used hashtab entry" if $visited[$r]; + # $hashtab[$l] might be zero, or some previously assigned value. + $hashtab[$r] = $e - $hashtab[$l]; + } + # Now freeze both of these hashtab entries. + $visited[$l] = 1; + $visited[$r] = 1; + } + + # Detect range of values needed in hash table. + my $hmin = $nedges; + my $hmax = 0; + for (my $v = 0; $v < $nverts; $v++) + { + $hmin = $hashtab[$v] if $hashtab[$v] < $hmin; + $hmax = $hashtab[$v] if $hashtab[$v] > $hmax; + } + + # Choose width of hashtable entries. In addition to the actual values, + # we need to be able to store a flag for unused entries, and we wish to + # have the property that adding any other entry value to the flag gives + # an out-of-range result (>= $nedges). + my $elemtype; + my $unused_flag; + + if ( $hmin >= -0x7F + && $hmax <= 0x7F + && $hmin + 0x7F >= $nedges) + { + # int8 will work + $elemtype = 'int8'; + $unused_flag = 0x7F; + } + elsif ($hmin >= -0x7FFF + && $hmax <= 0x7FFF + && $hmin + 0x7FFF >= $nedges) + { + # int16 will work + $elemtype = 'int16'; + $unused_flag = 0x7FFF; + } + elsif ($hmin >= -0x7FFFFFFF + && $hmax <= 0x7FFFFFFF + && $hmin + 0x3FFFFFFF >= $nedges) + { + # int32 will work + $elemtype = 'int32'; + $unused_flag = 0x3FFFFFFF; + } + else + { + die "hash table values too wide"; + } + + # Set any unvisited hashtable entries to $unused_flag. + for (my $v = 0; $v < $nverts; $v++) + { + $hashtab[$v] = $unused_flag if !$visited[$v]; + } + + return ($elemtype, \@hashtab); + } + + 1; diff --git a/src/tools/gen_keywordlist.pl b/src/tools/gen_keywordlist.pl index d764aff..e912c3e 100644 *** a/src/tools/gen_keywordlist.pl --- b/src/tools/gen_keywordlist.pl *************** *** 14,19 **** --- 14,25 ---- # variable named according to the -v switch ("ScanKeywords" by default). # The variable is marked "static" unless the -e switch is given. # + # ScanKeywordList uses hash-based lookup, so this script also selects + # a minimal perfect hash function for the keyword set, and emits a + # static hash function that is referenced in the ScanKeywordList struct. + # The hash function is case-insensitive unless --case is specified. + # Note that case insensitivity assumes all-ASCII keywords! + # # # Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group # Portions Copyright (c) 1994, Regents of the University of California *************** *** 25,39 **** use strict; use warnings; use Getopt::Long; my $output_path = ''; my $extern = 0; my $varname = 'ScanKeywords'; GetOptions( ! 'output:s' => \$output_path, ! 'extern' => \$extern, ! 'varname:s' => \$varname) || usage(); my $kw_input_file = shift @ARGV || die "No input file.\n"; --- 31,48 ---- use strict; use warnings; use Getopt::Long; + use PerfectHash; my $output_path = ''; my $extern = 0; + my $case_sensitive = 0; my $varname = 'ScanKeywords'; GetOptions( ! 'output:s' => \$output_path, ! 'extern' => \$extern, ! 'case-sensitive' => \$case_sensitive, ! 'varname:s' => \$varname) || usage(); my $kw_input_file = shift @ARGV || die "No input file.\n"; *************** while (<$kif>) *** 87,93 **** --- 96,117 ---- } } + # When being case-insensitive, insist that the input be all-lower-case. + if (!$case_sensitive) + { + foreach my $kw (@keywords) + { + die qq|The keyword "$kw" is not lower-case in $kw_input_file\n| + if ($kw ne lc $kw); + } + } + # Error out if the keyword names are not in ASCII order. + # + # While this isn't really necessary with hash-based lookup, it's still + # helpful because it provides a cheap way to reject duplicate keywords. + # Also, insisting on sorted order ensures that code that scans the keyword + # table linearly will see the keywords in a canonical order. for my $i (0..$#keywords - 1) { die qq|The keyword "$keywords[$i + 1]" is out of order in $kw_input_file\n| *************** print $kwdef "};\n\n"; *** 128,142 **** printf $kwdef "#define %s_NUM_KEYWORDS %d\n\n", uc $varname, scalar @keywords; # Emit the struct that wraps all this lookup info into one variable. ! print $kwdef "static " if !$extern; printf $kwdef "const ScanKeywordList %s = {\n", $varname; printf $kwdef qq|\t%s_kw_string,\n|, $varname; printf $kwdef qq|\t%s_kw_offsets,\n|, $varname; printf $kwdef qq|\t%s_NUM_KEYWORDS,\n|, uc $varname; printf $kwdef qq|\t%d\n|, $max_len; ! print $kwdef "};\n\n"; printf $kwdef "#endif\t\t\t\t\t\t\t/* %s_H */\n", uc $base_filename; --- 152,176 ---- printf $kwdef "#define %s_NUM_KEYWORDS %d\n\n", uc $varname, scalar @keywords; + # Emit the definition of the hash function. + + my $funcname = $varname . "_hash_func"; + + my $f = PerfectHash::generate_hash_function(\@keywords, + $funcname, !$case_sensitive); + + printf $kwdef qq|static %s\n|, $f; + # Emit the struct that wraps all this lookup info into one variable. ! printf $kwdef "static " if !$extern; printf $kwdef "const ScanKeywordList %s = {\n", $varname; printf $kwdef qq|\t%s_kw_string,\n|, $varname; printf $kwdef qq|\t%s_kw_offsets,\n|, $varname; + printf $kwdef qq|\t%s,\n|, $funcname; printf $kwdef qq|\t%s_NUM_KEYWORDS,\n|, uc $varname; printf $kwdef qq|\t%d\n|, $max_len; ! printf $kwdef "};\n\n"; printf $kwdef "#endif\t\t\t\t\t\t\t/* %s_H */\n", uc $base_filename; *************** Usage: gen_keywordlist.pl [--output/-o < *** 148,153 **** --- 182,188 ---- --output Output directory (default '.') --varname Name for ScanKeywordList variable (default 'ScanKeywords') --extern Allow the ScanKeywordList variable to be globally visible + --case Keyword matching is to be case-sensitive gen_keywordlist.pl transforms a list of keywords into a ScanKeywordList. The output filename is derived from the input file by inserting _d, diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm index 937bf18..8f54e45 100644 *** a/src/tools/msvc/Solution.pm --- b/src/tools/msvc/Solution.pm *************** sub GenerateFiles *** 414,420 **** 'src/include/parser/kwlist.h')) { print "Generating kwlist_d.h...\n"; ! system('perl src/tools/gen_keywordlist.pl --extern -o src/common src/include/parser/kwlist.h'); } if (IsNewer( --- 414,420 ---- 'src/include/parser/kwlist.h')) { print "Generating kwlist_d.h...\n"; ! system('perl -I src/tools src/tools/gen_keywordlist.pl --extern -o src/common src/include/parser/kwlist.h'); } if (IsNewer( *************** sub GenerateFiles *** 426,433 **** { print "Generating pl_reserved_kwlist_d.h and pl_unreserved_kwlist_d.h...\n"; chdir('src/pl/plpgsql/src'); ! system('perl ../../../tools/gen_keywordlist.pl --varname ReservedPLKeywords pl_reserved_kwlist.h'); ! system('perl ../../../tools/gen_keywordlist.pl --varname UnreservedPLKeywords pl_unreserved_kwlist.h'); chdir('../../../..'); } --- 426,433 ---- { print "Generating pl_reserved_kwlist_d.h and pl_unreserved_kwlist_d.h...\n"; chdir('src/pl/plpgsql/src'); ! system('perl -I ../../../tools ../../../tools/gen_keywordlist.pl --varname ReservedPLKeywords pl_reserved_kwlist.h'); ! system('perl -I ../../../tools ../../../tools/gen_keywordlist.pl --varname UnreservedPLKeywords pl_unreserved_kwlist.h'); chdir('../../../..'); } *************** sub GenerateFiles *** 440,447 **** { print "Generating c_kwlist_d.h and ecpg_kwlist_d.h...\n"; chdir('src/interfaces/ecpg/preproc'); ! system('perl ../../../tools/gen_keywordlist.pl --varname ScanCKeywords c_kwlist.h'); ! system('perl ../../../tools/gen_keywordlist.pl --varname ScanECPGKeywords ecpg_kwlist.h'); chdir('../../../..'); } --- 440,447 ---- { print "Generating c_kwlist_d.h and ecpg_kwlist_d.h...\n"; chdir('src/interfaces/ecpg/preproc'); ! system('perl -I ../../../tools ../../../tools/gen_keywordlist.pl --varname ScanCKeywords --case c_kwlist.h'); ! system('perl -I ../../../tools ../../../tools/gen_keywordlist.pl --varname ScanECPGKeywords ecpg_kwlist.h'); chdir('../../../..'); }