From c64d0cd5ce24a344798534f1bc5827a9199b7a6e Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 9 Jan 2019 19:47:38 -0500 Subject: [PATCH] Use perfect hashing, instead of binary search, for keyword lookup. We've been speculating for a long time that hash-based keyword lookup ought to be faster than binary search, but up to now we hadn't found a suitable tool for generating the hash function. Joerg Sonnenberger provided the inspiration, and sample code, to show us that rolling our own generator wasn't a ridiculous idea. Hence, do that. The method used here requires a lookup table of approximately 4 bytes per keyword, but that's less than what we saved in the predecessor commit afb0d0712, so it's not a big problem. The time savings is indeed significant: preliminary testing suggests that the total time for raw parsing (flex + bison phases) drops by ~20%. Patch by me, but it owes its existence to Joerg Sonnenberger; thanks also to John Naylor for review. Discussion: https://postgr.es/m/20190103163340.GA15803@britannica.bec.de --- src/common/Makefile | 9 +- src/common/kwlookup.c | 81 +++-- src/include/common/kwlookup.h | 4 + src/include/parser/kwlist.h | 3 +- src/interfaces/ecpg/preproc/Makefile | 13 +- src/interfaces/ecpg/preproc/c_keywords.c | 51 ++- src/interfaces/ecpg/preproc/c_kwlist.h | 3 +- src/interfaces/ecpg/preproc/ecpg_kwlist.h | 3 +- src/pl/plpgsql/src/Makefile | 13 +- src/pl/plpgsql/src/pl_reserved_kwlist.h | 5 +- src/pl/plpgsql/src/pl_unreserved_kwlist.h | 7 +- src/tools/PerfectHash.pm | 376 ++++++++++++++++++++++ src/tools/gen_keywordlist.pl | 53 ++- src/tools/msvc/Solution.pm | 10 +- 14 files changed, 520 insertions(+), 111 deletions(-) create mode 100644 src/tools/PerfectHash.pm diff --git a/src/common/Makefile b/src/common/Makefile index 317b071e02..d0c2b970eb 100644 --- a/src/common/Makefile +++ b/src/common/Makefile @@ -63,6 +63,11 @@ OBJS_FRONTEND = $(OBJS_COMMON) fe_memutils.o file_utils.o restricted_token.o OBJS_SHLIB = $(OBJS_FRONTEND:%.o=%_shlib.o) OBJS_SRV = $(OBJS_COMMON:%.o=%_srv.o) +# where to find gen_keywordlist.pl and subsidiary files +TOOLSDIR = $(top_srcdir)/src/tools +GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl +GEN_KEYWORDLIST_DEPS = $(TOOLSDIR)/gen_keywordlist.pl $(TOOLSDIR)/PerfectHash.pm + all: libpgcommon.a libpgcommon_shlib.a libpgcommon_srv.a distprep: kwlist_d.h @@ -118,8 +123,8 @@ libpgcommon_srv.a: $(OBJS_SRV) $(CC) $(CFLAGS) $(subst -DFRONTEND,, $(CPPFLAGS)) -c $< -o $@ # generate SQL keyword lookup table to be included into keywords*.o. -kwlist_d.h: $(top_srcdir)/src/include/parser/kwlist.h $(top_srcdir)/src/tools/gen_keywordlist.pl - $(PERL) $(top_srcdir)/src/tools/gen_keywordlist.pl --extern $< +kwlist_d.h: $(top_srcdir)/src/include/parser/kwlist.h $(GEN_KEYWORDLIST_DEPS) + $(GEN_KEYWORDLIST) --extern $< # Dependencies of keywords*.o need to be managed explicitly to make sure # that you don't get broken parsing code, even in a non-enable-depend build. diff --git a/src/common/kwlookup.c b/src/common/kwlookup.c index d72842e759..6545480b5c 100644 --- a/src/common/kwlookup.c +++ b/src/common/kwlookup.c @@ -35,60 +35,51 @@ * receive a different case-normalization mapping. */ int -ScanKeywordLookup(const char *text, +ScanKeywordLookup(const char *str, const ScanKeywordList *keywords) { - int len, - i; - char word[NAMEDATALEN]; - const char *kw_string; - const uint16 *kw_offsets; - const uint16 *low; - const uint16 *high; - - len = strlen(text); - - if (len > keywords->max_kw_len) - return -1; /* too long to be any keyword */ - - /* We assume all keywords are shorter than NAMEDATALEN. */ - Assert(len < NAMEDATALEN); + size_t len; + int h; + const char *kw; /* - * Apply an ASCII-only downcasing. We must not use tolower() since it may - * produce the wrong translation in some locales (eg, Turkish). + * Reject immediately if too long to be any keyword. This saves useless + * hashing and downcasing work on long strings. */ - for (i = 0; i < len; i++) + len = strlen(str); + if (len > keywords->max_kw_len) + return -1; + + /* + * Compute the hash function. We assume it was generated to produce + * case-insensitive results. Since it's a perfect hash, we need only + * match to the specific keyword it identifies. + */ + h = keywords->hash(str, len); + + /* An out-of-range result implies no match */ + if (h < 0 || h >= keywords->num_keywords) + return -1; + + /* + * Compare character-by-character to see if we have a match, applying an + * ASCII-only downcasing to the input characters. We must not use + * tolower() since it may produce the wrong translation in some locales + * (eg, Turkish). + */ + kw = GetScanKeyword(h, keywords); + while (*str != '\0') { - char ch = text[i]; + char ch = *str++; if (ch >= 'A' && ch <= 'Z') ch += 'a' - 'A'; - word[i] = ch; + if (ch != *kw++) + return -1; } - word[len] = '\0'; + if (*kw != '\0') + return -1; - /* - * Now do a binary search using plain strcmp() comparison. - */ - kw_string = keywords->kw_string; - kw_offsets = keywords->kw_offsets; - low = kw_offsets; - high = kw_offsets + (keywords->num_keywords - 1); - while (low <= high) - { - const uint16 *middle; - int difference; - - middle = low + (high - low) / 2; - difference = strcmp(kw_string + *middle, word); - if (difference == 0) - return middle - kw_offsets; - else if (difference < 0) - low = middle + 1; - else - high = middle - 1; - } - - return -1; + /* Success! */ + return h; } diff --git a/src/include/common/kwlookup.h b/src/include/common/kwlookup.h index 39efb3503f..dbff36713d 100644 --- a/src/include/common/kwlookup.h +++ b/src/include/common/kwlookup.h @@ -14,6 +14,9 @@ #ifndef KWLOOKUP_H #define KWLOOKUP_H +/* Hash function used by ScanKeywordLookup */ +typedef int (*ScanKeywordHashFunc) (const void *key, size_t keylen); + /* * This struct contains the data needed by ScanKeywordLookup to perform a * search within a set of keywords. The contents are typically generated by @@ -23,6 +26,7 @@ typedef struct ScanKeywordList { const char *kw_string; /* all keywords in order, separated by \0 */ const uint16 *kw_offsets; /* offsets to the start of each keyword */ + ScanKeywordHashFunc hash; /* perfect hash function for keywords */ int num_keywords; /* number of keywords */ int max_kw_len; /* length of longest keyword */ } ScanKeywordList; diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h index b8902d3403..adeb834ce8 100644 --- a/src/include/parser/kwlist.h +++ b/src/include/parser/kwlist.h @@ -21,8 +21,7 @@ /* * List of keyword (name, token-value, category) entries. * - * !!WARNING!!: This list must be sorted by ASCII name, because binary - * search is used to locate entries. + * Note: gen_keywordlist.pl requires the entries to appear in ASCII order. */ /* name, value, category */ diff --git a/src/interfaces/ecpg/preproc/Makefile b/src/interfaces/ecpg/preproc/Makefile index b5b74a3b81..20e3b47874 100644 --- a/src/interfaces/ecpg/preproc/Makefile +++ b/src/interfaces/ecpg/preproc/Makefile @@ -28,7 +28,10 @@ OBJS= preproc.o pgc.o type.o ecpg.o output.o parser.o \ keywords.o c_keywords.o ecpg_keywords.o typename.o descriptor.o variable.o \ $(WIN32RES) -GEN_KEYWORDLIST = $(top_srcdir)/src/tools/gen_keywordlist.pl +# where to find gen_keywordlist.pl and subsidiary files +TOOLSDIR = $(top_srcdir)/src/tools +GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl +GEN_KEYWORDLIST_DEPS = $(TOOLSDIR)/gen_keywordlist.pl $(TOOLSDIR)/PerfectHash.pm # Suppress parallel build to avoid a bug in GNU make 3.82 # (see comments in ../Makefile) @@ -56,11 +59,11 @@ preproc.y: ../../../backend/parser/gram.y parse.pl ecpg.addons ecpg.header ecpg. $(PERL) $(srcdir)/check_rules.pl $(srcdir) $< # generate keyword headers -c_kwlist_d.h: c_kwlist.h $(GEN_KEYWORDLIST) - $(PERL) $(GEN_KEYWORDLIST) --varname ScanCKeywords $< +c_kwlist_d.h: c_kwlist.h $(GEN_KEYWORDLIST_DEPS) + $(GEN_KEYWORDLIST) --varname ScanCKeywords --no-case-fold $< -ecpg_kwlist_d.h: ecpg_kwlist.h $(GEN_KEYWORDLIST) - $(PERL) $(GEN_KEYWORDLIST) --varname ScanECPGKeywords $< +ecpg_kwlist_d.h: ecpg_kwlist.h $(GEN_KEYWORDLIST_DEPS) + $(GEN_KEYWORDLIST) --varname ScanECPGKeywords $< # Force these dependencies to be known even without dependency info built: ecpg_keywords.o c_keywords.o keywords.o preproc.o pgc.o parser.o: preproc.h diff --git a/src/interfaces/ecpg/preproc/c_keywords.c b/src/interfaces/ecpg/preproc/c_keywords.c index 38ddf6f135..80aa7d5339 100644 --- a/src/interfaces/ecpg/preproc/c_keywords.c +++ b/src/interfaces/ecpg/preproc/c_keywords.c @@ -9,8 +9,6 @@ */ #include "postgres_fe.h" -#include - #include "preproc_extern.h" #include "preproc.h" @@ -32,39 +30,38 @@ static const uint16 ScanCKeywordTokens[] = { * * Returns the token value of the keyword, or -1 if no match. * - * Do a binary search using plain strcmp() comparison. This is much like + * Do a hash search using plain strcmp() comparison. This is much like * ScanKeywordLookup(), except we want case-sensitive matching. */ int -ScanCKeywordLookup(const char *text) +ScanCKeywordLookup(const char *str) { - const char *kw_string; - const uint16 *kw_offsets; - const uint16 *low; - const uint16 *high; + size_t len; + int h; + const char *kw; - if (strlen(text) > ScanCKeywords.max_kw_len) - return -1; /* too long to be any keyword */ + /* + * Reject immediately if too long to be any keyword. This saves useless + * hashing work on long strings. + */ + len = strlen(str); + if (len > ScanCKeywords.max_kw_len) + return -1; - kw_string = ScanCKeywords.kw_string; - kw_offsets = ScanCKeywords.kw_offsets; - low = kw_offsets; - high = kw_offsets + (ScanCKeywords.num_keywords - 1); + /* + * Compute the hash function. Since it's a perfect hash, we need only + * match to the specific keyword it identifies. + */ + h = ScanCKeywords_hash_func(str, len); - while (low <= high) - { - const uint16 *middle; - int difference; + /* An out-of-range result implies no match */ + if (h < 0 || h >= ScanCKeywords.num_keywords) + return -1; - middle = low + (high - low) / 2; - difference = strcmp(kw_string + *middle, text); - if (difference == 0) - return ScanCKeywordTokens[middle - kw_offsets]; - else if (difference < 0) - low = middle + 1; - else - high = middle - 1; - } + kw = GetScanKeyword(h, &ScanCKeywords); + + if (strcmp(kw, str) == 0) + return ScanCKeywordTokens[h]; return -1; } diff --git a/src/interfaces/ecpg/preproc/c_kwlist.h b/src/interfaces/ecpg/preproc/c_kwlist.h index 4545505298..610a4b1e05 100644 --- a/src/interfaces/ecpg/preproc/c_kwlist.h +++ b/src/interfaces/ecpg/preproc/c_kwlist.h @@ -20,8 +20,7 @@ /* * List of (keyword-name, keyword-token-value) pairs. * - * !!WARNING!!: This list must be sorted by ASCII name, because binary - * search is used to locate entries. + * Note: gen_keywordlist.pl requires the entries to appear in ASCII order. */ /* name, value */ diff --git a/src/interfaces/ecpg/preproc/ecpg_kwlist.h b/src/interfaces/ecpg/preproc/ecpg_kwlist.h index 97ef254166..bdd9854925 100644 --- a/src/interfaces/ecpg/preproc/ecpg_kwlist.h +++ b/src/interfaces/ecpg/preproc/ecpg_kwlist.h @@ -20,8 +20,7 @@ /* * List of (keyword-name, keyword-token-value) pairs. * - * !!WARNING!!: This list must be sorted by ASCII name, because binary - * search is used to locate entries. + * Note: gen_keywordlist.pl requires the entries to appear in ASCII order. */ /* name, value */ diff --git a/src/pl/plpgsql/src/Makefile b/src/pl/plpgsql/src/Makefile index f5958d1267..cc1c2613d3 100644 --- a/src/pl/plpgsql/src/Makefile +++ b/src/pl/plpgsql/src/Makefile @@ -29,7 +29,10 @@ REGRESS_OPTS = --dbname=$(PL_TESTDB) REGRESS = plpgsql_call plpgsql_control plpgsql_domain plpgsql_record \ plpgsql_cache plpgsql_transaction plpgsql_trigger plpgsql_varprops -GEN_KEYWORDLIST = $(top_srcdir)/src/tools/gen_keywordlist.pl +# where to find gen_keywordlist.pl and subsidiary files +TOOLSDIR = $(top_srcdir)/src/tools +GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl +GEN_KEYWORDLIST_DEPS = $(TOOLSDIR)/gen_keywordlist.pl $(TOOLSDIR)/PerfectHash.pm all: all-lib @@ -76,11 +79,11 @@ plerrcodes.h: $(top_srcdir)/src/backend/utils/errcodes.txt generate-plerrcodes.p $(PERL) $(srcdir)/generate-plerrcodes.pl $< > $@ # generate keyword headers for the scanner -pl_reserved_kwlist_d.h: pl_reserved_kwlist.h $(GEN_KEYWORDLIST) - $(PERL) $(GEN_KEYWORDLIST) --varname ReservedPLKeywords $< +pl_reserved_kwlist_d.h: pl_reserved_kwlist.h $(GEN_KEYWORDLIST_DEPS) + $(GEN_KEYWORDLIST) --varname ReservedPLKeywords $< -pl_unreserved_kwlist_d.h: pl_unreserved_kwlist.h $(GEN_KEYWORDLIST) - $(PERL) $(GEN_KEYWORDLIST) --varname UnreservedPLKeywords $< +pl_unreserved_kwlist_d.h: pl_unreserved_kwlist.h $(GEN_KEYWORDLIST_DEPS) + $(GEN_KEYWORDLIST) --varname UnreservedPLKeywords $< check: submake diff --git a/src/pl/plpgsql/src/pl_reserved_kwlist.h b/src/pl/plpgsql/src/pl_reserved_kwlist.h index 5c2e0c1c4b..8425c3ca2e 100644 --- a/src/pl/plpgsql/src/pl_reserved_kwlist.h +++ b/src/pl/plpgsql/src/pl_reserved_kwlist.h @@ -20,10 +20,9 @@ /* * List of (keyword-name, keyword-token-value) pairs. * - * Be careful not to put the same word in both lists. + * Be careful not to put the same word into pl_unreserved_kwlist.h. * - * !!WARNING!!: This list must be sorted by ASCII name, because binary - * search is used to locate entries. + * Note: gen_keywordlist.pl requires the entries to appear in ASCII order. */ /* name, value */ diff --git a/src/pl/plpgsql/src/pl_unreserved_kwlist.h b/src/pl/plpgsql/src/pl_unreserved_kwlist.h index ef2aea05b8..ce4be81dd8 100644 --- a/src/pl/plpgsql/src/pl_unreserved_kwlist.h +++ b/src/pl/plpgsql/src/pl_unreserved_kwlist.h @@ -20,11 +20,10 @@ /* * List of (keyword-name, keyword-token-value) pairs. * - * Be careful not to put the same word in both lists. Also be sure that - * pl_gram.y's unreserved_keyword production agrees with this list. + * Be careful not to put the same word into pl_reserved_kwlist.h. Also be + * sure that pl_gram.y's unreserved_keyword production agrees with this list. * - * !!WARNING!!: This list must be sorted by ASCII name, because binary - * search is used to locate entries. + * Note: gen_keywordlist.pl requires the entries to appear in ASCII order. */ /* name, value */ diff --git a/src/tools/PerfectHash.pm b/src/tools/PerfectHash.pm new file mode 100644 index 0000000000..bd339e38a2 --- /dev/null +++ b/src/tools/PerfectHash.pm @@ -0,0 +1,376 @@ +#---------------------------------------------------------------------- +# +# PerfectHash.pm +# Perl module that constructs minimal perfect hash functions +# +# This code constructs a minimal perfect hash function for the given +# set of keys, using an algorithm described in +# "An optimal algorithm for generating minimal perfect hash functions" +# by Czech, Havas and Majewski in Information Processing Letters, +# 43(5):256-264, October 1992. +# This implementation is loosely based on NetBSD's "nbperf", +# which was written by Joerg Sonnenberger. +# +# The resulting hash function is perfect in the sense that if the presented +# key is one of the original set, it will return the key's index in the set +# (in range 0..N-1). However, the caller must still verify the match, +# as false positives are possible. Also, the hash function may return +# values that are out of range (negative or >= N), due to summing unrelated +# hashtable entries. This indicates that the presented key is definitely +# not in the set. +# +# +# Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group +# Portions Copyright (c) 1994, Regents of the University of California +# +# src/tools/PerfectHash.pm +# +#---------------------------------------------------------------------- + +package PerfectHash; + +use strict; +use warnings; + + +# At runtime, we'll compute two simple hash functions of the input key, +# and use them to index into a mapping table. The hash functions are just +# multiply-and-add in uint32 arithmetic, with different multipliers and +# initial seeds. All the complexity in this module is concerned with +# selecting hash parameters that will work and building the mapping table. + +# We support making case-insensitive hash functions, though this only +# works for a strict-ASCII interpretation of case insensitivity, +# ie, A-Z maps onto a-z and nothing else. +my $case_fold = 0; + + +# +# Construct a C function implementing a perfect hash for the given keys. +# The C function definition is returned as a string. +# +# The keys should be passed as an array reference. They can be any set +# of Perl strings; it is caller's responsibility that there not be any +# duplicates. (Note that the "strings" can be binary data, but hashing +# e.g. OIDs has endianness hazards that callers must overcome.) +# +# The name to use for the function is specified as the second argument. +# It will be a global function by default, but the caller may prepend +# "static " to the result string if it wants a static function. +# +# Additional options can be specified as keyword-style arguments: +# +# case_fold => bool +# If specified as true, the hash function is case-insensitive, for the +# limited idea of case-insensitivity explained above. +# +# fixed_key_length => N +# If specified, all keys are assumed to have length N bytes, and the +# hash function signature will be just "int f(const void *key)" +# rather than "int f(const void *key, size_t keylen)". +# +sub generate_hash_function +{ + my ($keys_ref, $funcname, %options) = @_; + + # It's not worth passing this around as a parameter; just use a global. + $case_fold = $options{case_fold} || 0; + + # Try different hash function parameters until we find a set that works + # for these keys. The multipliers are chosen to be primes that are cheap + # to calculate via shift-and-add, so don't change them without care. + # (Commonly, random seeds are tried, but we want reproducible results + # from this program so we don't do that.) + my $hash_mult1 = 31; + my $hash_mult2; + my $hash_seed1; + my $hash_seed2; + my @subresult; + FIND_PARAMS: + foreach (127, 257, 521, 1033, 2053) + { + $hash_mult2 = $_; # "foreach $hash_mult2" doesn't work + for ($hash_seed1 = 0; $hash_seed1 < 10; $hash_seed1++) + { + for ($hash_seed2 = 0; $hash_seed2 < 10; $hash_seed2++) + { + @subresult = _construct_hash_table( + $keys_ref, $hash_mult1, $hash_mult2, + $hash_seed1, $hash_seed2); + last FIND_PARAMS if @subresult; + } + } + } + + # Choke if we couldn't find a workable set of parameters. + die "failed to generate perfect hash" if !@subresult; + + # Extract info from _construct_hash_table's result array. + my $elemtype = $subresult[0]; + my @hashtab = @{ $subresult[1] }; + my $nhash = scalar(@hashtab); + + # OK, construct the hash function definition including the hash table. + my $f = ''; + $f .= sprintf "int\n"; + if (defined $options{fixed_key_length}) + { + $f .= sprintf "%s(const void *key)\n{\n", $funcname; + } + else + { + $f .= sprintf "%s(const void *key, size_t keylen)\n{\n", $funcname; + } + $f .= sprintf "\tstatic const %s h[%d] = {\n", $elemtype, $nhash; + for (my $i = 0; $i < $nhash; $i++) + { + $f .= sprintf "%s%6d,%s", + ($i % 8 == 0 ? "\t\t" : " "), + $hashtab[$i], + ($i % 8 == 7 ? "\n" : ""); + } + $f .= sprintf "\n" if ($nhash % 8 != 0); + $f .= sprintf "\t};\n\n"; + $f .= sprintf "\tconst unsigned char *k = key;\n"; + $f .= sprintf "\tsize_t\t\tkeylen = %d;\n", $options{fixed_key_length} + if (defined $options{fixed_key_length}); + $f .= sprintf "\tuint32\t\ta = %d;\n", $hash_seed1; + $f .= sprintf "\tuint32\t\tb = %d;\n\n", $hash_seed2; + $f .= sprintf "\twhile (keylen--)\n\t{\n"; + $f .= sprintf "\t\tunsigned char c = *k++"; + $f .= sprintf " | 0x20" if $case_fold; # see comment below + $f .= sprintf ";\n\n"; + $f .= sprintf "\t\ta = a * %d + c;\n", $hash_mult1; + $f .= sprintf "\t\tb = b * %d + c;\n", $hash_mult2; + $f .= sprintf "\t}\n"; + $f .= sprintf "\treturn h[a %% %d] + h[b %% %d];\n", $nhash, $nhash; + $f .= sprintf "}\n"; + + return $f; +} + + +# Calculate a hash function as the run-time code will do. +# +# If we are making a case-insensitive hash function, we implement that +# by OR'ing 0x20 into each byte of the key. This correctly transforms +# upper-case ASCII into lower-case ASCII, while not changing digits or +# dollar signs. (It does change '_', as well as other characters not +# likely to appear in keywords; this has little effect on the hash's +# ability to discriminate keywords.) +sub _calc_hash +{ + my ($key, $mult, $seed) = @_; + + my $result = $seed; + for my $c (split //, $key) + { + my $cn = ord($c); + $cn |= 0x20 if $case_fold; + $result = ($result * $mult + $cn) % 4294967296; + } + return $result; +} + + +# Attempt to construct a mapping table for a minimal perfect hash function +# for the given keys, using the specified hash parameters. +# +# Returns an array containing the mapping table element type name as the +# first element, and a ref to an array of the table values as the second. +# +# Returns an empty array on failure; then caller should choose different +# hash parameter(s) and try again. +sub _construct_hash_table +{ + my ($keys_ref, $hash_mult1, $hash_mult2, $hash_seed1, $hash_seed2) = @_; + my @keys = @{$keys_ref}; + + # This algorithm is based on a graph whose edges correspond to the + # keys and whose vertices correspond to entries of the mapping table. + # A key's edge links the two vertices whose indexes are the outputs of + # the two hash functions for that key. For K keys, the mapping + # table must have at least 2*K+1 entries, guaranteeing that there's at + # least one unused entry. (In principle, larger mapping tables make it + # easier to find a workable hash and increase the number of inputs that + # can be rejected due to touching unused hashtable entries. In practice, + # neither effect seems strong enough to justify using a larger table.) + my $nedges = scalar @keys; # number of edges + my $nverts = 2 * $nedges + 1; # number of vertices + + # However, it would be very bad if $nverts were exactly equal to either + # $hash_mult1 or $hash_mult2: effectively, that hash function would be + # sensitive to only the last byte of each key. Cases where $nverts is a + # multiple of either multiplier likewise lose information. (But $nverts + # can't actually divide them, if they've been intelligently chosen as + # primes.) We can avoid such problems by adjusting the table size. + while ($nverts % $hash_mult1 == 0 + || $nverts % $hash_mult2 == 0) + { + $nverts++; + } + + # Initialize the array of edges. + my @E = (); + foreach my $kw (@keys) + { + # Calculate hashes for this key. + # The hashes are immediately reduced modulo the mapping table size. + my $hash1 = _calc_hash($kw, $hash_mult1, $hash_seed1) % $nverts; + my $hash2 = _calc_hash($kw, $hash_mult2, $hash_seed2) % $nverts; + + # If the two hashes are the same for any key, we have to fail + # since this edge would itself form a cycle in the graph. + return () if $hash1 == $hash2; + + # Add the edge for this key. + push @E, { left => $hash1, right => $hash2 }; + } + + # Initialize the array of vertices, giving them all empty lists + # of associated edges. (The lists will be hashes of edge numbers.) + my @V = (); + for (my $v = 0; $v < $nverts; $v++) + { + push @V, { edges => {} }; + } + + # Insert each edge in the lists of edges connected to its vertices. + for (my $e = 0; $e < $nedges; $e++) + { + my $v = $E[$e]{left}; + $V[$v]{edges}->{$e} = 1; + + $v = $E[$e]{right}; + $V[$v]{edges}->{$e} = 1; + } + + # Now we attempt to prove the graph acyclic. + # A cycle-free graph is either empty or has some vertex of degree 1. + # Removing the edge attached to that vertex doesn't change this property, + # so doing that repeatedly will reduce the size of the graph. + # If the graph is empty at the end of the process, it was acyclic. + # We track the order of edge removal so that the next phase can process + # them in reverse order of removal. + my @output_order = (); + + # Consider each vertex as a possible starting point for edge-removal. + for (my $startv = 0; $startv < $nverts; $startv++) + { + my $v = $startv; + + # If vertex v is of degree 1 (i.e. exactly 1 edge connects to it), + # remove that edge, and then consider the edge's other vertex to see + # if it is now of degree 1. The inner loop repeats until reaching a + # vertex not of degree 1. + while (scalar(keys(%{ $V[$v]{edges} })) == 1) + { + # Unlink its only edge. + my $e = (keys(%{ $V[$v]{edges} }))[0]; + delete($V[$v]{edges}->{$e}); + + # Unlink the edge from its other vertex, too. + my $v2 = $E[$e]{left}; + $v2 = $E[$e]{right} if ($v2 == $v); + delete($V[$v2]{edges}->{$e}); + + # Push e onto the front of the output-order list. + unshift @output_order, $e; + + # Consider v2 on next iteration of inner loop. + $v = $v2; + } + } + + # We succeeded only if all edges were removed from the graph. + return () if (scalar(@output_order) != $nedges); + + # OK, build the hash table of size $nverts. + my @hashtab = (0) x $nverts; + # We need a "visited" flag array in this step, too. + my @visited = (0) x $nverts; + + # The goal is that for any key, the sum of the hash table entries for + # its first and second hash values is the desired output (i.e., the key + # number). By assigning hash table values in the selected edge order, + # we can guarantee that that's true. This works because the edge first + # removed from the graph (and hence last to be visited here) must have + # at least one vertex it shared with no other edge; hence it will have at + # least one vertex (hashtable entry) still unvisited when we reach it here, + # and we can assign that unvisited entry a value that makes the sum come + # out as we wish. By induction, the same holds for all the other edges. + foreach my $e (@output_order) + { + my $l = $E[$e]{left}; + my $r = $E[$e]{right}; + if (!$visited[$l]) + { + # $hashtab[$r] might be zero, or some previously assigned value. + $hashtab[$l] = $e - $hashtab[$r]; + } + else + { + die "oops, doubly used hashtab entry" if $visited[$r]; + # $hashtab[$l] might be zero, or some previously assigned value. + $hashtab[$r] = $e - $hashtab[$l]; + } + # Now freeze both of these hashtab entries. + $visited[$l] = 1; + $visited[$r] = 1; + } + + # Detect range of values needed in hash table. + my $hmin = $nedges; + my $hmax = 0; + for (my $v = 0; $v < $nverts; $v++) + { + $hmin = $hashtab[$v] if $hashtab[$v] < $hmin; + $hmax = $hashtab[$v] if $hashtab[$v] > $hmax; + } + + # Choose width of hashtable entries. In addition to the actual values, + # we need to be able to store a flag for unused entries, and we wish to + # have the property that adding any other entry value to the flag gives + # an out-of-range result (>= $nedges). + my $elemtype; + my $unused_flag; + + if ( $hmin >= -0x7F + && $hmax <= 0x7F + && $hmin + 0x7F >= $nedges) + { + # int8 will work + $elemtype = 'int8'; + $unused_flag = 0x7F; + } + elsif ($hmin >= -0x7FFF + && $hmax <= 0x7FFF + && $hmin + 0x7FFF >= $nedges) + { + # int16 will work + $elemtype = 'int16'; + $unused_flag = 0x7FFF; + } + elsif ($hmin >= -0x7FFFFFFF + && $hmax <= 0x7FFFFFFF + && $hmin + 0x3FFFFFFF >= $nedges) + { + # int32 will work + $elemtype = 'int32'; + $unused_flag = 0x3FFFFFFF; + } + else + { + die "hash table values too wide"; + } + + # Set any unvisited hashtable entries to $unused_flag. + for (my $v = 0; $v < $nverts; $v++) + { + $hashtab[$v] = $unused_flag if !$visited[$v]; + } + + return ($elemtype, \@hashtab); +} + +1; diff --git a/src/tools/gen_keywordlist.pl b/src/tools/gen_keywordlist.pl index d764affaa6..78ac1cd1c1 100644 --- a/src/tools/gen_keywordlist.pl +++ b/src/tools/gen_keywordlist.pl @@ -14,6 +14,12 @@ # variable named according to the -v switch ("ScanKeywords" by default). # The variable is marked "static" unless the -e switch is given. # +# ScanKeywordList uses hash-based lookup, so this script also selects +# a minimal perfect hash function for the keyword set, and emits a +# static hash function that is referenced in the ScanKeywordList struct. +# The hash function is case-insensitive unless --no-case-fold is specified. +# Note that case folding works correctly only for all-ASCII keywords! +# # # Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group # Portions Copyright (c) 1994, Regents of the University of California @@ -25,15 +31,18 @@ use strict; use warnings; use Getopt::Long; +use PerfectHash; my $output_path = ''; my $extern = 0; +my $case_fold = 1; my $varname = 'ScanKeywords'; GetOptions( - 'output:s' => \$output_path, - 'extern' => \$extern, - 'varname:s' => \$varname) || usage(); + 'output:s' => \$output_path, + 'extern' => \$extern, + 'case-fold!' => \$case_fold, + 'varname:s' => \$varname) || usage(); my $kw_input_file = shift @ARGV || die "No input file.\n"; @@ -87,7 +96,22 @@ while (<$kif>) } } +# When being case-insensitive, insist that the input be all-lower-case. +if ($case_fold) +{ + foreach my $kw (@keywords) + { + die qq|The keyword "$kw" is not lower-case in $kw_input_file\n| + if ($kw ne lc $kw); + } +} + # Error out if the keyword names are not in ASCII order. +# +# While this isn't really necessary with hash-based lookup, it's still +# helpful because it provides a cheap way to reject duplicate keywords. +# Also, insisting on sorted order ensures that code that scans the keyword +# table linearly will see the keywords in a canonical order. for my $i (0..$#keywords - 1) { die qq|The keyword "$keywords[$i + 1]" is out of order in $kw_input_file\n| @@ -128,15 +152,25 @@ print $kwdef "};\n\n"; printf $kwdef "#define %s_NUM_KEYWORDS %d\n\n", uc $varname, scalar @keywords; +# Emit the definition of the hash function. + +my $funcname = $varname . "_hash_func"; + +my $f = PerfectHash::generate_hash_function(\@keywords, $funcname, + case_fold => $case_fold); + +printf $kwdef qq|static %s\n|, $f; + # Emit the struct that wraps all this lookup info into one variable. -print $kwdef "static " if !$extern; +printf $kwdef "static " if !$extern; printf $kwdef "const ScanKeywordList %s = {\n", $varname; printf $kwdef qq|\t%s_kw_string,\n|, $varname; printf $kwdef qq|\t%s_kw_offsets,\n|, $varname; +printf $kwdef qq|\t%s,\n|, $funcname; printf $kwdef qq|\t%s_NUM_KEYWORDS,\n|, uc $varname; printf $kwdef qq|\t%d\n|, $max_len; -print $kwdef "};\n\n"; +printf $kwdef "};\n\n"; printf $kwdef "#endif\t\t\t\t\t\t\t/* %s_H */\n", uc $base_filename; @@ -144,10 +178,11 @@ printf $kwdef "#endif\t\t\t\t\t\t\t/* %s_H */\n", uc $base_filename; sub usage { die <] [--varname/-v ] [--extern/-e] input_file - --output Output directory (default '.') - --varname Name for ScanKeywordList variable (default 'ScanKeywords') - --extern Allow the ScanKeywordList variable to be globally visible +Usage: gen_keywordlist.pl [--output/-o ] [--varname/-v ] [--extern/-e] [--[no-]case-fold] input_file + --output Output directory (default '.') + --varname Name for ScanKeywordList variable (default 'ScanKeywords') + --extern Allow the ScanKeywordList variable to be globally visible + --no-case-fold Keyword matching is to be case-sensitive gen_keywordlist.pl transforms a list of keywords into a ScanKeywordList. The output filename is derived from the input file by inserting _d, diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm index 937bf184e2..e4bb9d2394 100644 --- a/src/tools/msvc/Solution.pm +++ b/src/tools/msvc/Solution.pm @@ -414,7 +414,7 @@ sub GenerateFiles 'src/include/parser/kwlist.h')) { print "Generating kwlist_d.h...\n"; - system('perl src/tools/gen_keywordlist.pl --extern -o src/common src/include/parser/kwlist.h'); + system('perl -I src/tools src/tools/gen_keywordlist.pl --extern -o src/common src/include/parser/kwlist.h'); } if (IsNewer( @@ -426,8 +426,8 @@ sub GenerateFiles { print "Generating pl_reserved_kwlist_d.h and pl_unreserved_kwlist_d.h...\n"; chdir('src/pl/plpgsql/src'); - system('perl ../../../tools/gen_keywordlist.pl --varname ReservedPLKeywords pl_reserved_kwlist.h'); - system('perl ../../../tools/gen_keywordlist.pl --varname UnreservedPLKeywords pl_unreserved_kwlist.h'); + system('perl -I ../../../tools ../../../tools/gen_keywordlist.pl --varname ReservedPLKeywords pl_reserved_kwlist.h'); + system('perl -I ../../../tools ../../../tools/gen_keywordlist.pl --varname UnreservedPLKeywords pl_unreserved_kwlist.h'); chdir('../../../..'); } @@ -440,8 +440,8 @@ sub GenerateFiles { print "Generating c_kwlist_d.h and ecpg_kwlist_d.h...\n"; chdir('src/interfaces/ecpg/preproc'); - system('perl ../../../tools/gen_keywordlist.pl --varname ScanCKeywords c_kwlist.h'); - system('perl ../../../tools/gen_keywordlist.pl --varname ScanECPGKeywords ecpg_kwlist.h'); + system('perl -I ../../../tools ../../../tools/gen_keywordlist.pl --varname ScanCKeywords --no-case-fold c_kwlist.h'); + system('perl -I ../../../tools ../../../tools/gen_keywordlist.pl --varname ScanECPGKeywords ecpg_kwlist.h'); chdir('../../../..'); }