Use perfect hashing, instead of binary search, for keyword lookup.

tglsfdc · tglsfdc · commit c64d0cd5ce24 · 2019-01-09T19:47:46.000-05:00
We've been speculating for a long time that hash-based keyword lookup ought to be faster than binary search, but up to now we hadn't found a suitable tool for generating the hash function. Joerg Sonnenberger provided the inspiration, and sample code, to show us that rolling our own generator wasn't a ridiculous idea. Hence, do that. The method used here requires a lookup table of approximately 4 bytes per keyword, but that's less than what we saved in the predecessor commit afb0d07, so it's not a big problem. The time savings are indeed significant: preliminary testing suggests that the total time for raw parsing (flex + bison phases) drops by ~20%. Patch by me, but it owes its existence to Joerg Sonnenberger; thanks also to John Naylor for review. Discussion: https://postgr.es/m/20190103163340.GA15803@britannica.bec.de
diff --git a/src/common/Makefile b/src/common/Makefile
@@ -63,6 +63,11 @@ OBJS_FRONTEND = $(OBJS_COMMON) fe_memutils.o file_utils.o restricted_token.o
 OBJS_SHLIB = $(OBJS_FRONTEND:%.o=%_shlib.o)
 OBJS_SRV = $(OBJS_COMMON:%.o=%_srv.o)
 
+# where to find gen_keywordlist.pl and subsidiary files
+TOOLSDIR = $(top_srcdir)/src/tools
+GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl
+GEN_KEYWORDLIST_DEPS = $(TOOLSDIR)/gen_keywordlist.pl $(TOOLSDIR)/PerfectHash.pm
+
 all: libpgcommon.a libpgcommon_shlib.a libpgcommon_srv.a
 
 distprep: kwlist_d.h
@@ -118,8 +123,8 @@ libpgcommon_srv.a: $(OBJS_SRV)
 	$(CC) $(CFLAGS) $(subst -DFRONTEND,, $(CPPFLAGS)) -c $< -o $@
 
 # generate SQL keyword lookup table to be included into keywords*.o.
-kwlist_d.h: $(top_srcdir)/src/include/parser/kwlist.h $(top_srcdir)/src/tools/gen_keywordlist.pl
-	$(PERL) $(top_srcdir)/src/tools/gen_keywordlist.pl --extern $<
+kwlist_d.h: $(top_srcdir)/src/include/parser/kwlist.h $(GEN_KEYWORDLIST_DEPS)
+	$(GEN_KEYWORDLIST) --extern $<
 
 # Dependencies of keywords*.o need to be managed explicitly to make sure
 # that you don't get broken parsing code, even in a non-enable-depend build.
diff --git a/src/common/kwlookup.c b/src/common/kwlookup.c
@@ -35,60 +35,51 @@
  * receive a different case-normalization mapping.
  */
 int
-ScanKeywordLookup(const char *text,
+ScanKeywordLookup(const char *str,
 				  const ScanKeywordList *keywords)
 {
-	int			len,
-				i;
-	char		word[NAMEDATALEN];
-	const char *kw_string;
-	const uint16 *kw_offsets;
-	const uint16 *low;
-	const uint16 *high;
-
-	len = strlen(text);
+	size_t		len;
+	int			h;
+	const char *kw;
 
+	/*
+	 * Reject immediately if too long to be any keyword.  This saves useless
+	 * hashing and downcasing work on long strings.
+	 */
+	len = strlen(str);
 	if (len > keywords->max_kw_len)
-		return -1;				/* too long to be any keyword */
-
-	/* We assume all keywords are shorter than NAMEDATALEN. */
-	Assert(len < NAMEDATALEN);
+		return -1;
 
 	/*
-	 * Apply an ASCII-only downcasing.  We must not use tolower() since it may
-	 * produce the wrong translation in some locales (eg, Turkish).
+	 * Compute the hash function.  We assume it was generated to produce
+	 * case-insensitive results.  Since it's a perfect hash, we need only
+	 * match to the specific keyword it identifies.
 	 */
-	for (i = 0; i < len; i++)
-	{
-		char		ch = text[i];
+	h = keywords->hash(str, len);
 
-		if (ch >= 'A' && ch <= 'Z')
-			ch += 'a' - 'A';
-		word[i] = ch;
-	}
-	word[len] = '\0';
+	/* An out-of-range result implies no match */
+	if (h < 0 || h >= keywords->num_keywords)
+		return -1;
 
 	/*
-	 * Now do a binary search using plain strcmp() comparison.
+	 * Compare character-by-character to see if we have a match, applying an
+	 * ASCII-only downcasing to the input characters.  We must not use
+	 * tolower() since it may produce the wrong translation in some locales
+	 * (eg, Turkish).
 	 */
-	kw_string = keywords->kw_string;
-	kw_offsets = keywords->kw_offsets;
-	low = kw_offsets;
-	high = kw_offsets + (keywords->num_keywords - 1);
-	while (low <= high)
+	kw = GetScanKeyword(h, keywords);
+	while (*str != '\0')
 	{
-		const uint16 *middle;
-		int			difference;
+		char		ch = *str++;
 
-		middle = low + (high - low) / 2;
-		difference = strcmp(kw_string + *middle, word);
-		if (difference == 0)
-			return middle - kw_offsets;
-		else if (difference < 0)
-			low = middle + 1;
-		else
-			high = middle - 1;
+		if (ch >= 'A' && ch <= 'Z')
+			ch += 'a' - 'A';
+		if (ch != *kw++)
+			return -1;
 	}
+	if (*kw != '\0')
+		return -1;
 
-	return -1;
+	/* Success! */
+	return h;
 }
diff --git a/src/include/common/kwlookup.h b/src/include/common/kwlookup.h
@@ -14,6 +14,9 @@
 #ifndef KWLOOKUP_H
 #define KWLOOKUP_H
 
+/* Hash function used by ScanKeywordLookup */
+typedef int (*ScanKeywordHashFunc) (const void *key, size_t keylen);
+
 /*
  * This struct contains the data needed by ScanKeywordLookup to perform a
  * search within a set of keywords.  The contents are typically generated by
@@ -23,6 +26,7 @@ typedef struct ScanKeywordList
 {
 	const char *kw_string;		/* all keywords in order, separated by \0 */
 	const uint16 *kw_offsets;	/* offsets to the start of each keyword */
+	ScanKeywordHashFunc hash;	/* perfect hash function for keywords */
 	int			num_keywords;	/* number of keywords */
 	int			max_kw_len;		/* length of longest keyword */
 } ScanKeywordList;
diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h
@@ -21,8 +21,7 @@
 /*
  * List of keyword (name, token-value, category) entries.
  *
- * !!WARNING!!: This list must be sorted by ASCII name, because binary
- *		 search is used to locate entries.
+ * Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
  */
 
 /* name, value, category */
diff --git a/src/interfaces/ecpg/preproc/Makefile b/src/interfaces/ecpg/preproc/Makefile
@@ -28,7 +28,10 @@ OBJS=	preproc.o pgc.o type.o ecpg.o output.o parser.o \
 	keywords.o c_keywords.o ecpg_keywords.o typename.o descriptor.o variable.o \
 	$(WIN32RES)
 
-GEN_KEYWORDLIST = $(top_srcdir)/src/tools/gen_keywordlist.pl
+# where to find gen_keywordlist.pl and subsidiary files
+TOOLSDIR = $(top_srcdir)/src/tools
+GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl
+GEN_KEYWORDLIST_DEPS = $(TOOLSDIR)/gen_keywordlist.pl $(TOOLSDIR)/PerfectHash.pm
 
 # Suppress parallel build to avoid a bug in GNU make 3.82
 # (see comments in ../Makefile)
@@ -56,11 +59,11 @@ preproc.y: ../../../backend/parser/gram.y parse.pl ecpg.addons ecpg.header ecpg.
 	$(PERL) $(srcdir)/check_rules.pl $(srcdir) $<
 
 # generate keyword headers
-c_kwlist_d.h: c_kwlist.h $(GEN_KEYWORDLIST)
-	$(PERL) $(GEN_KEYWORDLIST) --varname ScanCKeywords $<
+c_kwlist_d.h: c_kwlist.h $(GEN_KEYWORDLIST_DEPS)
+	$(GEN_KEYWORDLIST) --varname ScanCKeywords --no-case-fold $<
 
-ecpg_kwlist_d.h: ecpg_kwlist.h $(GEN_KEYWORDLIST)
-	$(PERL) $(GEN_KEYWORDLIST) --varname ScanECPGKeywords $<
+ecpg_kwlist_d.h: ecpg_kwlist.h $(GEN_KEYWORDLIST_DEPS)
+	$(GEN_KEYWORDLIST) --varname ScanECPGKeywords $<
 
 # Force these dependencies to be known even without dependency info built:
 ecpg_keywords.o c_keywords.o keywords.o preproc.o pgc.o parser.o: preproc.h
diff --git a/src/interfaces/ecpg/preproc/c_keywords.c b/src/interfaces/ecpg/preproc/c_keywords.c
@@ -9,8 +9,6 @@
  */
 #include "postgres_fe.h"
 
-#include <ctype.h>
-
 #include "preproc_extern.h"
 #include "preproc.h"
 
@@ -32,39 +30,38 @@ static const uint16 ScanCKeywordTokens[] = {
  *
  * Returns the token value of the keyword, or -1 if no match.
  *
- * Do a binary search using plain strcmp() comparison.  This is much like
+ * Do a hash search using plain strcmp() comparison.  This is much like
  * ScanKeywordLookup(), except we want case-sensitive matching.
  */
 int
-ScanCKeywordLookup(const char *text)
+ScanCKeywordLookup(const char *str)
 {
-	const char *kw_string;
-	const uint16 *kw_offsets;
-	const uint16 *low;
-	const uint16 *high;
+	size_t		len;
+	int			h;
+	const char *kw;
+
+	/*
+	 * Reject immediately if too long to be any keyword.  This saves useless
+	 * hashing work on long strings.
+	 */
+	len = strlen(str);
+	if (len > ScanCKeywords.max_kw_len)
+		return -1;
 
-	if (strlen(text) > ScanCKeywords.max_kw_len)
-		return -1;				/* too long to be any keyword */
+	/*
+	 * Compute the hash function.  Since it's a perfect hash, we need only
+	 * match to the specific keyword it identifies.
+	 */
+	h = ScanCKeywords_hash_func(str, len);
 
-	kw_string = ScanCKeywords.kw_string;
-	kw_offsets = ScanCKeywords.kw_offsets;
-	low = kw_offsets;
-	high = kw_offsets + (ScanCKeywords.num_keywords - 1);
+	/* An out-of-range result implies no match */
+	if (h < 0 || h >= ScanCKeywords.num_keywords)
+		return -1;
 
-	while (low <= high)
-	{
-		const uint16 *middle;
-		int			difference;
+	kw = GetScanKeyword(h, &ScanCKeywords);
 
-		middle = low + (high - low) / 2;
-		difference = strcmp(kw_string + *middle, text);
-		if (difference == 0)
-			return ScanCKeywordTokens[middle - kw_offsets];
-		else if (difference < 0)
-			low = middle + 1;
-		else
-			high = middle - 1;
-	}
+	if (strcmp(kw, str) == 0)
+		return ScanCKeywordTokens[h];
 
 	return -1;
 }
diff --git a/src/interfaces/ecpg/preproc/c_kwlist.h b/src/interfaces/ecpg/preproc/c_kwlist.h
@@ -20,8 +20,7 @@
 /*
  * List of (keyword-name, keyword-token-value) pairs.
  *
- * !!WARNING!!: This list must be sorted by ASCII name, because binary
- *		 search is used to locate entries.
+ * Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
  */
 
 /* name, value */
diff --git a/src/interfaces/ecpg/preproc/ecpg_kwlist.h b/src/interfaces/ecpg/preproc/ecpg_kwlist.h
@@ -20,8 +20,7 @@
 /*
  * List of (keyword-name, keyword-token-value) pairs.
  *
- * !!WARNING!!: This list must be sorted by ASCII name, because binary
- *		 search is used to locate entries.
+ * Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
  */
 
 /* name, value */
diff --git a/src/pl/plpgsql/src/Makefile b/src/pl/plpgsql/src/Makefile
@@ -29,7 +29,10 @@ REGRESS_OPTS = --dbname=$(PL_TESTDB)
 REGRESS = plpgsql_call plpgsql_control plpgsql_domain plpgsql_record \
 	plpgsql_cache plpgsql_transaction plpgsql_trigger plpgsql_varprops
 
-GEN_KEYWORDLIST = $(top_srcdir)/src/tools/gen_keywordlist.pl
+# where to find gen_keywordlist.pl and subsidiary files
+TOOLSDIR = $(top_srcdir)/src/tools
+GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl
+GEN_KEYWORDLIST_DEPS = $(TOOLSDIR)/gen_keywordlist.pl $(TOOLSDIR)/PerfectHash.pm
 
 all: all-lib
 
@@ -76,11 +79,11 @@ plerrcodes.h: $(top_srcdir)/src/backend/utils/errcodes.txt generate-plerrcodes.p
 	$(PERL) $(srcdir)/generate-plerrcodes.pl $< > $@
 
 # generate keyword headers for the scanner
-pl_reserved_kwlist_d.h: pl_reserved_kwlist.h $(GEN_KEYWORDLIST)
-	$(PERL) $(GEN_KEYWORDLIST) --varname ReservedPLKeywords $<
+pl_reserved_kwlist_d.h: pl_reserved_kwlist.h $(GEN_KEYWORDLIST_DEPS)
+	$(GEN_KEYWORDLIST) --varname ReservedPLKeywords $<
 
-pl_unreserved_kwlist_d.h: pl_unreserved_kwlist.h $(GEN_KEYWORDLIST)
-	$(PERL) $(GEN_KEYWORDLIST) --varname UnreservedPLKeywords $<
+pl_unreserved_kwlist_d.h: pl_unreserved_kwlist.h $(GEN_KEYWORDLIST_DEPS)
+	$(GEN_KEYWORDLIST) --varname UnreservedPLKeywords $<
 
 
 check: submake
diff --git a/src/pl/plpgsql/src/pl_reserved_kwlist.h b/src/pl/plpgsql/src/pl_reserved_kwlist.h
@@ -20,10 +20,9 @@
 /*
  * List of (keyword-name, keyword-token-value) pairs.
  *
- * Be careful not to put the same word in both lists.
+ * Be careful not to put the same word into pl_unreserved_kwlist.h.
  *
- * !!WARNING!!: This list must be sorted by ASCII name, because binary
- *		 search is used to locate entries.
+ * Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
  */
 
 /* name, value */
diff --git a/src/pl/plpgsql/src/pl_unreserved_kwlist.h b/src/pl/plpgsql/src/pl_unreserved_kwlist.h
@@ -20,11 +20,10 @@
 /*
  * List of (keyword-name, keyword-token-value) pairs.
  *
- * Be careful not to put the same word in both lists.  Also be sure that
- * pl_gram.y's unreserved_keyword production agrees with this list.
+ * Be careful not to put the same word into pl_reserved_kwlist.h.  Also be
+ * sure that pl_gram.y's unreserved_keyword production agrees with this list.
  *
- * !!WARNING!!: This list must be sorted by ASCII name, because binary
- *		 search is used to locate entries.
+ * Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
  */
 
 /* name, value */
diff --git a/src/tools/PerfectHash.pm b/src/tools/PerfectHash.pm
diff --git a/src/tools/gen_keywordlist.pl b/src/tools/gen_keywordlist.pl
diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm

@@ -21,8 +21,7 @@
 /*
  * List of keyword (name, token-value, category) entries.
  *
- * !!WARNING!!: This list must be sorted by ASCII name, because binary
- *		 search is used to locate entries.
+ * Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
  */
 
 /* name, value, category */
@@ -20,8 +20,7 @@
 /*
  * List of (keyword-name, keyword-token-value) pairs.
  *
- * !!WARNING!!: This list must be sorted by ASCII name, because binary
- *		 search is used to locate entries.
+ * Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
  */
 
 /* name, value */
@@ -20,10 +20,9 @@
 /*
  * List of (keyword-name, keyword-token-value) pairs.
  *
- * Be careful not to put the same word in both lists.
+ * Be careful not to put the same word into pl_unreserved_kwlist.h.
  *
- * !!WARNING!!: This list must be sorted by ASCII name, because binary
- *		 search is used to locate entries.
+ * Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
  */
 
 /* name, value */