On Wed, Dec 20, 2023 at 1:48 PM John Naylor <johncnaylo...@gmail.com> wrote:
>
> On Wed, Dec 20, 2023 at 3:23 AM Jeff Davis <pg...@j-davis.com> wrote:
> >
> > The reason I looked here is that the inner while statement (to find the
> > chunk size) looked out of place and possibly slow, and there's a
> > bitwise trick we can use instead.
>
> There are other bit tricks we can use. In v11-0005 Just for fun, I
> translated a couple more into C from
>
> https://github.com/openbsd/src/blob/master/lib/libc/arch/amd64/string/strlen.S

I wanted to see if this gets us anything, so I ran a couple of microbenchmarks.

0001-0003 are same as earlier
0004 takes Jeff's idea and adds in an optimization from NetBSD's
strlen (I said OpenBSD earlier, but it goes back further). I added
stub code to simulate big-endian when requested at compile time, but a
later patch removes it. Since it benched well, I made the extra effort
to generalize it for other callers. After adding to the hash state, it
returns the length so the caller can pass it to the finalizer.
0005 is the benchmark (not for commit) -- I took the parser keyword
list and added enough padding to make every string aligned when the
whole thing is copied to an alloc'd area.

Each of the bench_*.sql files named below are just running the
similarly-named function, all with the same argument, e.g. "select *
from bench_pgstat_hash_fh(100000);", so not attached.

Strings:

-- strlen + hash_bytes
pgbench -n -T 20 -f bench_hash_bytes.sql -M prepared | grep latency
latency average = 1036.732 ms

-- word-at-a-time hashing, with bytewise lookahead
pgbench -n -T 20 -f bench_cstr_unaligned.sql -M prepared | grep latency
latency average = 664.632 ms

-- word-at-a-time for both hashing and lookahead (Jeff's aligned
coding plus a technique from NetBSD strlen)
pgbench -n -T 20 -f bench_cstr_aligned.sql -M prepared | grep latency
latency average = 436.701 ms

So, the fully optimized aligned case is worth it if it's convenient.

0006 adds a byteswap for big-endian so we can reuse little endian
coding for the lookahead.

0007 - I also wanted to put numbers to 0003 (pgstat hash). While the
motivation for that was cleanup, I had a hunch it would shave cycles
and take up less binary space. It does on both accounts:

-- 3x murmur + hash_combine
pgbench -n -T 20 -f bench_pgstat_orig.sql -M prepared | grep latency
latency average = 333.540 ms

-- fasthash32 (simple call, no state setup and final needed for a single value)
pgbench -n -T 20 -f bench_pgstat_fh.sql -M prepared | grep latency
latency average = 277.591 ms

0008 - We can optimize the tail load when it's 4 bytes -- to save
loads, shifts, and OR's. My compiler can't figure this out for the
pgstat hash, with its fixed 4-byte tail. It's pretty simple and should
help other cases:

pgbench -n -T 20 -f bench_pgstat_fh.sql -M prepared | grep latency
latency average = 226.113 ms
From 778e3bdfc761dace149cd6c136e4c2847f793c61 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Sun, 24 Dec 2023 09:46:44 +0700
Subject: [PATCH v11 5/8] Add benchmark for hashing C strings

---
 contrib/bench_hash/Makefile            |  23 +
 contrib/bench_hash/aligned_keywords.h  | 991 +++++++++++++++++++++++++
 contrib/bench_hash/bench_hash--1.0.sql |  21 +
 contrib/bench_hash/bench_hash.c        | 103 +++
 contrib/bench_hash/bench_hash.control  |   5 +
 contrib/bench_hash/meson.build         |  19 +
 contrib/meson.build                    |   1 +
 7 files changed, 1163 insertions(+)
 create mode 100644 contrib/bench_hash/Makefile
 create mode 100644 contrib/bench_hash/aligned_keywords.h
 create mode 100644 contrib/bench_hash/bench_hash--1.0.sql
 create mode 100644 contrib/bench_hash/bench_hash.c
 create mode 100644 contrib/bench_hash/bench_hash.control
 create mode 100644 contrib/bench_hash/meson.build

diff --git a/contrib/bench_hash/Makefile b/contrib/bench_hash/Makefile
new file mode 100644
index 0000000000..5327080376
--- /dev/null
+++ b/contrib/bench_hash/Makefile
@@ -0,0 +1,23 @@
+# contrib/bench_hash/Makefile
+
+MODULE_big = bench_hash
+OBJS = \
+	$(WIN32RES) \
+	bench_hash.o
+PGFILEDESC = "bench_hash - benchmark some hash functions"
+
+EXTENSION = bench_hash
+DATA = bench_hash--1.0.sql
+
+# no regression tests; this module is benchmark-only
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/bench_hash
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/bench_hash/aligned_keywords.h b/contrib/bench_hash/aligned_keywords.h
new file mode 100644
index 0000000000..c2bd67c856
--- /dev/null
+++ b/contrib/bench_hash/aligned_keywords.h
@@ -0,0 +1,991 @@
+/* created by copying from kwlist_d.h with this patch:
+
+--- a/src/tools/gen_keywordlist.pl
++++ b/src/tools/gen_keywordlist.pl
+@@ -97,7 +97,9 @@ while (<$kif>)
+ {
+        if (/^PG_KEYWORD\("(\w+)"/)
+        {
+-               push @keywords, $1;
++               my $len = length($1) + 1;
++               my $aligned = $1 . "\\0" . "_" x ( ($len % 8) == 0 ? 0 : (8-($len % 8)) );
++               push @keywords, $aligned;
+        }
+ }
+ 
+@@ -127,7 +129,7 @@ for my $i (0 .. $#keywords - 1)
+ # Emit the string containing all the keywords.
+ 
+ printf $kwdef qq|static const char %s_kw_string[] =\n\t"|, $varname;
+-print $kwdef join qq|\\0"\n\t"|, @keywords;
++print $kwdef join qq|"\n\t"|, @keywords;
+ print $kwdef qq|";\n\n|;
+ 
+ # Emit an array of numerical offsets which will be used to index into the
+@@ -145,7 +147,7 @@ foreach my $name (@keywords)
+ 
+        # Calculate the cumulative offset of the next keyword,
+        # taking into account the null terminator.
+-       $offset += $this_length + 1;
++       $offset += $this_length -1;
+ 
+        # Update max keyword length.
+        $max_len = $this_length if $max_len < $this_length;
+
+*/
+
+
+static const char aligned_words[] =
+	"abort\0__"
+	"absent\0_"
+	"absolute\0_______"
+	"access\0_"
+	"action\0_"
+	"add\0____"
+	"admin\0__"
+	"after\0__"
+	"aggregate\0______"
+	"all\0____"
+	"also\0___"
+	"alter\0__"
+	"always\0_"
+	"analyse\0"
+	"analyze\0"
+	"and\0____"
+	"any\0____"
+	"array\0__"
+	"as\0_____"
+	"asc\0____"
+	"asensitive\0_____"
+	"assertion\0______"
+	"assignment\0_____"
+	"asymmetric\0_____"
+	"at\0_____"
+	"atomic\0_"
+	"attach\0_"
+	"attribute\0______"
+	"authorization\0__"
+	"backward\0_______"
+	"before\0_"
+	"begin\0__"
+	"between\0"
+	"bigint\0_"
+	"binary\0_"
+	"bit\0____"
+	"boolean\0"
+	"both\0___"
+	"breadth\0"
+	"by\0_____"
+	"cache\0__"
+	"call\0___"
+	"called\0_"
+	"cascade\0"
+	"cascaded\0_______"
+	"case\0___"
+	"cast\0___"
+	"catalog\0"
+	"chain\0__"
+	"char\0___"
+	"character\0______"
+	"characteristics\0"
+	"check\0__"
+	"checkpoint\0_____"
+	"class\0__"
+	"close\0__"
+	"cluster\0"
+	"coalesce\0_______"
+	"collate\0"
+	"collation\0______"
+	"column\0_"
+	"columns\0"
+	"comment\0"
+	"comments\0_______"
+	"commit\0_"
+	"committed\0______"
+	"compression\0____"
+	"concurrently\0___"
+	"configuration\0__"
+	"conflict\0_______"
+	"connection\0_____"
+	"constraint\0_____"
+	"constraints\0____"
+	"content\0"
+	"continue\0_______"
+	"conversion\0_____"
+	"copy\0___"
+	"cost\0___"
+	"create\0_"
+	"cross\0__"
+	"csv\0____"
+	"cube\0___"
+	"current\0"
+	"current_catalog\0"
+	"current_date\0___"
+	"current_role\0___"
+	"current_schema\0_"
+	"current_time\0___"
+	"current_timestamp\0______"
+	"current_user\0___"
+	"cursor\0_"
+	"cycle\0__"
+	"data\0___"
+	"database\0_______"
+	"day\0____"
+	"deallocate\0_____"
+	"dec\0____"
+	"decimal\0"
+	"declare\0"
+	"default\0"
+	"defaults\0_______"
+	"deferrable\0_____"
+	"deferred\0_______"
+	"definer\0"
+	"delete\0_"
+	"delimiter\0______"
+	"delimiters\0_____"
+	"depends\0"
+	"depth\0__"
+	"desc\0___"
+	"detach\0_"
+	"dictionary\0_____"
+	"disable\0"
+	"discard\0"
+	"distinct\0_______"
+	"do\0_____"
+	"document\0_______"
+	"domain\0_"
+	"double\0_"
+	"drop\0___"
+	"each\0___"
+	"else\0___"
+	"enable\0_"
+	"encoding\0_______"
+	"encrypted\0______"
+	"end\0____"
+	"enum\0___"
+	"escape\0_"
+	"event\0__"
+	"except\0_"
+	"exclude\0"
+	"excluding\0______"
+	"exclusive\0______"
+	"execute\0"
+	"exists\0_"
+	"explain\0"
+	"expression\0_____"
+	"extension\0______"
+	"external\0_______"
+	"extract\0"
+	"false\0__"
+	"family\0_"
+	"fetch\0__"
+	"filter\0_"
+	"finalize\0_______"
+	"first\0__"
+	"float\0__"
+	"following\0______"
+	"for\0____"
+	"force\0__"
+	"foreign\0"
+	"format\0_"
+	"forward\0"
+	"freeze\0_"
+	"from\0___"
+	"full\0___"
+	"function\0_______"
+	"functions\0______"
+	"generated\0______"
+	"global\0_"
+	"grant\0__"
+	"granted\0"
+	"greatest\0_______"
+	"group\0__"
+	"grouping\0_______"
+	"groups\0_"
+	"handler\0"
+	"having\0_"
+	"header\0_"
+	"hold\0___"
+	"hour\0___"
+	"identity\0_______"
+	"if\0_____"
+	"ilike\0__"
+	"immediate\0______"
+	"immutable\0______"
+	"implicit\0_______"
+	"import\0_"
+	"in\0_____"
+	"include\0"
+	"including\0______"
+	"increment\0______"
+	"indent\0_"
+	"index\0__"
+	"indexes\0"
+	"inherit\0"
+	"inherits\0_______"
+	"initially\0______"
+	"inline\0_"
+	"inner\0__"
+	"inout\0__"
+	"input\0__"
+	"insensitive\0____"
+	"insert\0_"
+	"instead\0"
+	"int\0____"
+	"integer\0"
+	"intersect\0______"
+	"interval\0_______"
+	"into\0___"
+	"invoker\0"
+	"is\0_____"
+	"isnull\0_"
+	"isolation\0______"
+	"join\0___"
+	"json\0___"
+	"json_array\0_____"
+	"json_arrayagg\0__"
+	"json_object\0____"
+	"json_objectagg\0_"
+	"json_scalar\0____"
+	"json_serialize\0_"
+	"key\0____"
+	"keys\0___"
+	"label\0__"
+	"language\0_______"
+	"large\0__"
+	"last\0___"
+	"lateral\0"
+	"leading\0"
+	"leakproof\0______"
+	"least\0__"
+	"left\0___"
+	"level\0__"
+	"like\0___"
+	"limit\0__"
+	"listen\0_"
+	"load\0___"
+	"local\0__"
+	"localtime\0______"
+	"localtimestamp\0_"
+	"location\0_______"
+	"lock\0___"
+	"locked\0_"
+	"logged\0_"
+	"mapping\0"
+	"match\0__"
+	"matched\0"
+	"materialized\0___"
+	"maxvalue\0_______"
+	"merge\0__"
+	"method\0_"
+	"minute\0_"
+	"minvalue\0_______"
+	"mode\0___"
+	"month\0__"
+	"move\0___"
+	"name\0___"
+	"names\0__"
+	"national\0_______"
+	"natural\0"
+	"nchar\0__"
+	"new\0____"
+	"next\0___"
+	"nfc\0____"
+	"nfd\0____"
+	"nfkc\0___"
+	"nfkd\0___"
+	"no\0_____"
+	"none\0___"
+	"normalize\0______"
+	"normalized\0_____"
+	"not\0____"
+	"nothing\0"
+	"notify\0_"
+	"notnull\0"
+	"nowait\0_"
+	"null\0___"
+	"nullif\0_"
+	"nulls\0__"
+	"numeric\0"
+	"object\0_"
+	"of\0_____"
+	"off\0____"
+	"offset\0_"
+	"oids\0___"
+	"old\0____"
+	"on\0_____"
+	"only\0___"
+	"operator\0_______"
+	"option\0_"
+	"options\0"
+	"or\0_____"
+	"order\0__"
+	"ordinality\0_____"
+	"others\0_"
+	"out\0____"
+	"outer\0__"
+	"over\0___"
+	"overlaps\0_______"
+	"overlay\0"
+	"overriding\0_____"
+	"owned\0__"
+	"owner\0__"
+	"parallel\0_______"
+	"parameter\0______"
+	"parser\0_"
+	"partial\0"
+	"partition\0______"
+	"passing\0"
+	"password\0_______"
+	"placing\0"
+	"plans\0__"
+	"policy\0_"
+	"position\0_______"
+	"preceding\0______"
+	"precision\0______"
+	"prepare\0"
+	"prepared\0_______"
+	"preserve\0_______"
+	"primary\0"
+	"prior\0__"
+	"privileges\0_____"
+	"procedural\0_____"
+	"procedure\0______"
+	"procedures\0_____"
+	"program\0"
+	"publication\0____"
+	"quote\0__"
+	"range\0__"
+	"read\0___"
+	"real\0___"
+	"reassign\0_______"
+	"recheck\0"
+	"recursive\0______"
+	"ref\0____"
+	"references\0_____"
+	"referencing\0____"
+	"refresh\0"
+	"reindex\0"
+	"relative\0_______"
+	"release\0"
+	"rename\0_"
+	"repeatable\0_____"
+	"replace\0"
+	"replica\0"
+	"reset\0__"
+	"restart\0"
+	"restrict\0_______"
+	"return\0_"
+	"returning\0______"
+	"returns\0"
+	"revoke\0_"
+	"right\0__"
+	"role\0___"
+	"rollback\0_______"
+	"rollup\0_"
+	"routine\0"
+	"routines\0_______"
+	"row\0____"
+	"rows\0___"
+	"rule\0___"
+	"savepoint\0______"
+	"scalar\0_"
+	"schema\0_"
+	"schemas\0"
+	"scroll\0_"
+	"search\0_"
+	"second\0_"
+	"security\0_______"
+	"select\0_"
+	"sequence\0_______"
+	"sequences\0______"
+	"serializable\0___"
+	"server\0_"
+	"session\0"
+	"session_user\0___"
+	"set\0____"
+	"setof\0__"
+	"sets\0___"
+	"share\0__"
+	"show\0___"
+	"similar\0"
+	"simple\0_"
+	"skip\0___"
+	"smallint\0_______"
+	"snapshot\0_______"
+	"some\0___"
+	"sql\0____"
+	"stable\0_"
+	"standalone\0_____"
+	"start\0__"
+	"statement\0______"
+	"statistics\0_____"
+	"stdin\0__"
+	"stdout\0_"
+	"storage\0"
+	"stored\0_"
+	"strict\0_"
+	"strip\0__"
+	"subscription\0___"
+	"substring\0______"
+	"support\0"
+	"symmetric\0______"
+	"sysid\0__"
+	"system\0_"
+	"system_user\0____"
+	"table\0__"
+	"tables\0_"
+	"tablesample\0____"
+	"tablespace\0_____"
+	"temp\0___"
+	"template\0_______"
+	"temporary\0______"
+	"text\0___"
+	"then\0___"
+	"ties\0___"
+	"time\0___"
+	"timestamp\0______"
+	"to\0_____"
+	"trailing\0_______"
+	"transaction\0____"
+	"transform\0______"
+	"treat\0__"
+	"trigger\0"
+	"trim\0___"
+	"true\0___"
+	"truncate\0_______"
+	"trusted\0"
+	"type\0___"
+	"types\0__"
+	"uescape\0"
+	"unbounded\0______"
+	"uncommitted\0____"
+	"unencrypted\0____"
+	"union\0__"
+	"unique\0_"
+	"unknown\0"
+	"unlisten\0_______"
+	"unlogged\0_______"
+	"until\0__"
+	"update\0_"
+	"user\0___"
+	"using\0__"
+	"vacuum\0_"
+	"valid\0__"
+	"validate\0_______"
+	"validator\0______"
+	"value\0__"
+	"values\0_"
+	"varchar\0"
+	"variadic\0_______"
+	"varying\0"
+	"verbose\0"
+	"version\0"
+	"view\0___"
+	"views\0__"
+	"volatile\0_______"
+	"when\0___"
+	"where\0__"
+	"whitespace\0_____"
+	"window\0_"
+	"with\0___"
+	"within\0_"
+	"without\0"
+	"work\0___"
+	"wrapper\0"
+	"write\0__"
+	"xml\0____"
+	"xmlattributes\0__"
+	"xmlconcat\0______"
+	"xmlelement\0_____"
+	"xmlexists\0______"
+	"xmlforest\0______"
+	"xmlnamespaces\0__"
+	"xmlparse\0_______"
+	"xmlpi\0__"
+	"xmlroot\0"
+	"xmlserialize\0___"
+	"xmltable\0_______"
+	"year\0___"
+	"yes\0____"
+	"zone\0___";
+
+static const uint16 word_offsets[] = {
+	0,
+	8,
+	16,
+	32,
+	40,
+	48,
+	56,
+	64,
+	72,
+	88,
+	96,
+	104,
+	112,
+	120,
+	128,
+	136,
+	144,
+	152,
+	160,
+	168,
+	176,
+	192,
+	208,
+	224,
+	240,
+	248,
+	256,
+	264,
+	280,
+	296,
+	312,
+	320,
+	328,
+	336,
+	344,
+	352,
+	360,
+	368,
+	376,
+	384,
+	392,
+	400,
+	408,
+	416,
+	424,
+	440,
+	448,
+	456,
+	464,
+	472,
+	480,
+	496,
+	512,
+	520,
+	536,
+	544,
+	552,
+	560,
+	576,
+	584,
+	600,
+	608,
+	616,
+	624,
+	640,
+	648,
+	664,
+	680,
+	696,
+	712,
+	728,
+	744,
+	760,
+	776,
+	784,
+	800,
+	816,
+	824,
+	832,
+	840,
+	848,
+	856,
+	864,
+	872,
+	888,
+	904,
+	920,
+	936,
+	952,
+	976,
+	992,
+	1000,
+	1008,
+	1016,
+	1032,
+	1040,
+	1056,
+	1064,
+	1072,
+	1080,
+	1088,
+	1104,
+	1120,
+	1136,
+	1144,
+	1152,
+	1168,
+	1184,
+	1192,
+	1200,
+	1208,
+	1216,
+	1232,
+	1240,
+	1248,
+	1264,
+	1272,
+	1288,
+	1296,
+	1304,
+	1312,
+	1320,
+	1328,
+	1336,
+	1352,
+	1368,
+	1376,
+	1384,
+	1392,
+	1400,
+	1408,
+	1416,
+	1432,
+	1448,
+	1456,
+	1464,
+	1472,
+	1488,
+	1504,
+	1520,
+	1528,
+	1536,
+	1544,
+	1552,
+	1560,
+	1576,
+	1584,
+	1592,
+	1608,
+	1616,
+	1624,
+	1632,
+	1640,
+	1648,
+	1656,
+	1664,
+	1672,
+	1688,
+	1704,
+	1720,
+	1728,
+	1736,
+	1744,
+	1760,
+	1768,
+	1784,
+	1792,
+	1800,
+	1808,
+	1816,
+	1824,
+	1832,
+	1848,
+	1856,
+	1864,
+	1880,
+	1896,
+	1912,
+	1920,
+	1928,
+	1936,
+	1952,
+	1968,
+	1976,
+	1984,
+	1992,
+	2000,
+	2016,
+	2032,
+	2040,
+	2048,
+	2056,
+	2064,
+	2080,
+	2088,
+	2096,
+	2104,
+	2112,
+	2128,
+	2144,
+	2152,
+	2160,
+	2168,
+	2176,
+	2192,
+	2200,
+	2208,
+	2224,
+	2240,
+	2256,
+	2272,
+	2288,
+	2304,
+	2312,
+	2320,
+	2328,
+	2344,
+	2352,
+	2360,
+	2368,
+	2376,
+	2392,
+	2400,
+	2408,
+	2416,
+	2424,
+	2432,
+	2440,
+	2448,
+	2456,
+	2472,
+	2488,
+	2504,
+	2512,
+	2520,
+	2528,
+	2536,
+	2544,
+	2552,
+	2568,
+	2584,
+	2592,
+	2600,
+	2608,
+	2624,
+	2632,
+	2640,
+	2648,
+	2656,
+	2664,
+	2680,
+	2688,
+	2696,
+	2704,
+	2712,
+	2720,
+	2728,
+	2736,
+	2744,
+	2752,
+	2760,
+	2776,
+	2792,
+	2800,
+	2808,
+	2816,
+	2824,
+	2832,
+	2840,
+	2848,
+	2856,
+	2864,
+	2872,
+	2880,
+	2888,
+	2896,
+	2904,
+	2912,
+	2920,
+	2928,
+	2944,
+	2952,
+	2960,
+	2968,
+	2976,
+	2992,
+	3000,
+	3008,
+	3016,
+	3024,
+	3040,
+	3048,
+	3064,
+	3072,
+	3080,
+	3096,
+	3112,
+	3120,
+	3128,
+	3144,
+	3152,
+	3168,
+	3176,
+	3184,
+	3192,
+	3208,
+	3224,
+	3240,
+	3248,
+	3264,
+	3280,
+	3288,
+	3296,
+	3312,
+	3328,
+	3344,
+	3360,
+	3368,
+	3384,
+	3392,
+	3400,
+	3408,
+	3416,
+	3432,
+	3440,
+	3456,
+	3464,
+	3480,
+	3496,
+	3504,
+	3512,
+	3528,
+	3536,
+	3544,
+	3560,
+	3568,
+	3576,
+	3584,
+	3592,
+	3608,
+	3616,
+	3632,
+	3640,
+	3648,
+	3656,
+	3664,
+	3680,
+	3688,
+	3696,
+	3712,
+	3720,
+	3728,
+	3736,
+	3752,
+	3760,
+	3768,
+	3776,
+	3784,
+	3792,
+	3800,
+	3816,
+	3824,
+	3840,
+	3856,
+	3872,
+	3880,
+	3888,
+	3904,
+	3912,
+	3920,
+	3928,
+	3936,
+	3944,
+	3952,
+	3960,
+	3968,
+	3984,
+	4000,
+	4008,
+	4016,
+	4024,
+	4040,
+	4048,
+	4064,
+	4080,
+	4088,
+	4096,
+	4104,
+	4112,
+	4120,
+	4128,
+	4144,
+	4160,
+	4168,
+	4184,
+	4192,
+	4200,
+	4216,
+	4224,
+	4232,
+	4248,
+	4264,
+	4272,
+	4288,
+	4304,
+	4312,
+	4320,
+	4328,
+	4336,
+	4352,
+	4360,
+	4376,
+	4392,
+	4408,
+	4416,
+	4424,
+	4432,
+	4440,
+	4456,
+	4464,
+	4472,
+	4480,
+	4488,
+	4504,
+	4520,
+	4536,
+	4544,
+	4552,
+	4560,
+	4576,
+	4592,
+	4600,
+	4608,
+	4616,
+	4624,
+	4632,
+	4640,
+	4656,
+	4672,
+	4680,
+	4688,
+	4696,
+	4712,
+	4720,
+	4728,
+	4736,
+	4744,
+	4752,
+	4768,
+	4776,
+	4784,
+	4800,
+	4808,
+	4816,
+	4824,
+	4832,
+	4840,
+	4848,
+	4856,
+	4864,
+	4880,
+	4896,
+	4912,
+	4928,
+	4944,
+	4960,
+	4976,
+	4984,
+	4992,
+	5008,
+	5024,
+	5032,
+	5040,
+};
+
+
+#define SCANKEYWORDS_NUM_KEYWORDS 473
+
diff --git a/contrib/bench_hash/bench_hash--1.0.sql b/contrib/bench_hash/bench_hash--1.0.sql
new file mode 100644
index 0000000000..b3a5747432
--- /dev/null
+++ b/contrib/bench_hash/bench_hash--1.0.sql
@@ -0,0 +1,21 @@
+/* contrib/bench_hash/bench_hash--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION bench_hash" to load this file. \quit
+
+CREATE FUNCTION bench_string_hash(int4)
+RETURNS int
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT;
+
+
+CREATE FUNCTION bench_cstring_hash_unaligned(int4)
+RETURNS int
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT;
+
+CREATE FUNCTION bench_cstring_hash_aligned(int4)
+RETURNS int
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT;
+
diff --git a/contrib/bench_hash/bench_hash.c b/contrib/bench_hash/bench_hash.c
new file mode 100644
index 0000000000..9c9dba93f0
--- /dev/null
+++ b/contrib/bench_hash/bench_hash.c
@@ -0,0 +1,103 @@
+/*-------------------------------------------------------------------------
+ *
+ * bench_hash.c
+ *
+ * Copyright (c) 2023, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  contrib/bench_hash/bench_hash.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "fmgr.h"
+
+PG_MODULE_MAGIC;
+
+#include "aligned_keywords.h"
+
+#include "common/hashfn.h"
+#include "common/hashfn_unstable.h"
+#include "miscadmin.h"
+#include "utils/memutils.h"
+
+
+PG_FUNCTION_INFO_V1(bench_string_hash);
+Datum
+bench_string_hash(PG_FUNCTION_ARGS)
+{
+	int32		count = PG_GETARG_INT32(0);
+	uint32 		hash = 0;
+
+	while (count-- > 0)
+	{
+		for (int i=0; i< SCANKEYWORDS_NUM_KEYWORDS; i++)
+		{
+			int idx = word_offsets[i];
+			int s_len = strlen(&aligned_words[idx]);
+			hash += hash_bytes((const unsigned char *) &aligned_words[idx], s_len);
+		}
+		CHECK_FOR_INTERRUPTS();
+	}
+
+	PG_RETURN_INT32(hash);
+}
+
+
+PG_FUNCTION_INFO_V1(bench_cstring_hash_unaligned);
+Datum
+bench_cstring_hash_unaligned(PG_FUNCTION_ARGS)
+{
+	int32		count = PG_GETARG_INT32(0);
+	uint32 		hash = 0;
+
+	char* p = (char*) palloc(5048);
+	memcpy(p, aligned_words, 5048);
+
+	while (count-- > 0)
+	{
+		for (int i=0; i< SCANKEYWORDS_NUM_KEYWORDS; i++)
+		{
+			int idx = word_offsets[i];
+			int s_len;
+			fasthash_state hs;
+
+			fasthash_init(&hs, FH_UNKNOWN_LENGTH, 0);
+			s_len = fasthash_accum_cstring_unaligned(&hs, &p[idx]);
+			hash += fasthash_final32(&hs, s_len);
+		}
+		CHECK_FOR_INTERRUPTS();
+	}
+
+	PG_RETURN_INT32(hash);
+}
+
+
+PG_FUNCTION_INFO_V1(bench_cstring_hash_aligned);
+Datum
+bench_cstring_hash_aligned(PG_FUNCTION_ARGS)
+{
+	int32		count = PG_GETARG_INT32(0);
+	uint32 		hash = 0;
+
+	char* p = (char*) palloc(5048);
+	memcpy(p, aligned_words, 5048);
+
+	while (count-- > 0)
+	{
+		for (int i=0; i< SCANKEYWORDS_NUM_KEYWORDS; i++)
+		{
+			int idx = word_offsets[i];
+			int s_len;
+			fasthash_state hs;
+
+			fasthash_init(&hs, FH_UNKNOWN_LENGTH, 0);
+			s_len = fasthash_accum_cstring_aligned(&hs, &p[idx]);
+			hash += fasthash_final32(&hs, s_len);
+		}
+		CHECK_FOR_INTERRUPTS();
+	}
+
+	PG_RETURN_INT32(hash);
+}
diff --git a/contrib/bench_hash/bench_hash.control b/contrib/bench_hash/bench_hash.control
new file mode 100644
index 0000000000..ffc63858d2
--- /dev/null
+++ b/contrib/bench_hash/bench_hash.control
@@ -0,0 +1,5 @@
+# bench_hash extension
+comment = 'benchmark some hash functions'
+default_version = '1.0'
+module_pathname = '$libdir/bench_hash'
+relocatable = true
diff --git a/contrib/bench_hash/meson.build b/contrib/bench_hash/meson.build
new file mode 100644
index 0000000000..f8d88d8b5c
--- /dev/null
+++ b/contrib/bench_hash/meson.build
@@ -0,0 +1,19 @@
+# Copyright (c) 2022-2023, PostgreSQL Global Development Group
+
+bench_hash_sources = files(
+  'bench_hash.c',
+)
+
+bench_hash = shared_module('bench_hash',
+  bench_hash_sources,
+#  link_with: pgport_srv,
+  kwargs: contrib_mod_args,
+)
+contrib_targets += bench_hash
+
+install_data(
+  'bench_hash.control',
+  'bench_hash--1.0.sql',
+  kwargs: contrib_data_args,
+)
+
diff --git a/contrib/meson.build b/contrib/meson.build
index c0b267c632..0e99195476 100644
--- a/contrib/meson.build
+++ b/contrib/meson.build
@@ -12,6 +12,7 @@ contrib_doc_args = {
   'install_dir': contrib_doc_dir,
 }
 
+subdir('bench_hash')
 subdir('adminpack')
 subdir('amcheck')
 subdir('auth_delay')
-- 
2.43.0

From 3dd1cdb0b322fdec0955c999bbffc8bf86e5a941 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Wed, 20 Dec 2023 11:40:11 +0700
Subject: [PATCH v11 4/8] Add optimized string hashing to hashfn_unstable.h

Given an already-initialized hash state and a C-string,
accumulate the hash of the string into the hash state
and return the length for the caller to save for the
finalizer. This avoids a strlen call.

If the string pointer is aligned, we can use a word-
at-a-time algorithm for the NUL check and for computing
the remainder length up to the NUL. This is only used on 64-bit,
since it's not worth the extra complexity for 32-bit platforms.
The big-endian case is simulated, and this will be rationalized
in a later commit.

Based on Jeff Davis's v10jd-0004, with
optimized tail inspired by NetBSD's strlen.

simulate big endian coding
---
 src/backend/catalog/namespace.c      |  30 ++++---
 src/include/common/hashfn_unstable.h | 116 ++++++++++++++++++++++++++-
 2 files changed, 133 insertions(+), 13 deletions(-)

diff --git a/src/backend/catalog/namespace.c b/src/backend/catalog/namespace.c
index 7fe2fd1fd4..32597bea20 100644
--- a/src/backend/catalog/namespace.c
+++ b/src/backend/catalog/namespace.c
@@ -247,25 +247,31 @@ static bool MatchNamedCall(HeapTuple proctup, int nargs, List *argnames,
 static inline uint32
 spcachekey_hash(SearchPathCacheKey key)
 {
-	const char *const start = key.searchPath;
-	const char *buf = key.searchPath;
 	fasthash_state hs;
+	int sp_len;
 
 	/* WIP: maybe roleid should be mixed in normally */
-	fasthash_init(&hs, FH_UNKNOWN_LENGTH, key.roleid);
-	while (*buf)
-	{
-		int			chunk_len = 0;
+	uint64 seed = key.roleid;
 
-		while (chunk_len < FH_SIZEOF_ACCUM && buf[chunk_len] != '\0')
-			chunk_len++;
+	// XXX not for commit
+#ifdef USE_ASSERT_CHECKING
 
-		fasthash_accum(&hs, buf, chunk_len);
-		buf += chunk_len;
-	}
+	int			blen = strlen(key.searchPath);
+
+	uint64 h_orig = fasthash64(key.searchPath, blen, key.roleid);
+
+	// Compare orig to optimized string interface
+	fasthash_init(&hs, blen, key.roleid);
+	(void) fasthash_accum_cstring(&hs, key.searchPath);
+	Assert(fasthash_final64(&hs, 0) == h_orig);
+#endif
+
+	fasthash_init(&hs, FH_UNKNOWN_LENGTH, seed);
+
+	sp_len = fasthash_accum_cstring(&hs, key.searchPath);
 
 	/* pass the length to tweak the final mix */
-	return fasthash_final32(&hs, buf - start);
+	return fasthash_final32(&hs, sp_len);
 }
 
 static inline bool
diff --git a/src/include/common/hashfn_unstable.h b/src/include/common/hashfn_unstable.h
index bf1dbee28d..4fc9edba6e 100644
--- a/src/include/common/hashfn_unstable.h
+++ b/src/include/common/hashfn_unstable.h
@@ -13,6 +13,8 @@ the same hashes between versions.
 #ifndef HASHFN_UNSTABLE_H
 #define HASHFN_UNSTABLE_H
 
+#include "port/pg_bitutils.h"
+
 /*
  * fasthash is a modification of code taken from
  * https://code.google.com/archive/p/fast-hash/source/default/source
@@ -63,11 +65,12 @@ return fasthash_final32(&hs, <final length>);
 typedef struct fasthash_state
 {
 	uint64		accum;
-#define FH_SIZEOF_ACCUM sizeof(uint64)
 
 	uint64		hash;
 } fasthash_state;
 
+#define FH_SIZEOF_ACCUM 8
+StaticAssertDecl(sizeof(((fasthash_state*) 0)->accum) == FH_SIZEOF_ACCUM, "wrong size for size macro");
 
 #define FH_UNKNOWN_LENGTH 1
 
@@ -145,6 +148,117 @@ fasthash_accum(fasthash_state *hs, const char *k, int len)
 	fasthash_combine(hs);
 }
 
+/* From: https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord */
+#define haszero64(v) \
+	(((v) - 0x0101010101010101UL) & ~(v) & 0x8080808080808080UL)
+
+#define SIM_BE 1
+#include "port/pg_bswap.h"
+
+/*
+ * With an aligned pointer, we consume the string a word at a time. Loading
+ * the word containing the NUL terminator cannot segfault since page boundaries
+ * are MAXALIGN'd. For that last word, only use bytes up to the NUL for the hash.
+ * The algorithm was adopted from NetBSD's strlen.
+ */
+static inline int
+fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str)
+{
+	const char *const start = str;
+	const char *buf = start;
+	int remainder;
+	uint64 zero_bytes;
+
+	Assert(PointerIsAligned(start, uint64));
+	while (true)
+	{
+		uint64 chunk = *(uint64 *)buf;
+#ifdef SIM_BE
+		uint64 low_bits = 0x7F7F7F7F7F7F7F7F;
+
+		chunk = pg_bswap64(chunk); /* simulate BE */
+
+		/*
+		 * This expression evaluates has the useful property that all bytes in the result word
+		 * that correspond to non-zero bytes in the original word have
+		 * the value 0x00, while all bytes corresponding to zero bytes have
+		 * the value 0x80.
+		 */
+		zero_bytes = ~(((chunk & low_bits) + low_bits) | chunk | low_bits);
+#else
+		/*
+		 * On little endian machines, we can use a slightly faster calculation,
+		 * which sets bits in the first byte in the result word
+		 * that corresponds to a zero byte in the original word.
+		 * The rest of the bytes are indeterminate, so cannot be used
+		 * on big-endian machines unless we resort to a bytewise check.
+		 */
+		zero_bytes = haszero64(chunk);
+#endif
+		if (zero_bytes)
+			break;
+
+#ifdef SIM_BE
+		hs->accum = pg_bswap64(chunk); /* not needed with real BE, because we won't need the same answer */
+#else
+		hs->accum = chunk;
+#endif
+		fasthash_combine(hs);
+		buf += FH_SIZEOF_ACCUM;
+	}
+
+	/*
+	 * Bytes with set bits will be 0x80, so
+	 * calculate the first occurrence of a zero byte within the input word
+	 * by counting the number of leading (on BE) or trailing (on LE)
+	 * zeros and dividing the result by 8.
+	 */
+#ifdef SIM_BE
+	remainder = (63 - pg_leftmost_one_pos64(zero_bytes)) / BITS_PER_BYTE;
+#else
+	remainder = pg_rightmost_one_pos64(zero_bytes) / BITS_PER_BYTE;
+#endif
+	fasthash_accum(hs, buf, remainder);
+	buf += remainder;
+
+	return buf - start;
+}
+
+static inline int
+fasthash_accum_cstring_unaligned(fasthash_state *hs, const char *str)
+{
+	const char *const start = str;
+	const char *buf = str;
+
+	while (*buf)
+	{
+		int			chunk_len = 0;
+
+		while (chunk_len < FH_SIZEOF_ACCUM && buf[chunk_len] != '\0')
+			chunk_len++;
+
+		fasthash_accum(hs, buf, chunk_len);
+		buf += chunk_len;
+	}
+
+	return buf - start;
+}
+
+/*
+ * Accumulate the input into the hash state
+ * and return the length of the string.
+ */
+static inline int
+fasthash_accum_cstring(fasthash_state *hs, const char *str)
+{
+#if SIZEOF_VOID_P >= FH_SIZEOF_ACCUM
+	if (PointerIsAligned(str, uint64))
+		return fasthash_accum_cstring_aligned(hs, str);
+	else
+#endif
+		return fasthash_accum_cstring_unaligned(hs, str);
+}
+
 /*
  * The finalizer
  *
-- 
2.43.0

From 3a2bee493bb30f3c2f253ab27a715d3ca1262111 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Tue, 26 Dec 2023 12:18:22 +0700
Subject: [PATCH v11 7/8] Add bench for pgstat

---
 contrib/bench_hash/bench_hash--1.0.sql |  9 ++++
 contrib/bench_hash/bench_hash.c        | 66 ++++++++++++++++++++++++++
 2 files changed, 75 insertions(+)

diff --git a/contrib/bench_hash/bench_hash--1.0.sql b/contrib/bench_hash/bench_hash--1.0.sql
index b3a5747432..43ce946bf6 100644
--- a/contrib/bench_hash/bench_hash--1.0.sql
+++ b/contrib/bench_hash/bench_hash--1.0.sql
@@ -19,3 +19,12 @@ RETURNS int
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT;
 
+CREATE FUNCTION bench_pgstat_hash(int4)
+RETURNS int
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT;
+
+CREATE FUNCTION bench_pgstat_hash_FH(int4)
+RETURNS int
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT;
diff --git a/contrib/bench_hash/bench_hash.c b/contrib/bench_hash/bench_hash.c
index 9c9dba93f0..017cf333ce 100644
--- a/contrib/bench_hash/bench_hash.c
+++ b/contrib/bench_hash/bench_hash.c
@@ -21,6 +21,7 @@ PG_MODULE_MAGIC;
 #include "common/hashfn_unstable.h"
 #include "miscadmin.h"
 #include "utils/memutils.h"
+#include "utils/pgstat_internal.h"
 
 
 PG_FUNCTION_INFO_V1(bench_string_hash);
@@ -101,3 +102,68 @@ bench_cstring_hash_aligned(PG_FUNCTION_ARGS)
 
 	PG_RETURN_INT32(hash);
 }
+
+static inline uint32
+pgstat_hash_hash_key_orig(const void *d, size_t size, void *arg)
+{
+	const PgStat_HashKey *key = (PgStat_HashKey *) d;
+	uint32		hash;
+
+	Assert(size == sizeof(PgStat_HashKey) && arg == NULL);
+
+	hash = murmurhash32(key->kind);
+	hash = hash_combine(hash, murmurhash32(key->dboid));
+	hash = hash_combine(hash, murmurhash32(key->objoid));
+
+	return hash;
+}
+
+static inline uint32
+pgstat_hash_hash_key_FH(const void *d, size_t size, void *arg)
+{
+	const PgStat_HashKey *key = (PgStat_HashKey *) d;
+
+	Assert(size == sizeof(PgStat_HashKey) && arg == NULL);
+
+	return fasthash32((const char *) key, size, 0);
+}
+
+PG_FUNCTION_INFO_V1(bench_pgstat_hash);
+Datum
+bench_pgstat_hash(PG_FUNCTION_ARGS)
+{
+	int32		count = PG_GETARG_INT32(0);
+	uint32 		hash = 0;
+
+	while (count-- > 0)
+	{
+		for (int i=0; i< SCANKEYWORDS_NUM_KEYWORDS - 3; i++)
+		{
+			int idx = word_offsets[i];
+			hash += pgstat_hash_hash_key_orig((const void *) &aligned_words[idx], sizeof(PgStat_HashKey), NULL);
+		}
+		CHECK_FOR_INTERRUPTS();
+	}
+
+	PG_RETURN_INT32(hash);
+}
+
+PG_FUNCTION_INFO_V1(bench_pgstat_hash_fh);
+Datum
+bench_pgstat_hash_fh(PG_FUNCTION_ARGS)
+{
+	int32		count = PG_GETARG_INT32(0);
+	uint32 		hash = 0;
+
+	while (count-- > 0)
+	{
+		for (int i=0; i< SCANKEYWORDS_NUM_KEYWORDS - 3; i++)
+		{
+			int idx = word_offsets[i];
+			hash += pgstat_hash_hash_key_FH((const void *) &aligned_words[idx], sizeof(PgStat_HashKey), NULL);
+		}
+		CHECK_FOR_INTERRUPTS();
+	}
+
+	PG_RETURN_INT32(hash);
+}
-- 
2.43.0

From 59c7f7933b7cf65222bda047f2cf915fdae8b5fd Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Tue, 26 Dec 2023 13:10:26 +0700
Subject: [PATCH v11 8/8] Optimize loading tail when >= 4 bytes

---
 src/include/common/hashfn_unstable.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/include/common/hashfn_unstable.h b/src/include/common/hashfn_unstable.h
index 5bc1fc88ec..79defbfeb5 100644
--- a/src/include/common/hashfn_unstable.h
+++ b/src/include/common/hashfn_unstable.h
@@ -113,6 +113,8 @@ fasthash_combine(fasthash_state *hs)
 static inline void
 fasthash_accum(fasthash_state *hs, const char *k, int len)
 {
+	uint32 lower_four;
+
 	Assert(hs->accum == 0);
 	Assert(len <= FH_SIZEOF_ACCUM);
 
@@ -131,8 +133,9 @@ fasthash_accum(fasthash_state *hs, const char *k, int len)
 			hs->accum |= (uint64) k[4] << 32;
 			/* FALLTHROUGH */
 		case 4:
-			hs->accum |= (uint64) k[3] << 24;
-			/* FALLTHROUGH */
+			memcpy(&lower_four, k, sizeof(lower_four));
+			hs->accum |= lower_four;
+			break;
 		case 3:
 			hs->accum |= (uint64) k[2] << 16;
 			/* FALLTHROUGH */
-- 
2.43.0

From 02875939fa45246140b34554c23eedccc66ba972 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Sun, 24 Dec 2023 15:14:46 +0700
Subject: [PATCH v11 6/8] Try simply byte-swapping on BE machines and then
 handling like LE

---
 src/include/common/hashfn_unstable.h | 36 +++++++---------------------
 1 file changed, 9 insertions(+), 27 deletions(-)

diff --git a/src/include/common/hashfn_unstable.h b/src/include/common/hashfn_unstable.h
index 4fc9edba6e..5bc1fc88ec 100644
--- a/src/include/common/hashfn_unstable.h
+++ b/src/include/common/hashfn_unstable.h
@@ -14,6 +14,7 @@ the same hashes between versions.
 #define HASHFN_UNSTABLE_H
 
 #include "port/pg_bitutils.h"
+#include "port/pg_bswap.h"
 
 /*
  * fasthash is a modification of code taken from
@@ -152,9 +153,6 @@ fasthash_accum(fasthash_state *hs, const char *k, int len)
 #define haszero64(v) \
 	(((v) - 0x0101010101010101UL) & ~(v) & 0x8080808080808080UL)
 
-#define SIM_BE 1
-#include "port/pg_bswap.h"
-
 /*
  * With an aligned pointer, we consume the string a word at a time. Loading
  * the word containing the NUL terminator cannot segfault since page boundaries
@@ -170,39 +168,27 @@ fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str)
 	uint64 zero_bytes;
 
 	Assert(PointerIsAligned(start, uint64));
-	while (true)
+	for (;;)
 	{
 		uint64 chunk = *(uint64 *)buf;
-#ifdef SIM_BE
-		uint64 low_bits = 0x7F7F7F7F7F7F7F7F;
 
-		chunk = pg_bswap64(chunk); /* simulate BE */
+#ifdef WORDS_BIGENDIAN
+		/* switch to little endian, to make later calculations easier */
+		chunk = pg_bswap64(chunk);
+#endif
 
 		/*
-		 * This expression evaluates has the useful property that all bytes in the result word
-		 * that correspond to non-zero bytes in the original word have
-		 * the value 0x00, while all bytes corresponding to zero bytes have
-		 * the value 0x80.
-		 */
-		zero_bytes = ~(((chunk & low_bits) + low_bits) | chunk | low_bits);
-#else
-		/*
-		 * On little endian machines, we can use a slightly faster calculation,
+		 * With little-endian representation, we can use this calculation,
 		 * which sets bits in the first byte in the result word
 		 * that corresponds to a zero byte in the original word.
 		 * The rest of the bytes are indeterminate, so cannot be used
-		 * on big-endian machines unless we resort to a bytewise check.
+		 * on big-endian machines without either swapping or a bytewise check.
 		 */
 		zero_bytes = haszero64(chunk);
-#endif
 		if (zero_bytes)
 			break;
 
-#ifdef SIM_BE
-		hs->accum = pg_bswap64(chunk); /* not needed with real BE, because we won't need the same answer */
-#else
 		hs->accum = chunk;
-#endif
 		fasthash_combine(hs);
 		buf += FH_SIZEOF_ACCUM;
 	}
@@ -210,14 +196,10 @@ fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str)
 	/*
 	 * Bytes with set bits will be 0x80, so
 	 * calculate the first occurrence of a zero byte within the input word
-	 * by counting the number of leading (on BE) or trailing (on LE)
+	 * by counting the number of trailing (for LE)
 	 * zeros and dividing the result by 8.
 	 */
-#ifdef SIM_BE
-	remainder = (63 - pg_leftmost_one_pos64(zero_bytes)) / BITS_PER_BYTE;
-#else
 	remainder = pg_rightmost_one_pos64(zero_bytes) / BITS_PER_BYTE;
-#endif
 	fasthash_accum(hs, buf, remainder);
 	buf += remainder;
 
-- 
2.43.0

From 291dc5818022d142b3b6cff5b503465f4acc5de9 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Mon, 27 Nov 2023 17:03:38 +0700
Subject: [PATCH v11 1/8] Add inlineable, incremental hash functions for
 in-memory use

A number of places hash NUL-terminated strings. Currently, we need
to call strlen first because hash_bytes needs the length. For short
strings the C library call has a large overhead, and strlen calls
show up prominently in profiles.

Per suggestion from Andres Freund, add hash functions with an
incremental interface. Instead of trying to whack around hash_bytes
while maintaining its current behavior on all platforms, we base
this work on fasthash (MIT licensed) which is simple, faster than
hash_bytes for inputs over 12 bytes long, and also passes the hash
function testing suite SMHasher.

The original functions have been reimplemented using our new
incremental interface to validate that this method will still give the
same answer, provided we have the input length ahead of time. Future
work will use these for some existing uses of simplehash and dynahash.

The new functionality lives in a new header hashfn_unstable.h. The
name implies we have the freedom to change things across versions that
would be unacceptable for our other hash functions that are used for
e.g. hash indexes and hash partitioning. As such, these should only
be used for in-memory data structures like hash tables. There is also
no guarantee of being endian-independent.

Reviewed (in an earlier version) by Heikki Linnakangas

Discussion: https://www.postgresql.org/message-id/20231122223432.lywt4yz2bn7tlp27%40awork3.anarazel.de
---
 src/include/common/hashfn_unstable.h | 213 +++++++++++++++++++++++++++
 src/tools/pgindent/typedefs.list     |   1 +
 2 files changed, 214 insertions(+)
 create mode 100644 src/include/common/hashfn_unstable.h

diff --git a/src/include/common/hashfn_unstable.h b/src/include/common/hashfn_unstable.h
new file mode 100644
index 0000000000..bf1dbee28d
--- /dev/null
+++ b/src/include/common/hashfn_unstable.h
@@ -0,0 +1,213 @@
+/*
+Building blocks for creating fast inlineable hash functions. The
+unstable designation is in contrast to hashfn.h, which cannot break
+compatibility because hashes can be written to disk and so must produce
+the same hashes between versions.
+
+ *
+ * Portions Copyright (c) 2018-2023, PostgreSQL Global Development Group
+ *
+ * src/include/common/hashfn_unstable.h
+ */
+
+#ifndef HASHFN_UNSTABLE_H
+#define HASHFN_UNSTABLE_H
+
+/*
+ * fasthash is a modification of code taken from
+ * https://code.google.com/archive/p/fast-hash/source/default/source
+ * under the terms of the MIT license. The original copyright
+ * notice follows:
+ */
+
+/* The MIT License
+
+   Copyright (C) 2012 Zilong Tan (eric.zltan@gmail.com)
+
+   Permission is hereby granted, free of charge, to any person
+   obtaining a copy of this software and associated documentation
+   files (the "Software"), to deal in the Software without
+   restriction, including without limitation the rights to use, copy,
+   modify, merge, publish, distribute, sublicense, and/or sell copies
+   of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/*
+ * There are two interfaces available. Examples assume a 32-bit hash:
+ *
+ * 1) When the length is known ahead of time, use fasthash32().
+ * 2) When the length is not known, use the incremental interface. To
+ *    ensure good results, keep track of the length and pass it to the finalizer:
+
+fasthash_state hs;
+fasthash_init(&hs, FH_UNKNOWN_LENGTH, 0);
+fasthash_accum(&hs, <pointer to a chunk of the input>, <chunk length, up to 8>);
+return fasthash_final32(&hs, <final length>);
+
+*/
+
+
+/* Running state for the incremental hashing interface */
+typedef struct fasthash_state
+{
+	/* staging area: holds the next chunk of input, up to 8 bytes */
+	uint64		accum;
+#define FH_SIZEOF_ACCUM sizeof(uint64)
+
+	/* hash value accumulated so far */
+	uint64		hash;
+} fasthash_state;
+
+
+/* sentinel "len" for callers that don't know the input length up front */
+#define FH_UNKNOWN_LENGTH 1
+
+/*
+ * Initialize the hash state.
+ *
+ * "len" is the length of the input, if known ahead of time.
+ * If that is not known, pass FH_UNKNOWN_LENGTH.
+ * "seed" can be zero.
+ */
+static inline void
+fasthash_init(fasthash_state *hs, int len, uint64 seed)
+{
+	memset(hs, 0, sizeof(fasthash_state));
+	/* fasthash's "m" constant; mixing in len makes the starting state
+	 * depend on the input length when it is known */
+	hs->hash = seed ^ (len * 0x880355f21e6d1965);
+}
+
+/* Both the finalizer and part of the combining step */
+static inline uint64
+fasthash_mix(uint64 h, uint64 tweak)
+{
+	/* "tweak" is zero when combining; the input length when finalizing */
+	h ^= (h >> 23) + tweak;
+	h *= 0x2127599bf4325c37;
+	h ^= h >> 47;
+	return h;
+}
+
+/* Fold the staged chunk in hs->accum into the running hash */
+static inline void
+fasthash_combine(fasthash_state *hs)
+{
+	hs->hash ^= fasthash_mix(hs->accum, 0);
+	hs->hash *= 0x880355f21e6d1965;
+
+	/* reset hash state for next input */
+	hs->accum = 0;
+}
+
+/*
+ * Accumulate up to 8 bytes of input and combine it into the hash.
+ *
+ * The staging area must be empty on entry (fasthash_combine() resets it),
+ * and "len" must not exceed FH_SIZEOF_ACCUM.
+ */
+static inline void
+fasthash_accum(fasthash_state *hs, const char *k, int len)
+{
+	Assert(hs->accum == 0);
+	Assert(len <= FH_SIZEOF_ACCUM);
+
+	/*
+	 * Read bytes through "unsigned char": plain char may be signed, and
+	 * sign extension of high-bit-set bytes would set all upper bits of the
+	 * shifted value, smearing 1-bits over the other bytes' positions in
+	 * the accumulator and degrading the hash for non-ASCII input.
+	 */
+	switch (len)
+	{
+		case 8:
+			memcpy(&hs->accum, k, 8);
+			break;
+		case 7:
+			hs->accum |= (uint64) (unsigned char) k[6] << 48;
+			/* FALLTHROUGH */
+		case 6:
+			hs->accum |= (uint64) (unsigned char) k[5] << 40;
+			/* FALLTHROUGH */
+		case 5:
+			hs->accum |= (uint64) (unsigned char) k[4] << 32;
+			/* FALLTHROUGH */
+		case 4:
+			hs->accum |= (uint64) (unsigned char) k[3] << 24;
+			/* FALLTHROUGH */
+		case 3:
+			hs->accum |= (uint64) (unsigned char) k[2] << 16;
+			/* FALLTHROUGH */
+		case 2:
+			hs->accum |= (uint64) (unsigned char) k[1] << 8;
+			/* FALLTHROUGH */
+		case 1:
+			hs->accum |= (uint64) (unsigned char) k[0];
+			break;
+		case 0:
+			return;
+	}
+
+	fasthash_combine(hs);
+}
+
+/*
+ * The finalizer
+ *
+ * "tweak" is the input length when the caller doesn't know
+ * the length ahead of time, such as for NUL-terminated
+ * strings, otherwise zero.  (In the unknown-length case the length
+ * could not be mixed in at init time, so it is mixed in here.)
+ */
+static inline uint64
+fasthash_final64(fasthash_state *hs, uint64 tweak)
+{
+	return fasthash_mix(hs->hash, tweak);
+}
+
+/*
+ * Reduce a 64-bit hash to a 32-bit hash.
+ *
+ * This provides additional mixing compared to just
+ * taking the lower 32 bits.
+ */
+static inline uint32
+fasthash_reduce32(uint64 h)
+{
+	/*
+	 * The following trick converts the 64-bit hashcode to Fermat residue,
+	 * which shall retain information from both the higher and lower parts of
+	 * hashcode.  (Taken from the original fasthash32.)
+	 */
+	return h - (h >> 32);
+}
+
+/* Like fasthash_final64, but reduces the result to a 32-bit hash */
+static inline uint32
+fasthash_final32(fasthash_state *hs, uint64 tweak)
+{
+	return fasthash_reduce32(fasthash_final64(hs, tweak));
+}
+
+/*
+ * The original fasthash64 function, re-implemented using
+ * the incremental interface.
+ *
+ * "len" is the exact input length in bytes; it is mixed in at init
+ * time, so the finalizer tweak is zero.
+ */
+static inline uint64
+fasthash64(const char *k, int len, uint64 seed)
+{
+	fasthash_state hs;
+
+	fasthash_init(&hs, len, seed);
+
+	/* consume whole 8-byte chunks */
+	while (len >= FH_SIZEOF_ACCUM)
+	{
+		fasthash_accum(&hs, k, FH_SIZEOF_ACCUM);
+		k += FH_SIZEOF_ACCUM;
+		len -= FH_SIZEOF_ACCUM;
+	}
+
+	/* then the 0-7 byte tail */
+	fasthash_accum(&hs, k, len);
+	return fasthash_final64(&hs, 0);
+}
+
+/*
+ * Like fasthash64, but returns a 32-bit hash.
+ *
+ * The return type is uint32: fasthash_reduce32 already produces a 32-bit
+ * value, and callers (e.g. pgstat_hash_hash_key) store the result in
+ * uint32; declaring uint64 here only invited silent re-widening.
+ */
+static inline uint32
+fasthash32(const char *k, int len, uint64 seed)
+{
+	return fasthash_reduce32(fasthash64(k, len, seed));
+}
+
+#endif							/* HASHFN_UNSTABLE_H */
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index d659adbfd6..4038d07458 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -3321,6 +3321,7 @@ exec_thread_arg
 execution_state
 explain_get_index_name_hook_type
 f_smgr
+fasthash_state
 fd_set
 fe_scram_state
 fe_scram_state_enum
-- 
2.43.0

From e825abb530f2edb8649d6d294ca501b082435eca Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Mon, 18 Dec 2023 11:10:28 +0700
Subject: [PATCH v11 3/8] Use fasthash for the search path cache

This serves to demonstrate the incremental API, allowing inlined
hash calculation without a strlen call. This brings the general case
performance closer to the optimization done in commit a86c61c9ee.

WIP: roleid should be mixed in normally, unless we have
reason to just use it as a seed.

Jeff Davis, with switch to chunked interface by me

Discussion: https://www.postgresql.org/message-id/b40292c99e623defe5eadedab1d438cf51a4107c.camel%40j-davis.com
---
 src/backend/catalog/namespace.c | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/src/backend/catalog/namespace.c b/src/backend/catalog/namespace.c
index 5027efc91d..7fe2fd1fd4 100644
--- a/src/backend/catalog/namespace.c
+++ b/src/backend/catalog/namespace.c
@@ -41,7 +41,7 @@
 #include "catalog/pg_ts_template.h"
 #include "catalog/pg_type.h"
 #include "commands/dbcommands.h"
-#include "common/hashfn.h"
+#include "common/hashfn_unstable.h"
 #include "funcapi.h"
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
@@ -247,11 +247,25 @@ static bool MatchNamedCall(HeapTuple proctup, int nargs, List *argnames,
 static inline uint32
 spcachekey_hash(SearchPathCacheKey key)
 {
-	const unsigned char *bytes = (const unsigned char *) key.searchPath;
-	int			blen = strlen(key.searchPath);
+	const char *const start = key.searchPath;
+	const char *buf = key.searchPath;
+	fasthash_state hs;
 
-	return hash_combine(hash_bytes(bytes, blen),
-						hash_uint32(key.roleid));
+	/* WIP: maybe roleid should be mixed in normally */
+	fasthash_init(&hs, FH_UNKNOWN_LENGTH, key.roleid);
+	while (*buf)
+	{
+		int			chunk_len = 0;
+
+		while (chunk_len < FH_SIZEOF_ACCUM && buf[chunk_len] != '\0')
+			chunk_len++;
+
+		fasthash_accum(&hs, buf, chunk_len);
+		buf += chunk_len;
+	}
+
+	/* pass the length to tweak the final mix */
+	return fasthash_final32(&hs, buf - start);
 }
 
 static inline bool
-- 
2.43.0

From 04afcddc051abc2727cd413e9000ecaba4b38037 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Sat, 9 Dec 2023 16:24:56 +0700
Subject: [PATCH v11 2/8] Use fasthash for pgstat_hash_hash_key

Previously this called the 32-bit Murmur finalizer on the three elements,
then joined with hash_combine(). Fasthash is simpler, executes faster
and takes up less binary space. While the collision and bias behavior
were almost certainly fine with the previous coding, now we have
measurements to prove it.

Discussion:
---
 src/include/utils/pgstat_internal.h | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/src/include/utils/pgstat_internal.h b/src/include/utils/pgstat_internal.h
index 60fbf9394b..ecc46bef04 100644
--- a/src/include/utils/pgstat_internal.h
+++ b/src/include/utils/pgstat_internal.h
@@ -14,7 +14,7 @@
 #define PGSTAT_INTERNAL_H
 
 
-#include "common/hashfn.h"
+#include "common/hashfn_unstable.h"
 #include "lib/dshash.h"
 #include "lib/ilist.h"
 #include "pgstat.h"
@@ -777,15 +777,10 @@ static inline uint32
 pgstat_hash_hash_key(const void *d, size_t size, void *arg)
 {
 	const PgStat_HashKey *key = (PgStat_HashKey *) d;
-	uint32		hash;
 
 	Assert(size == sizeof(PgStat_HashKey) && arg == NULL);
 
-	hash = murmurhash32(key->kind);
-	hash = hash_combine(hash, murmurhash32(key->dboid));
-	hash = hash_combine(hash, murmurhash32(key->objoid));
-
-	return hash;
+	return fasthash32((const char *) key, size, 0);
 }
 
 /*
-- 
2.43.0

Reply via email to