Allow simplehash to use already-calculated hash values

Jeff Davis Tue, 16 Jul 2019 15:20:54 -0700

The attached small patch adds new entry points to simplehash.h that
allow the caller to pass in the already-calculated hash value, so that
simplehash doesn't need to recalculate it.


This is helpful for Memory-Bounded Hash Aggregation[1], which uses the
hash value for multiple purposes. For instance, if the hash table is
full and the group is not already present in the hash table, it needs
to spill the tuple to disk. In that case, it would use the hash value
for the initial lookup, then to select the right spill partition.
Later, when it processes the batch, it will again need the same hash
value to perform a lookup. By separating the hash value calculation
from where it's used, we can avoid needlessly recalculating it for each
of these steps.

There is already an option for simplehash to cache the calculated hash
value and return it with the entry, but that doesn't quite fit the
need. The hash value is needed in cases where the lookup fails, because
that is when the tuple must be spilled; but if the lookup fails, it
returns NULL, discarding the calculated hash value.

I am including this patch separately from Hash Aggregation because it
is a small and independently-reviewable change.

In theory, this could add overhead for "SH_SCOPE extern" for callers
not specifying their own hash value, because it adds an extra external
function call. I looked at the generated LLVM and it's a simple tail
call, and I looked at the generated assembly and it's just an extra
jmp. I tested by doing a hash aggregation of 30M zeroes, which should
exercise that path a lot, and I didn't see any difference. Also, once
we actually use this for hash aggregation, there will be no "SH_SCOPE
extern" callers that don't specify the hash value anyway.

Regards,
        Jeff Davis

[1] 
https://postgr.es/m/507ac540ec7c20136364b5272acbcd4574aa76ef.camel%40j-davis.com

From ac4877d1e7b64da6caead189b85813fb5fed81a0 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jda...@postgresql.org>
Date: Tue, 16 Jul 2019 12:07:37 -0700
Subject: [PATCH] Allow simplehash to use already-calculated hash values.

Add _lookup_hash and _insert_hash functions for callers that have
already calculated the hash value of the key.

This is intended for use with hash algorithms that write to disk in
partitions. The hash value can be calculated once, used to perform a
lookup, used to select the partition, then written to the partition
along with the tuple. When the tuple is read back, the hash value does
not need to be recalculated.
---
 src/include/lib/simplehash.h | 36 ++++++++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/src/include/lib/simplehash.h b/src/include/lib/simplehash.h
index 5c6bd93bc7..065062ee96 100644
--- a/src/include/lib/simplehash.h
+++ b/src/include/lib/simplehash.h
@@ -74,8 +74,10 @@
 #define SH_DESTROY SH_MAKE_NAME(destroy)
 #define SH_RESET SH_MAKE_NAME(reset)
 #define SH_INSERT SH_MAKE_NAME(insert)
+#define SH_INSERT_HASH SH_MAKE_NAME(insert_hash)
 #define SH_DELETE SH_MAKE_NAME(delete)
 #define SH_LOOKUP SH_MAKE_NAME(lookup)
+#define SH_LOOKUP_HASH SH_MAKE_NAME(lookup_hash)
 #define SH_GROW SH_MAKE_NAME(grow)
 #define SH_START_ITERATE SH_MAKE_NAME(start_iterate)
 #define SH_START_ITERATE_AT SH_MAKE_NAME(start_iterate_at)
@@ -144,7 +146,11 @@ SH_SCOPE void SH_DESTROY(SH_TYPE * tb);
 SH_SCOPE void SH_RESET(SH_TYPE * tb);
 SH_SCOPE void SH_GROW(SH_TYPE * tb, uint32 newsize);
 SH_SCOPE	SH_ELEMENT_TYPE *SH_INSERT(SH_TYPE * tb, SH_KEY_TYPE key, bool *found);
+SH_SCOPE	SH_ELEMENT_TYPE *SH_INSERT_HASH(SH_TYPE * tb, SH_KEY_TYPE key,
+											uint32 hash, bool *found);
 SH_SCOPE	SH_ELEMENT_TYPE *SH_LOOKUP(SH_TYPE * tb, SH_KEY_TYPE key);
+SH_SCOPE	SH_ELEMENT_TYPE *SH_LOOKUP_HASH(SH_TYPE * tb, SH_KEY_TYPE key,
+											uint32 hash);
 SH_SCOPE bool SH_DELETE(SH_TYPE * tb, SH_KEY_TYPE key);
 SH_SCOPE void SH_START_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter);
 SH_SCOPE void SH_START_ITERATE_AT(SH_TYPE * tb, SH_ITERATOR * iter, uint32 at);
@@ -499,7 +505,19 @@ SH_GROW(SH_TYPE * tb, uint32 newsize)
 SH_SCOPE	SH_ELEMENT_TYPE *
 SH_INSERT(SH_TYPE * tb, SH_KEY_TYPE key, bool *found)
 {
-	uint32		hash = SH_HASH_KEY(tb, key);
+	uint32 hash = SH_HASH_KEY(tb, key);
+
+	return SH_INSERT_HASH(tb, key, hash, found);
+}
+
+/*
+ * Insert the key key into the hash-table using an already-calculated
+ * hash. Set *found to true if the key already exists, false
+ * otherwise. Returns the hash-table entry in either case.
+ */
+SH_SCOPE	SH_ELEMENT_TYPE *
+SH_INSERT_HASH(SH_TYPE * tb, SH_KEY_TYPE key, uint32 hash, bool *found)
+{
 	uint32		startelem;
 	uint32		curelem;
 	SH_ELEMENT_TYPE *data;
@@ -669,7 +687,19 @@ restart:
 SH_SCOPE	SH_ELEMENT_TYPE *
 SH_LOOKUP(SH_TYPE * tb, SH_KEY_TYPE key)
 {
-	uint32		hash = SH_HASH_KEY(tb, key);
+	uint32 hash = SH_HASH_KEY(tb, key);
+
+	return SH_LOOKUP_HASH(tb, key, hash);
+}
+
+/*
+ * Lookup up entry in hash table using an already-calculated hash.
+ *
+ * Returns NULL if key not present.
+ */
+SH_SCOPE	SH_ELEMENT_TYPE *
+SH_LOOKUP_HASH(SH_TYPE * tb, SH_KEY_TYPE key, uint32 hash)
+{
 	const uint32 startelem = SH_INITIAL_BUCKET(tb, hash);
 	uint32		curelem = startelem;
 
@@ -971,8 +1001,10 @@ SH_STAT(SH_TYPE * tb)
 #undef SH_DESTROY
 #undef SH_RESET
 #undef SH_INSERT
+#undef SH_INSERT_HASH
 #undef SH_DELETE
 #undef SH_LOOKUP
+#undef SH_LOOKUP_HASH
 #undef SH_GROW
 #undef SH_START_ITERATE
 #undef SH_START_ITERATE_AT
-- 
2.17.1

Allow simplehash to use already-calculated hash values

Reply via email to