The attached small patch adds new entry points to simplehash.h that allow the caller to pass in the already-calculated hash value, so that simplehash doesn't need to recalculate it.
This is helpful for Memory-Bounded Hash Aggregation[1], which uses the hash value for multiple purposes. For instance, if the hash table is full and the group is not already present in the hash table, it needs to spill the tuple to disk. In that case, it would use the hash value for the initial lookup, then to select the right spill partition. Later, when it processes the batch, it will again need the same hash value to perform a lookup. By separating the hash value calculation from where it's used, we can avoid needlessly recalculating it for each of these steps. There is already an option for simplehash to cache the calculated hash value and return it with the entry, but that doesn't quite fit the need. The hash value is needed in cases where the lookup fails, because that is when the tuple must be spilled; but if the lookup fails, it returns NULL, discarding the calculated hash value. I am including this patch separately from Hash Aggregation because it is a small and independently-reviewable change. In theory, this could add overhead for "SH_SCOPE extern" for callers not specifying their own hash value, because it adds an extra external function call. I looked at the generated LLVM and it's a simple tail call, and I looked at the generated assembly and it's just an extra jmp. I tested by doing a hash aggregation of 30M zeroes, which should exercise that path a lot, and I didn't see any difference. Also, once we actually use this for hash aggregation, there will be no "SH_SCOPE extern" callers that don't specify the hash value anyway. Regards, Jeff Davis [1] https://postgr.es/m/507ac540ec7c20136364b5272acbcd4574aa76ef.camel%40j-davis.com
From ac4877d1e7b64da6caead189b85813fb5fed81a0 Mon Sep 17 00:00:00 2001 From: Jeff Davis <jda...@postgresql.org> Date: Tue, 16 Jul 2019 12:07:37 -0700 Subject: [PATCH] Allow simplehash to use already-calculated hash values. Add _lookup_hash and _insert_hash functions for callers that have already calculated the hash value of the key. This is intended for use with hash algorithms that write to disk in partitions. The hash value can be calculated once, used to perform a lookup, used to select the partition, then written to the partition along with the tuple. When the tuple is read back, the hash value does not need to be recalculated. --- src/include/lib/simplehash.h | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/src/include/lib/simplehash.h b/src/include/lib/simplehash.h index 5c6bd93bc7..065062ee96 100644 --- a/src/include/lib/simplehash.h +++ b/src/include/lib/simplehash.h @@ -74,8 +74,10 @@ #define SH_DESTROY SH_MAKE_NAME(destroy) #define SH_RESET SH_MAKE_NAME(reset) #define SH_INSERT SH_MAKE_NAME(insert) +#define SH_INSERT_HASH SH_MAKE_NAME(insert_hash) #define SH_DELETE SH_MAKE_NAME(delete) #define SH_LOOKUP SH_MAKE_NAME(lookup) +#define SH_LOOKUP_HASH SH_MAKE_NAME(lookup_hash) #define SH_GROW SH_MAKE_NAME(grow) #define SH_START_ITERATE SH_MAKE_NAME(start_iterate) #define SH_START_ITERATE_AT SH_MAKE_NAME(start_iterate_at) @@ -144,7 +146,11 @@ SH_SCOPE void SH_DESTROY(SH_TYPE * tb); SH_SCOPE void SH_RESET(SH_TYPE * tb); SH_SCOPE void SH_GROW(SH_TYPE * tb, uint32 newsize); SH_SCOPE SH_ELEMENT_TYPE *SH_INSERT(SH_TYPE * tb, SH_KEY_TYPE key, bool *found); +SH_SCOPE SH_ELEMENT_TYPE *SH_INSERT_HASH(SH_TYPE * tb, SH_KEY_TYPE key, + uint32 hash, bool *found); SH_SCOPE SH_ELEMENT_TYPE *SH_LOOKUP(SH_TYPE * tb, SH_KEY_TYPE key); +SH_SCOPE SH_ELEMENT_TYPE *SH_LOOKUP_HASH(SH_TYPE * tb, SH_KEY_TYPE key, + uint32 hash); SH_SCOPE bool SH_DELETE(SH_TYPE * tb, SH_KEY_TYPE key); SH_SCOPE void SH_START_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter); SH_SCOPE void SH_START_ITERATE_AT(SH_TYPE * tb, SH_ITERATOR * iter, uint32 at); @@ -499,7 +505,19 @@ SH_GROW(SH_TYPE * tb, uint32 newsize) SH_SCOPE SH_ELEMENT_TYPE * SH_INSERT(SH_TYPE * tb, SH_KEY_TYPE key, bool *found) { - uint32 hash = SH_HASH_KEY(tb, key); + uint32 hash = SH_HASH_KEY(tb, key); + + return SH_INSERT_HASH(tb, key, hash, found); +} + +/* + * Insert the key key into the hash-table using an already-calculated + * hash. Set *found to true if the key already exists, false + * otherwise. Returns the hash-table entry in either case. + */ +SH_SCOPE SH_ELEMENT_TYPE * +SH_INSERT_HASH(SH_TYPE * tb, SH_KEY_TYPE key, uint32 hash, bool *found) +{ uint32 startelem; uint32 curelem; SH_ELEMENT_TYPE *data; @@ -669,7 +687,19 @@ restart: SH_SCOPE SH_ELEMENT_TYPE * SH_LOOKUP(SH_TYPE * tb, SH_KEY_TYPE key) { - uint32 hash = SH_HASH_KEY(tb, key); + uint32 hash = SH_HASH_KEY(tb, key); + + return SH_LOOKUP_HASH(tb, key, hash); +} + +/* + * Lookup up entry in hash table using an already-calculated hash. + * + * Returns NULL if key not present. + */ +SH_SCOPE SH_ELEMENT_TYPE * +SH_LOOKUP_HASH(SH_TYPE * tb, SH_KEY_TYPE key, uint32 hash) +{ const uint32 startelem = SH_INITIAL_BUCKET(tb, hash); uint32 curelem = startelem; @@ -971,8 +1001,10 @@ SH_STAT(SH_TYPE * tb) #undef SH_DESTROY #undef SH_RESET #undef SH_INSERT +#undef SH_INSERT_HASH #undef SH_DELETE #undef SH_LOOKUP +#undef SH_LOOKUP_HASH #undef SH_GROW #undef SH_START_ITERATE #undef SH_START_ITERATE_AT -- 2.17.1