TupleHashEntryData is the size of a hash bucket, and it's currently 24 bytes. The size of the structure is important for HashAgg and other callers that can build up large hashtables of small tuples. Waste in this structure is even worse when the hash table is sparse, because the space is consumed whether the bucket is occupied or not.
The attached patch series brings it down to 12. There are several unappealing hacks, so ideally we'd find something cleaner, but halving the bucket size seems quite desirable if it can be done in an acceptable way. test: create table big(i int, j int); insert into big select g, 1 from generate_series(1,10000000) g; vacuum freeze analyze big; checkpoint; select pg_prewarm('big'::regclass); set hash_mem_multiplier = 1; set work_mem = '10GB'; explain analyze select count(*) from (select i from big group by i) s; results: master: 768MiB memory, 4110ms patches: 576MiB memory, 4070ms (Timing difference is within test noise, so the benefit is reduced memory footprint.) This change is related to the problem and potential solutions discussed at [1]. Patches: 0001: This patch makes the structure private, so that it's easier to control the way the structure is accessed. 0002: Removes the "additional" pointer, instead storing it right after the tuple, which is already stored in a separate chunk. Hack: this crams the "additional" pointer after the MinimalTuple in the same chunk of memory to avoid adding additional palloc()s. 0003: simplehash: allow the caller to decide which entries are empty vs in-use rather than requiring a separate "status" field. This may limit other possible status values in the future (e.g. adding on to the enum), but I'm not sure what those other status values might be. 0004: Removes the "status" field from TupleHashEntryData, using firstTuple==NULL to mean "empty", otherwise "in use". Hack: need an additional "special" pointer value to mean "input slot" now that NULL means "empty". 0005: Pack TupleHashEntryData. IIUC, this is fine even on machines that can't do unaligned access, so long as we are accessing the fields through the struct, and not taking the address of individual members. Regards, Jeff Davis [1] https://www.postgresql.org/message-id/e061c5439996e13c0fb51997e92d6a09834a7796.camel%40j-davis.com
From cf6f0afb6d8bf9215cb4008085e42a24c14e2546 Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Thu, 14 Nov 2024 11:47:51 -0800 Subject: [PATCH v1 1/5] Hide details of TupleHashEntryData struct. --- src/backend/executor/execGrouping.c | 41 ++++++++++++++++++++++++++--- src/backend/executor/nodeAgg.c | 16 +++++------ src/backend/executor/nodeSetOp.c | 20 +++++++------- src/backend/executor/nodeSubplan.c | 2 +- src/include/executor/executor.h | 4 +++ src/include/nodes/execnodes.h | 11 ++------ 6 files changed, 63 insertions(+), 31 deletions(-) diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c index 774e4de882..c6aa28b17e 100644 --- a/src/backend/executor/execGrouping.c +++ b/src/backend/executor/execGrouping.c @@ -20,6 +20,14 @@ #include "miscadmin.h" #include "utils/lsyscache.h" +struct TupleHashEntryData +{ + MinimalTuple firstTuple; /* copy of first tuple in this group */ + void *additional; /* user data */ + uint32 status; /* hash status */ + uint32 hash; /* hash value (cached) */ +}; + static int TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2); static inline uint32 TupleHashTableHash_internal(struct tuplehash_hash *tb, const MinimalTuple tuple); @@ -33,7 +41,7 @@ static inline TupleHashEntry LookupTupleHashEntry_internal(TupleHashTable hashta * visible). 
*/ #define SH_PREFIX tuplehash -#define SH_ELEMENT_TYPE TupleHashEntryData +#define SH_ELEMENT_TYPE struct TupleHashEntryData #define SH_KEY_TYPE MinimalTuple #define SH_KEY firstTuple #define SH_HASH_KEY(tb, key) TupleHashTableHash_internal(tb, key) @@ -165,7 +173,7 @@ BuildTupleHashTableExt(PlanState *parent, bool use_variable_hash_iv) { TupleHashTable hashtable; - Size entrysize = sizeof(TupleHashEntryData) + additionalsize; + Size entrysize = sizeof(struct TupleHashEntryData) + additionalsize; Size hash_mem_limit; MemoryContext oldcontext; bool allow_jit; @@ -288,6 +296,15 @@ ResetTupleHashTable(TupleHashTable hashtable) tuplehash_reset(hashtable->hashtab); } +/* + * Return size of the hash bucket. Useful for estimating memory usage. + */ +size_t +TupleHashEntrySize(void) +{ + return sizeof(struct TupleHashEntryData); +} + /* * Find or create a hashtable entry for the tuple group containing the * given tuple. The tuple must be the same type as the hashtable entries. @@ -354,6 +371,24 @@ TupleHashTableHash(TupleHashTable hashtable, TupleTableSlot *slot) return hash; } +MinimalTuple +TupleHashEntryGetTuple(TupleHashEntry entry) +{ + return entry->firstTuple; +} + +void * +TupleHashEntryGetAdditional(TupleHashEntry entry) +{ + return entry->additional; +} + +void +TupleHashEntrySetAdditional(TupleHashEntry entry, void *additional) +{ + entry->additional = additional; +} + /* * A variant of LookupTupleHashEntry for callers that have already computed * the hash value. 
@@ -497,7 +532,7 @@ static inline TupleHashEntry LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot, bool *isnew, uint32 hash) { - TupleHashEntryData *entry; + struct TupleHashEntryData *entry; bool found; MinimalTuple key; diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index 53ead77ece..29d604bd8f 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -1713,7 +1713,7 @@ hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace) transitionChunkSize = 0; return - sizeof(TupleHashEntryData) + + TupleHashEntrySize() + tupleChunkSize + pergroupChunkSize + transitionChunkSize; @@ -1954,7 +1954,7 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) if (aggstate->hash_ngroups_current > 0) { aggstate->hashentrysize = - sizeof(TupleHashEntryData) + + TupleHashEntrySize() + (hashkey_mem / (double) aggstate->hash_ngroups_current); } } @@ -2059,7 +2059,7 @@ initialize_hash_entry(AggState *aggstate, TupleHashTable hashtable, MemoryContextAlloc(hashtable->tablecxt, sizeof(AggStatePerGroupData) * aggstate->numtrans); - entry->additional = pergroup; + TupleHashEntrySetAdditional(entry, pergroup); /* * Initialize aggregates for new tuple group, lookup_hash_entries() @@ -2123,7 +2123,7 @@ lookup_hash_entries(AggState *aggstate) { if (isnew) initialize_hash_entry(aggstate, hashtable, entry); - pergroup[setno] = entry->additional; + pergroup[setno] = TupleHashEntryGetAdditional(entry); } else { @@ -2681,7 +2681,7 @@ agg_refill_hash_table(AggState *aggstate) { if (isnew) initialize_hash_entry(aggstate, perhash->hashtable, entry); - aggstate->hash_pergroup[batch->setno] = entry->additional; + aggstate->hash_pergroup[batch->setno] = TupleHashEntryGetAdditional(entry); advance_aggregates(aggstate); } else @@ -2773,7 +2773,7 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate) ExprContext *econtext; AggStatePerAgg peragg; AggStatePerGroup pergroup; - 
TupleHashEntryData *entry; + TupleHashEntry entry; TupleTableSlot *firstSlot; TupleTableSlot *result; AggStatePerHash perhash; @@ -2845,7 +2845,7 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate) * Transform representative tuple back into one with the right * columns. */ - ExecStoreMinimalTuple(entry->firstTuple, hashslot, false); + ExecStoreMinimalTuple(TupleHashEntryGetTuple(entry), hashslot, false); slot_getallattrs(hashslot); ExecClearTuple(firstSlot); @@ -2861,7 +2861,7 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate) } ExecStoreVirtualTuple(firstSlot); - pergroup = (AggStatePerGroup) entry->additional; + pergroup = (AggStatePerGroup) TupleHashEntryGetAdditional(entry); /* * Use the representative input tuple for any references to diff --git a/src/backend/executor/nodeSetOp.c b/src/backend/executor/nodeSetOp.c index a8ac68b482..c1c030fe7e 100644 --- a/src/backend/executor/nodeSetOp.c +++ b/src/backend/executor/nodeSetOp.c @@ -364,7 +364,7 @@ setop_fill_hash_table(SetOpState *setopstate) { TupleTableSlot *outerslot; int flag; - TupleHashEntryData *entry; + TupleHashEntry entry; bool isnew; outerslot = ExecProcNode(outerPlan); @@ -386,14 +386,14 @@ setop_fill_hash_table(SetOpState *setopstate) /* If new tuple group, initialize counts */ if (isnew) { - entry->additional = (SetOpStatePerGroup) - MemoryContextAlloc(setopstate->hashtable->tablecxt, - sizeof(SetOpStatePerGroupData)); - initialize_counts((SetOpStatePerGroup) entry->additional); + TupleHashEntrySetAdditional(entry, (SetOpStatePerGroup) + MemoryContextAlloc(setopstate->hashtable->tablecxt, + sizeof(SetOpStatePerGroupData))); + initialize_counts((SetOpStatePerGroup) TupleHashEntryGetAdditional(entry)); } /* Advance the counts */ - advance_counts((SetOpStatePerGroup) entry->additional, flag); + advance_counts((SetOpStatePerGroup) TupleHashEntryGetAdditional(entry), flag); } else { @@ -406,7 +406,7 @@ setop_fill_hash_table(SetOpState *setopstate) /* Advance the counts if entry is already 
present */ if (entry) - advance_counts((SetOpStatePerGroup) entry->additional, flag); + advance_counts((SetOpStatePerGroup) TupleHashEntryGetAdditional(entry), flag); } /* Must reset expression context after each hashtable lookup */ @@ -424,7 +424,7 @@ setop_fill_hash_table(SetOpState *setopstate) static TupleTableSlot * setop_retrieve_hash_table(SetOpState *setopstate) { - TupleHashEntryData *entry; + TupleHashEntry entry; TupleTableSlot *resultTupleSlot; /* @@ -454,12 +454,12 @@ setop_retrieve_hash_table(SetOpState *setopstate) * See if we should emit any copies of this tuple, and if so return * the first copy. */ - set_output_count(setopstate, (SetOpStatePerGroup) entry->additional); + set_output_count(setopstate, (SetOpStatePerGroup) TupleHashEntryGetAdditional(entry)); if (setopstate->numOutput > 0) { setopstate->numOutput--; - return ExecStoreMinimalTuple(entry->firstTuple, + return ExecStoreMinimalTuple(TupleHashEntryGetTuple(entry), resultTupleSlot, false); } diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c index 236222d72a..63aff24251 100644 --- a/src/backend/executor/nodeSubplan.c +++ b/src/backend/executor/nodeSubplan.c @@ -746,7 +746,7 @@ findPartialMatch(TupleHashTable hashtable, TupleTableSlot *slot, { CHECK_FOR_INTERRUPTS(); - ExecStoreMinimalTuple(entry->firstTuple, hashtable->tableslot, false); + ExecStoreMinimalTuple(TupleHashEntryGetTuple(entry), hashtable->tableslot, false); if (!execTuplesUnequal(slot, hashtable->tableslot, numCols, keyColIdx, eqfunctions, diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 69c3ebff00..6ba8648db1 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -154,9 +154,13 @@ extern TupleHashEntry LookupTupleHashEntry(TupleHashTable hashtable, bool *isnew, uint32 *hash); extern uint32 TupleHashTableHash(TupleHashTable hashtable, TupleTableSlot *slot); +extern size_t TupleHashEntrySize(void); extern TupleHashEntry 
LookupTupleHashEntryHash(TupleHashTable hashtable, TupleTableSlot *slot, bool *isnew, uint32 hash); +extern MinimalTuple TupleHashEntryGetTuple(TupleHashEntry entry); +extern void *TupleHashEntryGetAdditional(TupleHashEntry entry); +extern void TupleHashEntrySetAdditional(TupleHashEntry entry, void *additional); extern TupleHashEntry FindTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, ExprState *eqcomp, diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 182a6956bb..f88637825d 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -806,20 +806,13 @@ typedef struct ExecAuxRowMark * tab_eq_func respectively. * ---------------------------------------------------------------- */ +struct TupleHashEntryData; typedef struct TupleHashEntryData *TupleHashEntry; typedef struct TupleHashTableData *TupleHashTable; -typedef struct TupleHashEntryData -{ - MinimalTuple firstTuple; /* copy of first tuple in this group */ - void *additional; /* user data */ - uint32 status; /* hash status */ - uint32 hash; /* hash value (cached) */ -} TupleHashEntryData; - /* define parameters necessary to generate the tuple hash table interface */ #define SH_PREFIX tuplehash -#define SH_ELEMENT_TYPE TupleHashEntryData +#define SH_ELEMENT_TYPE struct TupleHashEntryData #define SH_KEY_TYPE MinimalTuple #define SH_SCOPE extern #define SH_DECLARE -- 2.34.1
From 1d7dfadc55e78915f724a3fc04ef56385d1a433b Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Fri, 15 Nov 2024 18:33:14 -0800 Subject: [PATCH v1 2/5] TupleHashTable: remove "additional" pointer. Store at the end of the MinimalTuple. A hack, but reduces the bucket size substantially. --- src/backend/executor/execGrouping.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c index c6aa28b17e..40ba15ca54 100644 --- a/src/backend/executor/execGrouping.c +++ b/src/backend/executor/execGrouping.c @@ -23,7 +23,6 @@ struct TupleHashEntryData { MinimalTuple firstTuple; /* copy of first tuple in this group */ - void *additional; /* user data */ uint32 status; /* hash status */ uint32 hash; /* hash value (cached) */ }; @@ -380,13 +379,19 @@ TupleHashEntryGetTuple(TupleHashEntry entry) void * TupleHashEntryGetAdditional(TupleHashEntry entry) { - return entry->additional; + char *ptr = (char *) entry->firstTuple; + void *additional; + + memcpy(&additional, ptr + entry->firstTuple->t_len, sizeof(void *)); + return additional; } void TupleHashEntrySetAdditional(TupleHashEntry entry, void *additional) { - entry->additional = additional; + char *ptr = (char *) entry->firstTuple; + + memcpy(ptr + entry->firstTuple->t_len, &additional, sizeof(void *)); } /* @@ -552,10 +557,11 @@ LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot, /* created new entry */ *isnew = true; /* zero caller data */ - entry->additional = NULL; MemoryContextSwitchTo(hashtable->tablecxt); /* Copy the first tuple into the table context */ entry->firstTuple = ExecCopySlotMinimalTuple(slot); + entry->firstTuple = repalloc(entry->firstTuple, entry->firstTuple->t_len + sizeof(void *)); + memset((char *) entry->firstTuple + entry->firstTuple->t_len, 0, sizeof(void *)); } } else -- 2.34.1
From 6cf4390ef3e5898ee42e672f68cef88cd8d484f9 Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Fri, 15 Nov 2024 12:21:04 -0800 Subject: [PATCH v1 3/5] simplehash: don't require a "status" field. Callers may define SH_ENTRY_IS_EMPTY(), SH_ENTRY_SET_EMPTY(), and SH_ENTRY_SET_IN_USE() as an alternative, which can allow for a more compact entry size. That reduces the memory overhead, especially when the hash table is sparse. --- src/include/lib/simplehash.h | 68 ++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 23 deletions(-) diff --git a/src/include/lib/simplehash.h b/src/include/lib/simplehash.h index 3e1b1f9461..213d8c372e 100644 --- a/src/include/lib/simplehash.h +++ b/src/include/lib/simplehash.h @@ -50,9 +50,18 @@ * - SH_HASH_KEY(table, key) - generate hash for the key * - SH_STORE_HASH - if defined the hash is stored in the elements * - SH_GET_HASH(tb, a) - return the field to store the hash in + * - SH_ENTRY_IS_EMPTY(tb, a) - return true if the entry is empty + * - SH_ENTRY_SET_EMPTY(tb, a) - set entry to empty + * - SH_ENTRY_SET_IN_USE(tb, a) - set entry to "in use" * * The element type is required to contain a "status" member that can store - * the range of values defined in the SH_STATUS enum. + * the range of values defined in the SH_STATUS enum. Alternatively, + * callers may define all of SH_ENTRY_IS_EMPTY, SH_ENTRY_SET_EMPTY, + * SH_ENTRY_SET_IN_USE to allow for a more compact entry size. For example, + * if the SH_ELEMENT_TYPE contains a pointer, the caller may decide that + * the entry is empty if the pointer is NULL and "in use" if the pointer is + * non-NULL. NB: if the entire entry is zero, then SH_ENTRY_IS_EMPTY() + * *must* evaluate to true. 
* * While SH_STORE_HASH (and subsequently SH_GET_HASH) are optional, because * the hash table implementation needs to compare hashes to move elements @@ -277,6 +286,18 @@ SH_SCOPE void SH_STAT(SH_TYPE * tb); #define SH_GROW_MIN_FILLFACTOR 0.1 #endif +#ifndef SH_ENTRY_IS_EMPTY +#define SH_ENTRY_IS_EMPTY(tb, a) ((a)->status == SH_STATUS_EMPTY) +#endif + +#ifndef SH_ENTRY_SET_EMPTY +#define SH_ENTRY_SET_EMPTY(tb, a) do { (a)->status = SH_STATUS_EMPTY; } while (0) +#endif + +#ifndef SH_ENTRY_SET_IN_USE +#define SH_ENTRY_SET_IN_USE(tb, a) do { (a)->status = SH_STATUS_IN_USE; } while (0) +#endif + #ifdef SH_STORE_HASH #define SH_COMPARE_KEYS(tb, ahash, akey, b) (ahash == SH_GET_HASH(tb, b) && SH_EQUAL(tb, b->SH_KEY, akey)) #else @@ -540,7 +561,7 @@ SH_GROW(SH_TYPE * tb, uint64 newsize) uint32 hash; uint32 optimal; - if (oldentry->status != SH_STATUS_IN_USE) + if (SH_ENTRY_IS_EMPTY(tb, oldentry)) { startelem = i; break; @@ -562,7 +583,7 @@ SH_GROW(SH_TYPE * tb, uint64 newsize) { SH_ELEMENT_TYPE *oldentry = &olddata[copyelem]; - if (oldentry->status == SH_STATUS_IN_USE) + if (!SH_ENTRY_IS_EMPTY(tb, oldentry)) { uint32 hash; uint32 startelem2; @@ -578,7 +599,7 @@ SH_GROW(SH_TYPE * tb, uint64 newsize) { newentry = &newdata[curelem]; - if (newentry->status == SH_STATUS_EMPTY) + if (SH_ENTRY_IS_EMPTY(tb, newentry)) { break; } @@ -649,14 +670,14 @@ restart: SH_ELEMENT_TYPE *entry = &data[curelem]; /* any empty bucket can directly be used */ - if (entry->status == SH_STATUS_EMPTY) + if (SH_ENTRY_IS_EMPTY(tb, entry)) { tb->members++; entry->SH_KEY = key; #ifdef SH_STORE_HASH SH_GET_HASH(tb, entry) = hash; #endif - entry->status = SH_STATUS_IN_USE; + SH_ENTRY_SET_IN_USE(tb, entry); *found = false; return entry; } @@ -671,7 +692,7 @@ restart: if (SH_COMPARE_KEYS(tb, hash, key, entry)) { - Assert(entry->status == SH_STATUS_IN_USE); + Assert(!SH_ENTRY_IS_EMPTY(tb, entry)); *found = true; return entry; } @@ -695,7 +716,7 @@ restart: emptyelem = SH_NEXT(tb, emptyelem, startelem); 
emptyentry = &data[emptyelem]; - if (emptyentry->status == SH_STATUS_EMPTY) + if (SH_ENTRY_IS_EMPTY(tb, emptyentry)) { lastentry = emptyentry; break; @@ -744,7 +765,7 @@ restart: #ifdef SH_STORE_HASH SH_GET_HASH(tb, entry) = hash; #endif - entry->status = SH_STATUS_IN_USE; + SH_ENTRY_SET_IN_USE(tb, entry); *found = false; return entry; } @@ -806,13 +827,11 @@ SH_LOOKUP_HASH_INTERNAL(SH_TYPE * tb, SH_KEY_TYPE key, uint32 hash) { SH_ELEMENT_TYPE *entry = &tb->data[curelem]; - if (entry->status == SH_STATUS_EMPTY) + if (SH_ENTRY_IS_EMPTY(tb, entry)) { return NULL; } - Assert(entry->status == SH_STATUS_IN_USE); - if (SH_COMPARE_KEYS(tb, hash, key, entry)) return entry; @@ -864,10 +883,10 @@ SH_DELETE(SH_TYPE * tb, SH_KEY_TYPE key) { SH_ELEMENT_TYPE *entry = &tb->data[curelem]; - if (entry->status == SH_STATUS_EMPTY) + if (SH_ENTRY_IS_EMPTY(tb, entry)) return false; - if (entry->status == SH_STATUS_IN_USE && + if (!SH_ENTRY_IS_EMPTY(tb, entry) && SH_COMPARE_KEYS(tb, hash, key, entry)) { SH_ELEMENT_TYPE *lastentry = entry; @@ -890,9 +909,9 @@ SH_DELETE(SH_TYPE * tb, SH_KEY_TYPE key) curelem = SH_NEXT(tb, curelem, startelem); curentry = &tb->data[curelem]; - if (curentry->status != SH_STATUS_IN_USE) + if (SH_ENTRY_IS_EMPTY(tb, curentry)) { - lastentry->status = SH_STATUS_EMPTY; + SH_ENTRY_SET_EMPTY(tb, lastentry); break; } @@ -902,7 +921,7 @@ SH_DELETE(SH_TYPE * tb, SH_KEY_TYPE key) /* current is at optimal position, done */ if (curoptimal == curelem) { - lastentry->status = SH_STATUS_EMPTY; + SH_ENTRY_SET_EMPTY(tb, lastentry); break; } @@ -953,9 +972,9 @@ SH_DELETE_ITEM(SH_TYPE * tb, SH_ELEMENT_TYPE * entry) curelem = SH_NEXT(tb, curelem, startelem); curentry = &tb->data[curelem]; - if (curentry->status != SH_STATUS_IN_USE) + if (SH_ENTRY_IS_EMPTY(tb, curentry)) { - lastentry->status = SH_STATUS_EMPTY; + SH_ENTRY_SET_EMPTY(tb, lastentry); break; } @@ -965,7 +984,7 @@ SH_DELETE_ITEM(SH_TYPE * tb, SH_ELEMENT_TYPE * entry) /* current is at optimal position, done */ if 
(curoptimal == curelem) { - lastentry->status = SH_STATUS_EMPTY; + SH_ENTRY_SET_EMPTY(tb, lastentry); break; } @@ -993,7 +1012,7 @@ SH_START_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter) { SH_ELEMENT_TYPE *entry = &tb->data[i]; - if (entry->status != SH_STATUS_IN_USE) + if (SH_ENTRY_IS_EMPTY(tb, entry)) { startelem = i; break; @@ -1055,7 +1074,7 @@ SH_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter) if ((iter->cur & tb->sizemask) == (iter->end & tb->sizemask)) iter->done = true; - if (elem->status == SH_STATUS_IN_USE) + if (!SH_ENTRY_IS_EMPTY(tb, elem)) { return elem; } @@ -1091,7 +1110,7 @@ SH_STAT(SH_TYPE * tb) elem = &tb->data[i]; - if (elem->status != SH_STATUS_IN_USE) + if (SH_ENTRY_IS_EMPTY(tb, elem)) continue; hash = SH_ENTRY_HASH(tb, elem); @@ -1149,6 +1168,9 @@ SH_STAT(SH_TYPE * tb) #undef SH_KEY #undef SH_ELEMENT_TYPE #undef SH_HASH_KEY +#undef SH_ENTRY_IS_EMPTY +#undef SH_ENTRY_SET_EMPTY +#undef SH_ENTRY_SET_IN_USE #undef SH_SCOPE #undef SH_DECLARE #undef SH_DEFINE -- 2.34.1
From d7ba7c25f2a5a3c139bbe36cb232444a82cc23e9 Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Fri, 15 Nov 2024 13:03:32 -0800 Subject: [PATCH v1 4/5] TupleHashTable: reduce overhead by removing "status" field. Make use of new simplehash API introduced in commit 1234567890. Because NULL now means that the entry is empty, use a unique dummy pointer to refer to the inputslot instead. It also makes the code more readable. --- src/backend/executor/execGrouping.c | 33 +++++++++++++++++++---------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c index 40ba15ca54..dc82259d32 100644 --- a/src/backend/executor/execGrouping.c +++ b/src/backend/executor/execGrouping.c @@ -23,7 +23,6 @@ struct TupleHashEntryData { MinimalTuple firstTuple; /* copy of first tuple in this group */ - uint32 status; /* hash status */ uint32 hash; /* hash value (cached) */ }; @@ -34,6 +33,14 @@ static inline TupleHashEntry LookupTupleHashEntry_internal(TupleHashTable hashta TupleTableSlot *slot, bool *isnew, uint32 hash); +/* + * Create a dummy pointer used to mean the input slot. The pointer will only + * be used for comparison and never dereferenced, and must not be equal to a + * valid pointer. Distinct from NULL, because NULL means that the hash entry + * itself is empty. + */ +const static MinimalTuple FIRSTTUPLE_INPUTSLOT = (MinimalTuple) 0x1; + /* * Define parameters for tuple hash table code generation. 
The interface is * *also* declared in execnodes.h (to generate the types, which are externally @@ -48,6 +55,9 @@ static inline TupleHashEntry LookupTupleHashEntry_internal(TupleHashTable hashta #define SH_SCOPE extern #define SH_STORE_HASH #define SH_GET_HASH(tb, a) a->hash +#define SH_ENTRY_IS_EMPTY(tb, a) ((a)->firstTuple == NULL) +#define SH_ENTRY_SET_EMPTY(tb, a) do { (a)->firstTuple = NULL; } while (0) +#define SH_ENTRY_SET_IN_USE(tb, a) Assert((a)->firstTuple != NULL) #define SH_DEFINE #include "lib/simplehash.h" @@ -335,7 +345,7 @@ LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, hashtable->in_hash_funcs = hashtable->tab_hash_funcs; hashtable->cur_eq_func = hashtable->tab_eq_func; - local_hash = TupleHashTableHash_internal(hashtable->hashtab, NULL); + local_hash = TupleHashTableHash_internal(hashtable->hashtab, FIRSTTUPLE_INPUTSLOT); entry = LookupTupleHashEntry_internal(hashtable, slot, isnew, local_hash); if (hash != NULL) @@ -363,7 +373,8 @@ TupleHashTableHash(TupleHashTable hashtable, TupleTableSlot *slot) /* Need to run the hash functions in short-lived context */ oldContext = MemoryContextSwitchTo(hashtable->tempcxt); - hash = TupleHashTableHash_internal(hashtable->hashtab, NULL); + hash = TupleHashTableHash_internal(hashtable->hashtab, + FIRSTTUPLE_INPUTSLOT); MemoryContextSwitchTo(oldContext); @@ -448,7 +459,7 @@ FindTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, hashtable->cur_eq_func = eqcomp; /* Search the hash table */ - key = NULL; /* flag to reference inputslot */ + key = FIRSTTUPLE_INPUTSLOT; entry = tuplehash_lookup(hashtable->hashtab, key); MemoryContextSwitchTo(oldContext); @@ -456,9 +467,9 @@ FindTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, } /* - * If tuple is NULL, use the input slot instead. This convention avoids the - * need to materialize virtual input tuples unless they actually need to get - * copied into the table. 
+ * If tuple is FIRSTTUPLE_INPUTSLOT, use the input slot instead. This + * convention avoids the need to materialize virtual input tuples unless they + * actually need to get copied into the table. * * Also, the caller must select an appropriate memory context for running * the hash functions. (dynahash.c doesn't change CurrentMemoryContext.) @@ -475,7 +486,7 @@ TupleHashTableHash_internal(struct tuplehash_hash *tb, FmgrInfo *hashfunctions; int i; - if (tuple == NULL) + if (tuple == FIRSTTUPLE_INPUTSLOT) { /* Process the current input tuple for the table */ slot = hashtable->inputslot; @@ -541,7 +552,7 @@ LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot, bool found; MinimalTuple key; - key = NULL; /* flag to reference inputslot */ + key = FIRSTTUPLE_INPUTSLOT; if (isnew) { @@ -589,10 +600,10 @@ TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const * LookupTupleHashEntry's dummy TupleHashEntryData. The other direction * could be supported too, but is not currently required. */ - Assert(tuple1 != NULL); + Assert(tuple1 && tuple1 != FIRSTTUPLE_INPUTSLOT); slot1 = hashtable->tableslot; ExecStoreMinimalTuple(tuple1, slot1, false); - Assert(tuple2 == NULL); + Assert(tuple2 == FIRSTTUPLE_INPUTSLOT); slot2 = hashtable->inputslot; /* For crosstype comparisons, the inputslot must be first */ -- 2.34.1
From c5c9084bb72d5a1b7c1e0fb0a0f145bf51eed51c Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Fri, 15 Nov 2024 18:52:06 -0800 Subject: [PATCH v1 5/5] TupleHashTable: Pack TupleHashEntryData struct. --- src/backend/executor/execGrouping.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c index dc82259d32..2f50f7dec9 100644 --- a/src/backend/executor/execGrouping.c +++ b/src/backend/executor/execGrouping.c @@ -24,7 +24,11 @@ struct TupleHashEntryData { MinimalTuple firstTuple; /* copy of first tuple in this group */ uint32 hash; /* hash value (cached) */ -}; +} +#ifdef pg_attribute_packed + pg_attribute_packed() +#endif +; static int TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2); static inline uint32 TupleHashTableHash_internal(struct tuplehash_hash *tb, -- 2.34.1