On Sun, 2025-01-12 at 14:54 +1300, David Rowley wrote:
> I wonder if there's some other better way of doing this. Would it be
> worth having some function like ExecCopySlotMinimalTuple() that
> accepts an additional parameter so that the palloc allocates N more
> bytes at the end?

Attached a new series that implements this idea.

I also included the change to use the Bump allocator for the tablecxt.
In my simple test, the whole patch series reduces HashAgg memory usage
by more than 40% and increases speed by a few percent.
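
To illustrate the end result for anyone skimming the series: after
0001-0003, a caller sizes its per-group state into the table and then
reads it back from the entry itself, with no separate allocation.
Roughly (hypothetical caller and "MyPerGroupData" struct; the real
users are nodeAgg.c and nodeSetOp.c):

    /* request per-group space when building the table */
    hashtable = BuildTupleHashTable(..., /* additionalsize */
                                    sizeof(MyPerGroupData), ...);

    /* extra bytes live right after the copied tuple, maxaligned */
    entry = LookupTupleHashEntry(hashtable, slot, &isnew, NULL);
    pergroup = (MyPerGroupData *) TupleHashEntryGetAdditional(entry);
    if (isnew)
        pergroup->count = 0;    /* space is already zeroed */
    pergroup->count++;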

In your test, patch 0003 seemed to cause a regression, but 0007 made
up for it (and was maybe a hair faster). I investigated, but the
profile was a bit misleading, so I'm not clear on why 0003 came out
slower. I can investigate further, but the primary purpose of this
patch series is to reduce memory usage, so as long as the overall
series shows no regression, I think it's fine.

Regards,
        Jeff Davis

From 9c01b9ac8e1f51d19f4a83521fe1ed98782701f2 Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Mon, 13 Jan 2025 14:37:02 -0800
Subject: [PATCH v6 1/7] Create accessor functions for TupleHashEntry.

---
 src/backend/executor/execGrouping.c | 49 +++++++++++++++++++++++++++--
 src/backend/executor/nodeAgg.c      | 20 +++++-------
 src/backend/executor/nodeSetOp.c    | 17 +++++-----
 src/backend/executor/nodeSubplan.c  |  2 +-
 src/include/executor/executor.h     |  3 ++
 src/include/nodes/execnodes.h       | 11 ++-----
 6 files changed, 71 insertions(+), 31 deletions(-)

diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index 33b124fbb0a..6763c35457c 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -20,6 +20,14 @@
 #include "miscadmin.h"
 #include "utils/lsyscache.h"
 
+struct TupleHashEntryData
+{
+	MinimalTuple firstTuple;	/* copy of first tuple in this group */
+	void	   *additional;		/* user data */
+	uint32		status;			/* hash status */
+	uint32		hash;			/* hash value (cached) */
+};
+
 static int	TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2);
 static inline uint32 TupleHashTableHash_internal(struct tuplehash_hash *tb,
 												 const MinimalTuple tuple);
@@ -196,6 +204,7 @@ BuildTupleHashTable(PlanState *parent,
 	hashtable->tab_collations = collations;
 	hashtable->tablecxt = tablecxt;
 	hashtable->tempcxt = tempcxt;
+	hashtable->additionalsize = additionalsize;
 	hashtable->tableslot = NULL;	/* will be made on first lookup */
 	hashtable->inputslot = NULL;
 	hashtable->in_hash_expr = NULL;
@@ -273,6 +282,37 @@ ResetTupleHashTable(TupleHashTable hashtable)
 	tuplehash_reset(hashtable->hashtab);
 }
 
+/*
+ * Return size of the hash bucket. Useful for estimating memory usage.
+ */
+size_t
+TupleHashEntrySize(void)
+{
+	return sizeof(TupleHashEntryData);
+}
+
+/*
+ * Return tuple from hash entry.
+ */
+MinimalTuple
+TupleHashEntryGetTuple(TupleHashEntry entry)
+{
+	return entry->firstTuple;
+}
+
+/*
+ * Get a pointer into the additional space allocated for this entry. The
+ * amount of space available is the additionalsize specified to
+ * BuildTupleHashTable(). If additionalsize was specified as zero, no
+ * additional space is available and this function should not be called.
+ */
+void *
+TupleHashEntryGetAdditional(TupleHashEntry entry)
+{
+	Assert(entry->additional != NULL);
+	return entry->additional;
+}
+
 /*
  * Find or create a hashtable entry for the tuple group containing the
  * given tuple.  The tuple must be the same type as the hashtable entries.
@@ -479,11 +519,16 @@ LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot,
 		{
 			/* created new entry */
 			*isnew = true;
-			/* zero caller data */
-			entry->additional = NULL;
+
 			MemoryContextSwitchTo(hashtable->tablecxt);
+
 			/* Copy the first tuple into the table context */
 			entry->firstTuple = ExecCopySlotMinimalTuple(slot);
+
+			if (hashtable->additionalsize > 0)
+				entry->additional = palloc0(hashtable->additionalsize);
+			else
+				entry->additional = NULL;
 		}
 	}
 	else
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 3005b5c0e3b..19640752967 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -1713,7 +1713,7 @@ hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace)
 		transitionChunkSize = 0;
 
 	return
-		sizeof(TupleHashEntryData) +
+		TupleHashEntrySize() +
 		tupleChunkSize +
 		pergroupChunkSize +
 		transitionChunkSize;
@@ -1954,7 +1954,7 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
 	if (aggstate->hash_ngroups_current > 0)
 	{
 		aggstate->hashentrysize =
-			sizeof(TupleHashEntryData) +
+			TupleHashEntrySize() +
 			(hashkey_mem / (double) aggstate->hash_ngroups_current);
 	}
 }
@@ -2055,11 +2055,7 @@ initialize_hash_entry(AggState *aggstate, TupleHashTable hashtable,
 	if (aggstate->numtrans == 0)
 		return;
 
-	pergroup = (AggStatePerGroup)
-		MemoryContextAlloc(hashtable->tablecxt,
-						   sizeof(AggStatePerGroupData) * aggstate->numtrans);
-
-	entry->additional = pergroup;
+	pergroup = (AggStatePerGroup) TupleHashEntryGetAdditional(entry);
 
 	/*
 	 * Initialize aggregates for new tuple group, lookup_hash_entries()
@@ -2123,7 +2119,7 @@ lookup_hash_entries(AggState *aggstate)
 		{
 			if (isnew)
 				initialize_hash_entry(aggstate, hashtable, entry);
-			pergroup[setno] = entry->additional;
+			pergroup[setno] = TupleHashEntryGetAdditional(entry);
 		}
 		else
 		{
@@ -2681,7 +2677,7 @@ agg_refill_hash_table(AggState *aggstate)
 		{
 			if (isnew)
 				initialize_hash_entry(aggstate, perhash->hashtable, entry);
-			aggstate->hash_pergroup[batch->setno] = entry->additional;
+			aggstate->hash_pergroup[batch->setno] = TupleHashEntryGetAdditional(entry);
 			advance_aggregates(aggstate);
 		}
 		else
@@ -2773,7 +2769,7 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate)
 	ExprContext *econtext;
 	AggStatePerAgg peragg;
 	AggStatePerGroup pergroup;
-	TupleHashEntryData *entry;
+	TupleHashEntry entry;
 	TupleTableSlot *firstSlot;
 	TupleTableSlot *result;
 	AggStatePerHash perhash;
@@ -2845,7 +2841,7 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate)
 		 * Transform representative tuple back into one with the right
 		 * columns.
 		 */
-		ExecStoreMinimalTuple(entry->firstTuple, hashslot, false);
+		ExecStoreMinimalTuple(TupleHashEntryGetTuple(entry), hashslot, false);
 		slot_getallattrs(hashslot);
 
 		ExecClearTuple(firstSlot);
@@ -2861,7 +2857,7 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate)
 		}
 		ExecStoreVirtualTuple(firstSlot);
 
-		pergroup = (AggStatePerGroup) entry->additional;
+		pergroup = (AggStatePerGroup) TupleHashEntryGetAdditional(entry);
 
 		/*
 		 * Use the representative input tuple for any references to
diff --git a/src/backend/executor/nodeSetOp.c b/src/backend/executor/nodeSetOp.c
index 5b7ff9c3748..2a50d56fc87 100644
--- a/src/backend/executor/nodeSetOp.c
+++ b/src/backend/executor/nodeSetOp.c
@@ -425,6 +425,7 @@ setop_fill_hash_table(SetOpState *setopstate)
 	{
 		TupleTableSlot *outerslot;
 		TupleHashEntryData *entry;
+		SetOpStatePerGroup pergroup;
 		bool		isnew;
 
 		outerslot = ExecProcNode(outerPlan);
@@ -437,16 +438,16 @@ setop_fill_hash_table(SetOpState *setopstate)
 									 outerslot,
 									 &isnew, NULL);
 
+		pergroup = TupleHashEntryGetAdditional(entry);
 		/* If new tuple group, initialize counts to zero */
 		if (isnew)
 		{
-			entry->additional = (SetOpStatePerGroup)
-				MemoryContextAllocZero(setopstate->hashtable->tablecxt,
-									   sizeof(SetOpStatePerGroupData));
+			pergroup->numLeft = 0;
+			pergroup->numRight = 0;
 		}
 
 		/* Advance the counts */
-		((SetOpStatePerGroup) entry->additional)->numLeft++;
+		pergroup->numLeft++;
 
 		/* Must reset expression context after each hashtable lookup */
 		ResetExprContext(econtext);
@@ -478,7 +479,7 @@ setop_fill_hash_table(SetOpState *setopstate)
 
 			/* Advance the counts if entry is already present */
 			if (entry)
-				((SetOpStatePerGroup) entry->additional)->numRight++;
+				((SetOpStatePerGroup) TupleHashEntryGetAdditional(entry))->numRight++;
 
 			/* Must reset expression context after each hashtable lookup */
 			ResetExprContext(econtext);
@@ -496,7 +497,7 @@ setop_fill_hash_table(SetOpState *setopstate)
 static TupleTableSlot *
 setop_retrieve_hash_table(SetOpState *setopstate)
 {
-	TupleHashEntryData *entry;
+	TupleHashEntry entry;
 	TupleTableSlot *resultTupleSlot;
 
 	/*
@@ -526,12 +527,12 @@ setop_retrieve_hash_table(SetOpState *setopstate)
 		 * See if we should emit any copies of this tuple, and if so return
 		 * the first copy.
 		 */
-		set_output_count(setopstate, (SetOpStatePerGroup) entry->additional);
+		set_output_count(setopstate, (SetOpStatePerGroup) TupleHashEntryGetAdditional(entry));
 
 		if (setopstate->numOutput > 0)
 		{
 			setopstate->numOutput--;
-			return ExecStoreMinimalTuple(entry->firstTuple,
+			return ExecStoreMinimalTuple(TupleHashEntryGetTuple(entry),
 										 resultTupleSlot,
 										 false);
 		}
diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c
index 49767ed6a52..f7f6fc2da0b 100644
--- a/src/backend/executor/nodeSubplan.c
+++ b/src/backend/executor/nodeSubplan.c
@@ -753,7 +753,7 @@ findPartialMatch(TupleHashTable hashtable, TupleTableSlot *slot,
 	{
 		CHECK_FOR_INTERRUPTS();
 
-		ExecStoreMinimalTuple(entry->firstTuple, hashtable->tableslot, false);
+		ExecStoreMinimalTuple(TupleHashEntryGetTuple(entry), hashtable->tableslot, false);
 		if (!execTuplesUnequal(slot, hashtable->tableslot,
 							   numCols, keyColIdx,
 							   eqfunctions,
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index 30e2a82346f..26d1877b1f3 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -148,6 +148,9 @@ extern TupleHashEntry LookupTupleHashEntry(TupleHashTable hashtable,
 										   bool *isnew, uint32 *hash);
 extern uint32 TupleHashTableHash(TupleHashTable hashtable,
 								 TupleTableSlot *slot);
+extern size_t TupleHashEntrySize(void);
+extern MinimalTuple TupleHashEntryGetTuple(TupleHashEntry entry);
+extern void *TupleHashEntryGetAdditional(TupleHashEntry entry);
 extern TupleHashEntry LookupTupleHashEntryHash(TupleHashTable hashtable,
 											   TupleTableSlot *slot,
 											   bool *isnew, uint32 hash);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index e2d1dc1e067..d46ce9b4996 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -831,17 +831,11 @@ typedef struct ExecAuxRowMark
  * point to tab_hash_expr and tab_eq_func respectively.
  * ----------------------------------------------------------------
  */
+struct TupleHashEntryData;
+typedef struct TupleHashEntryData TupleHashEntryData;
 typedef struct TupleHashEntryData *TupleHashEntry;
 typedef struct TupleHashTableData *TupleHashTable;
 
-typedef struct TupleHashEntryData
-{
-	MinimalTuple firstTuple;	/* copy of first tuple in this group */
-	void	   *additional;		/* user data */
-	uint32		status;			/* hash status */
-	uint32		hash;			/* hash value (cached) */
-} TupleHashEntryData;
-
 /* define parameters necessary to generate the tuple hash table interface */
 #define SH_PREFIX tuplehash
 #define SH_ELEMENT_TYPE TupleHashEntryData
@@ -860,6 +854,7 @@ typedef struct TupleHashTableData
 	Oid		   *tab_collations; /* collations for hash and comparison */
 	MemoryContext tablecxt;		/* memory context containing table */
 	MemoryContext tempcxt;		/* context for function evaluations */
+	Size		additionalsize; /* size of additional data */
 	TupleTableSlot *tableslot;	/* slot for referencing table entries */
 	/* The following fields are set transiently for each table search: */
 	TupleTableSlot *inputslot;	/* current input tuple's slot */
-- 
2.34.1

From 5db233e1fd02636b25399fa4db521fd464d3c504 Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Fri, 7 Feb 2025 13:14:50 -0800
Subject: [PATCH v6 2/7] Add ExecCopySlotMinimalTupleExtra().

Allows an "extra" argument that allocates extra memory at the end of
the MinimalTuple. This is important for callers that need to store
additional data, but do not want to perform an additional allocation.
---
 src/backend/access/common/heaptuple.c      | 43 ++++++++++++++++++----
 src/backend/executor/execTuples.c          | 24 ++++++------
 src/backend/executor/nodeGatherMerge.c     |  2 +-
 src/backend/utils/sort/tuplesortvariants.c |  2 +-
 src/backend/utils/sort/tuplestore.c        |  6 +--
 src/include/access/htup_details.h          |  7 ++--
 src/include/executor/tuptable.h            | 19 +++++++++-
 7 files changed, 74 insertions(+), 29 deletions(-)

diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c
index b43cb9ccff4..36df2b31f2d 100644
--- a/src/backend/access/common/heaptuple.c
+++ b/src/backend/access/common/heaptuple.c
@@ -1452,7 +1452,8 @@ heap_freetuple(HeapTuple htup)
 MinimalTuple
 heap_form_minimal_tuple(TupleDesc tupleDescriptor,
 						const Datum *values,
-						const bool *isnull)
+						const bool *isnull,
+						Size extra)
 {
 	MinimalTuple tuple;			/* return tuple */
 	Size		len,
@@ -1497,7 +1498,10 @@ heap_form_minimal_tuple(TupleDesc tupleDescriptor,
 	/*
 	 * Allocate and zero the space needed.
 	 */
-	tuple = (MinimalTuple) palloc0(len);
+	if (extra == 0)
+		tuple = (MinimalTuple) palloc0(len);
+	else
+		tuple = (MinimalTuple) palloc0(MAXALIGN(len) + extra);
 
 	/*
 	 * And fill in the information.
@@ -1533,12 +1537,23 @@ heap_free_minimal_tuple(MinimalTuple mtup)
  * The result is allocated in the current memory context.
  */
 MinimalTuple
-heap_copy_minimal_tuple(MinimalTuple mtup)
+heap_copy_minimal_tuple(MinimalTuple mtup, Size extra)
 {
 	MinimalTuple result;
 
-	result = (MinimalTuple) palloc(mtup->t_len);
-	memcpy(result, mtup, mtup->t_len);
+	if (extra == 0)
+	{
+		result = (MinimalTuple) palloc(mtup->t_len);
+		memcpy(result, mtup, mtup->t_len);
+	}
+	else
+	{
+		Size		totalsize = MAXALIGN(mtup->t_len) + extra;
+
+		result = (MinimalTuple) palloc(totalsize);
+		memcpy(result, mtup, mtup->t_len);
+		memset((char *) result + mtup->t_len, 0, totalsize - mtup->t_len);
+	}
 	return result;
 }
 
@@ -1574,15 +1589,27 @@ heap_tuple_from_minimal_tuple(MinimalTuple mtup)
  * The result is allocated in the current memory context.
  */
 MinimalTuple
-minimal_tuple_from_heap_tuple(HeapTuple htup)
+minimal_tuple_from_heap_tuple(HeapTuple htup, Size extra)
 {
 	MinimalTuple result;
 	uint32		len;
 
 	Assert(htup->t_len > MINIMAL_TUPLE_OFFSET);
 	len = htup->t_len - MINIMAL_TUPLE_OFFSET;
-	result = (MinimalTuple) palloc(len);
-	memcpy(result, (char *) htup->t_data + MINIMAL_TUPLE_OFFSET, len);
+	if (extra == 0)
+	{
+		result = (MinimalTuple) palloc(len);
+		memcpy(result, (char *) htup->t_data + MINIMAL_TUPLE_OFFSET, len);
+	}
+	else
+	{
+		Size		totalsize = MAXALIGN(len) + extra;
+
+		result = (MinimalTuple) palloc(totalsize);
+		memcpy(result, (char *) htup->t_data + MINIMAL_TUPLE_OFFSET, len);
+		memset((char *) result + len, 0, totalsize - len);
+	}
+
 	result->t_len = len;
 	return result;
 }
diff --git a/src/backend/executor/execTuples.c b/src/backend/executor/execTuples.c
index 7de490462d4..8e02d68824f 100644
--- a/src/backend/executor/execTuples.c
+++ b/src/backend/executor/execTuples.c
@@ -298,13 +298,14 @@ tts_virtual_copy_heap_tuple(TupleTableSlot *slot)
 }
 
 static MinimalTuple
-tts_virtual_copy_minimal_tuple(TupleTableSlot *slot)
+tts_virtual_copy_minimal_tuple(TupleTableSlot *slot, Size extra)
 {
 	Assert(!TTS_EMPTY(slot));
 
 	return heap_form_minimal_tuple(slot->tts_tupleDescriptor,
 								   slot->tts_values,
-								   slot->tts_isnull);
+								   slot->tts_isnull,
+								   extra);
 }
 
 
@@ -472,14 +473,14 @@ tts_heap_copy_heap_tuple(TupleTableSlot *slot)
 }
 
 static MinimalTuple
-tts_heap_copy_minimal_tuple(TupleTableSlot *slot)
+tts_heap_copy_minimal_tuple(TupleTableSlot *slot, Size extra)
 {
 	HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot;
 
 	if (!hslot->tuple)
 		tts_heap_materialize(slot);
 
-	return minimal_tuple_from_heap_tuple(hslot->tuple);
+	return minimal_tuple_from_heap_tuple(hslot->tuple, extra);
 }
 
 static void
@@ -607,7 +608,8 @@ tts_minimal_materialize(TupleTableSlot *slot)
 	{
 		mslot->mintuple = heap_form_minimal_tuple(slot->tts_tupleDescriptor,
 												  slot->tts_values,
-												  slot->tts_isnull);
+												  slot->tts_isnull,
+												  0);
 	}
 	else
 	{
@@ -617,7 +619,7 @@ tts_minimal_materialize(TupleTableSlot *slot)
 		 * TTS_FLAG_SHOULDFREE set).  Copy the minimal tuple into the given
 		 * slot's memory context.
 		 */
-		mslot->mintuple = heap_copy_minimal_tuple(mslot->mintuple);
+		mslot->mintuple = heap_copy_minimal_tuple(mslot->mintuple, 0);
 	}
 
 	slot->tts_flags |= TTS_FLAG_SHOULDFREE;
@@ -666,14 +668,14 @@ tts_minimal_copy_heap_tuple(TupleTableSlot *slot)
 }
 
 static MinimalTuple
-tts_minimal_copy_minimal_tuple(TupleTableSlot *slot)
+tts_minimal_copy_minimal_tuple(TupleTableSlot *slot, Size extra)
 {
 	MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot;
 
 	if (!mslot->mintuple)
 		tts_minimal_materialize(slot);
 
-	return heap_copy_minimal_tuple(mslot->mintuple);
+	return heap_copy_minimal_tuple(mslot->mintuple, extra);
 }
 
 static void
@@ -926,7 +928,7 @@ tts_buffer_heap_copy_heap_tuple(TupleTableSlot *slot)
 }
 
 static MinimalTuple
-tts_buffer_heap_copy_minimal_tuple(TupleTableSlot *slot)
+tts_buffer_heap_copy_minimal_tuple(TupleTableSlot *slot, Size extra)
 {
 	BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
 
@@ -935,7 +937,7 @@ tts_buffer_heap_copy_minimal_tuple(TupleTableSlot *slot)
 	if (!bslot->base.tuple)
 		tts_buffer_heap_materialize(slot);
 
-	return minimal_tuple_from_heap_tuple(bslot->base.tuple);
+	return minimal_tuple_from_heap_tuple(bslot->base.tuple, extra);
 }
 
 static inline void
@@ -1895,7 +1897,7 @@ ExecFetchSlotMinimalTuple(TupleTableSlot *slot,
 	{
 		if (shouldFree)
 			*shouldFree = true;
-		return slot->tts_ops->copy_minimal_tuple(slot);
+		return slot->tts_ops->copy_minimal_tuple(slot, 0);
 	}
 }
 
diff --git a/src/backend/executor/nodeGatherMerge.c b/src/backend/executor/nodeGatherMerge.c
index 01a6e3a8553..15f84597067 100644
--- a/src/backend/executor/nodeGatherMerge.c
+++ b/src/backend/executor/nodeGatherMerge.c
@@ -735,7 +735,7 @@ gm_readnext_tuple(GatherMergeState *gm_state, int nreader, bool nowait,
 	 * Since we'll be buffering these across multiple calls, we need to make a
 	 * copy.
 	 */
-	return tup ? heap_copy_minimal_tuple(tup) : NULL;
+	return tup ? heap_copy_minimal_tuple(tup, 0) : NULL;
 }
 
 /*
diff --git a/src/backend/utils/sort/tuplesortvariants.c b/src/backend/utils/sort/tuplesortvariants.c
index 913c4ef455e..03bd3e0d2f1 100644
--- a/src/backend/utils/sort/tuplesortvariants.c
+++ b/src/backend/utils/sort/tuplesortvariants.c
@@ -892,7 +892,7 @@ tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy,
 			*abbrev = stup.datum1;
 
 		if (copy)
-			stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple);
+			stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple, 0);
 
 		ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, copy);
 		return true;
diff --git a/src/backend/utils/sort/tuplestore.c b/src/backend/utils/sort/tuplestore.c
index d61b601053c..c9aecab8d66 100644
--- a/src/backend/utils/sort/tuplestore.c
+++ b/src/backend/utils/sort/tuplestore.c
@@ -787,7 +787,7 @@ tuplestore_putvalues(Tuplestorestate *state, TupleDesc tdesc,
 	MinimalTuple tuple;
 	MemoryContext oldcxt = MemoryContextSwitchTo(state->context);
 
-	tuple = heap_form_minimal_tuple(tdesc, values, isnull);
+	tuple = heap_form_minimal_tuple(tdesc, values, isnull, 0);
 	USEMEM(state, GetMemoryChunkSpace(tuple));
 
 	tuplestore_puttuple_common(state, tuple);
@@ -1139,7 +1139,7 @@ tuplestore_gettupleslot(Tuplestorestate *state, bool forward,
 	{
 		if (copy && !should_free)
 		{
-			tuple = heap_copy_minimal_tuple(tuple);
+			tuple = heap_copy_minimal_tuple(tuple, 0);
 			should_free = true;
 		}
 		ExecStoreMinimalTuple(tuple, slot, should_free);
@@ -1590,7 +1590,7 @@ copytup_heap(Tuplestorestate *state, void *tup)
 {
 	MinimalTuple tuple;
 
-	tuple = minimal_tuple_from_heap_tuple((HeapTuple) tup);
+	tuple = minimal_tuple_from_heap_tuple((HeapTuple) tup, 0);
 	USEMEM(state, GetMemoryChunkSpace(tuple));
 	return tuple;
 }
diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h
index 6cd4b95bfdb..aa957cf3b01 100644
--- a/src/include/access/htup_details.h
+++ b/src/include/access/htup_details.h
@@ -839,11 +839,12 @@ extern void heap_deform_tuple(HeapTuple tuple, TupleDesc tupleDesc,
 							  Datum *values, bool *isnull);
 extern void heap_freetuple(HeapTuple htup);
 extern MinimalTuple heap_form_minimal_tuple(TupleDesc tupleDescriptor,
-											const Datum *values, const bool *isnull);
+											const Datum *values, const bool *isnull,
+											Size extra);
 extern void heap_free_minimal_tuple(MinimalTuple mtup);
-extern MinimalTuple heap_copy_minimal_tuple(MinimalTuple mtup);
+extern MinimalTuple heap_copy_minimal_tuple(MinimalTuple mtup, Size extra);
 extern HeapTuple heap_tuple_from_minimal_tuple(MinimalTuple mtup);
-extern MinimalTuple minimal_tuple_from_heap_tuple(HeapTuple htup);
+extern MinimalTuple minimal_tuple_from_heap_tuple(HeapTuple htup, Size extra);
 extern size_t varsize_any(void *p);
 extern HeapTuple heap_expand_tuple(HeapTuple sourceTuple, TupleDesc tupleDesc);
 extern MinimalTuple minimal_expand_tuple(HeapTuple sourceTuple, TupleDesc tupleDesc);
diff --git a/src/include/executor/tuptable.h b/src/include/executor/tuptable.h
index a044d78e4d0..b901fef41f0 100644
--- a/src/include/executor/tuptable.h
+++ b/src/include/executor/tuptable.h
@@ -218,8 +218,12 @@ struct TupleTableSlotOps
 	 * meaningful "system columns" in the copy. The copy is not be "owned" by
 	 * the slot i.e. the caller has to take responsibility to free memory
 	 * consumed by the slot.
+	 *
+	 * The copy has "extra" bytes (maxaligned and zeroed) available at the
+	 * end, which is useful so that some callers may store extra data along
+	 * with the minimal tuple without the need for an additional allocation.
 	 */
-	MinimalTuple (*copy_minimal_tuple) (TupleTableSlot *slot);
+	MinimalTuple (*copy_minimal_tuple) (TupleTableSlot *slot, Size extra);
 };
 
 /*
@@ -491,7 +495,18 @@ ExecCopySlotHeapTuple(TupleTableSlot *slot)
 static inline MinimalTuple
 ExecCopySlotMinimalTuple(TupleTableSlot *slot)
 {
-	return slot->tts_ops->copy_minimal_tuple(slot);
+	return slot->tts_ops->copy_minimal_tuple(slot, 0);
+}
+
+/*
+ * ExecCopySlotMinimalTupleExtra - return MinimalTuple allocated in caller's
+ * context, with extra bytes (maxaligned and zeroed) after the end of the
+ * tuple.
+ */
+static inline MinimalTuple
+ExecCopySlotMinimalTupleExtra(TupleTableSlot *slot, Size extra)
+{
+	return slot->tts_ops->copy_minimal_tuple(slot, extra);
 }
 
 /*
-- 
2.34.1

From 449563fcdbaad76fa0841cdb82ce4e4f55d75163 Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Tue, 14 Jan 2025 16:17:00 -0800
Subject: [PATCH v6 3/7] Store additional data after mintuple

---
 src/backend/executor/execGrouping.c | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index 6763c35457c..90833216e3b 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -23,7 +23,6 @@
 struct TupleHashEntryData
 {
 	MinimalTuple firstTuple;	/* copy of first tuple in this group */
-	void	   *additional;		/* user data */
 	uint32		status;			/* hash status */
 	uint32		hash;			/* hash value (cached) */
 };
@@ -302,15 +301,16 @@ TupleHashEntryGetTuple(TupleHashEntry entry)
 
 /*
  * Get a pointer into the additional space allocated for this entry. The
- * amount of space available is the additionalsize specified to
- * BuildTupleHashTable(). If additionalsize was specified as zero, no
+ * memory will be maxaligned and zeroed.
+ *
+ * The amount of space available is the additionalsize requested in the call
+ * to BuildTupleHashTable(). If additionalsize was specified as zero, no
  * additional space is available and this function should not be called.
  */
 void *
 TupleHashEntryGetAdditional(TupleHashEntry entry)
 {
-	Assert(entry->additional != NULL);
-	return entry->additional;
+	return (char *) entry->firstTuple + MAXALIGN(entry->firstTuple->t_len);
 }
 
 /*
@@ -522,13 +522,17 @@ LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot,
 
 			MemoryContextSwitchTo(hashtable->tablecxt);
 
-			/* Copy the first tuple into the table context */
-			entry->firstTuple = ExecCopySlotMinimalTuple(slot);
-
-			if (hashtable->additionalsize > 0)
-				entry->additional = palloc0(hashtable->additionalsize);
-			else
-				entry->additional = NULL;
+			/*
+			 * Copy the first tuple into the table context, and request
+			 * additionalsize extra bytes at the end of the allocation.
+			 *
+			 * The caller can get a pointer to this data with
+			 * TupleHashEntryGetAdditional(), and store arbitrary data there.
+			 * This avoids the need to store an extra pointer or allocate an
+			 * additional chunk, which would waste memory.
+			 */
+			entry->firstTuple = ExecCopySlotMinimalTupleExtra(slot,
+															  hashtable->additionalsize);
 		}
 	}
 	else
-- 
2.34.1

From 2c2506f14adc51625b577ececef36b6feb159a22 Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Fri, 15 Nov 2024 12:21:04 -0800
Subject: [PATCH v6 4/7] simplehash: don't require a "status" field.

Callers may define SH_ENTRY_IS_EMPTY(), SH_ENTRY_SET_EMPTY(), and
SH_ENTRY_SET_IN_USE() as an alternative, which can allow for a more
compact entry size. That reduces the memory overhead, especially when
the hash table is sparse.
---
 src/include/lib/simplehash.h | 68 ++++++++++++++++++++++++------------
 1 file changed, 45 insertions(+), 23 deletions(-)

diff --git a/src/include/lib/simplehash.h b/src/include/lib/simplehash.h
index 327274c2340..0b9b0cdfde0 100644
--- a/src/include/lib/simplehash.h
+++ b/src/include/lib/simplehash.h
@@ -50,9 +50,18 @@
  *	  - SH_HASH_KEY(table, key) - generate hash for the key
  *	  - SH_STORE_HASH - if defined the hash is stored in the elements
  *	  - SH_GET_HASH(tb, a) - return the field to store the hash in
+ *	  - SH_ENTRY_IS_EMPTY(tb, a) - return true if the entry is empty
+ *	  - SH_ENTRY_SET_EMPTY(tb, a) - set entry to empty
+ *	  - SH_ENTRY_SET_IN_USE(tb, a) - set entry to "in use"
  *
  *	  The element type is required to contain a "status" member that can store
- *	  the range of values defined in the SH_STATUS enum.
+ *	  the range of values defined in the SH_STATUS enum. Alternatively,
+ *	  callers may define all of SH_ENTRY_IS_EMPTY, SH_ENTRY_SET_EMPTY,
+ *	  SH_ENTRY_SET_IN_USE to allow for a more compact entry size. For example,
+ *	  if the SH_ELEMENT_TYPE contains a pointer, the caller may decide that
+ *	  the entry is empty if the pointer is NULL and "in use" if the pointer is
+ *	  non-NULL. NB: if the entire entry is zero, then SH_ENTRY_IS_EMPTY()
+ *	  *must* evaluate to true.
  *
  *	  While SH_STORE_HASH (and subsequently SH_GET_HASH) are optional, because
  *	  the hash table implementation needs to compare hashes to move elements
@@ -277,6 +286,18 @@ SH_SCOPE void SH_STAT(SH_TYPE * tb);
 #define SH_GROW_MIN_FILLFACTOR 0.1
 #endif
 
+#ifndef SH_ENTRY_IS_EMPTY
+#define SH_ENTRY_IS_EMPTY(tb, a) ((a)->status == SH_STATUS_EMPTY)
+#endif
+
+#ifndef SH_ENTRY_SET_EMPTY
+#define SH_ENTRY_SET_EMPTY(tb, a) do { (a)->status = SH_STATUS_EMPTY; } while (0)
+#endif
+
+#ifndef SH_ENTRY_SET_IN_USE
+#define SH_ENTRY_SET_IN_USE(tb, a) do { (a)->status = SH_STATUS_IN_USE; } while (0)
+#endif
+
 #ifdef SH_STORE_HASH
 #define SH_COMPARE_KEYS(tb, ahash, akey, b) (ahash == SH_GET_HASH(tb, b) && SH_EQUAL(tb, b->SH_KEY, akey))
 #else
@@ -540,7 +561,7 @@ SH_GROW(SH_TYPE * tb, uint64 newsize)
 		uint32		hash;
 		uint32		optimal;
 
-		if (oldentry->status != SH_STATUS_IN_USE)
+		if (SH_ENTRY_IS_EMPTY(tb, oldentry))
 		{
 			startelem = i;
 			break;
@@ -562,7 +583,7 @@ SH_GROW(SH_TYPE * tb, uint64 newsize)
 	{
 		SH_ELEMENT_TYPE *oldentry = &olddata[copyelem];
 
-		if (oldentry->status == SH_STATUS_IN_USE)
+		if (!SH_ENTRY_IS_EMPTY(tb, oldentry))
 		{
 			uint32		hash;
 			uint32		startelem2;
@@ -578,7 +599,7 @@ SH_GROW(SH_TYPE * tb, uint64 newsize)
 			{
 				newentry = &newdata[curelem];
 
-				if (newentry->status == SH_STATUS_EMPTY)
+				if (SH_ENTRY_IS_EMPTY(tb, newentry))
 				{
 					break;
 				}
@@ -649,14 +670,14 @@ restart:
 		SH_ELEMENT_TYPE *entry = &data[curelem];
 
 		/* any empty bucket can directly be used */
-		if (entry->status == SH_STATUS_EMPTY)
+		if (SH_ENTRY_IS_EMPTY(tb, entry))
 		{
 			tb->members++;
 			entry->SH_KEY = key;
 #ifdef SH_STORE_HASH
 			SH_GET_HASH(tb, entry) = hash;
 #endif
-			entry->status = SH_STATUS_IN_USE;
+			SH_ENTRY_SET_IN_USE(tb, entry);
 			*found = false;
 			return entry;
 		}
@@ -671,7 +692,7 @@ restart:
 
 		if (SH_COMPARE_KEYS(tb, hash, key, entry))
 		{
-			Assert(entry->status == SH_STATUS_IN_USE);
+			Assert(!SH_ENTRY_IS_EMPTY(tb, entry));
 			*found = true;
 			return entry;
 		}
@@ -695,7 +716,7 @@ restart:
 				emptyelem = SH_NEXT(tb, emptyelem, startelem);
 				emptyentry = &data[emptyelem];
 
-				if (emptyentry->status == SH_STATUS_EMPTY)
+				if (SH_ENTRY_IS_EMPTY(tb, emptyentry))
 				{
 					lastentry = emptyentry;
 					break;
@@ -744,7 +765,7 @@ restart:
 #ifdef SH_STORE_HASH
 			SH_GET_HASH(tb, entry) = hash;
 #endif
-			entry->status = SH_STATUS_IN_USE;
+			SH_ENTRY_SET_IN_USE(tb, entry);
 			*found = false;
 			return entry;
 		}
@@ -806,13 +827,11 @@ SH_LOOKUP_HASH_INTERNAL(SH_TYPE * tb, SH_KEY_TYPE key, uint32 hash)
 	{
 		SH_ELEMENT_TYPE *entry = &tb->data[curelem];
 
-		if (entry->status == SH_STATUS_EMPTY)
+		if (SH_ENTRY_IS_EMPTY(tb, entry))
 		{
 			return NULL;
 		}
 
-		Assert(entry->status == SH_STATUS_IN_USE);
-
 		if (SH_COMPARE_KEYS(tb, hash, key, entry))
 			return entry;
 
@@ -864,10 +883,10 @@ SH_DELETE(SH_TYPE * tb, SH_KEY_TYPE key)
 	{
 		SH_ELEMENT_TYPE *entry = &tb->data[curelem];
 
-		if (entry->status == SH_STATUS_EMPTY)
+		if (SH_ENTRY_IS_EMPTY(tb, entry))
 			return false;
 
-		if (entry->status == SH_STATUS_IN_USE &&
+		if (!SH_ENTRY_IS_EMPTY(tb, entry) &&
 			SH_COMPARE_KEYS(tb, hash, key, entry))
 		{
 			SH_ELEMENT_TYPE *lastentry = entry;
@@ -890,9 +909,9 @@ SH_DELETE(SH_TYPE * tb, SH_KEY_TYPE key)
 				curelem = SH_NEXT(tb, curelem, startelem);
 				curentry = &tb->data[curelem];
 
-				if (curentry->status != SH_STATUS_IN_USE)
+				if (SH_ENTRY_IS_EMPTY(tb, curentry))
 				{
-					lastentry->status = SH_STATUS_EMPTY;
+					SH_ENTRY_SET_EMPTY(tb, lastentry);
 					break;
 				}
 
@@ -902,7 +921,7 @@ SH_DELETE(SH_TYPE * tb, SH_KEY_TYPE key)
 				/* current is at optimal position, done */
 				if (curoptimal == curelem)
 				{
-					lastentry->status = SH_STATUS_EMPTY;
+					SH_ENTRY_SET_EMPTY(tb, lastentry);
 					break;
 				}
 
@@ -953,9 +972,9 @@ SH_DELETE_ITEM(SH_TYPE * tb, SH_ELEMENT_TYPE * entry)
 		curelem = SH_NEXT(tb, curelem, startelem);
 		curentry = &tb->data[curelem];
 
-		if (curentry->status != SH_STATUS_IN_USE)
+		if (SH_ENTRY_IS_EMPTY(tb, curentry))
 		{
-			lastentry->status = SH_STATUS_EMPTY;
+			SH_ENTRY_SET_EMPTY(tb, lastentry);
 			break;
 		}
 
@@ -965,7 +984,7 @@ SH_DELETE_ITEM(SH_TYPE * tb, SH_ELEMENT_TYPE * entry)
 		/* current is at optimal position, done */
 		if (curoptimal == curelem)
 		{
-			lastentry->status = SH_STATUS_EMPTY;
+			SH_ENTRY_SET_EMPTY(tb, lastentry);
 			break;
 		}
 
@@ -993,7 +1012,7 @@ SH_START_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter)
 	{
 		SH_ELEMENT_TYPE *entry = &tb->data[i];
 
-		if (entry->status != SH_STATUS_IN_USE)
+		if (SH_ENTRY_IS_EMPTY(tb, entry))
 		{
 			startelem = i;
 			break;
@@ -1055,7 +1074,7 @@ SH_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter)
 
 		if ((iter->cur & tb->sizemask) == (iter->end & tb->sizemask))
 			iter->done = true;
-		if (elem->status == SH_STATUS_IN_USE)
+		if (!SH_ENTRY_IS_EMPTY(tb, elem))
 		{
 			return elem;
 		}
@@ -1091,7 +1110,7 @@ SH_STAT(SH_TYPE * tb)
 
 		elem = &tb->data[i];
 
-		if (elem->status != SH_STATUS_IN_USE)
+		if (SH_ENTRY_IS_EMPTY(tb, elem))
 			continue;
 
 		hash = SH_ENTRY_HASH(tb, elem);
@@ -1149,6 +1168,9 @@ SH_STAT(SH_TYPE * tb)
 #undef SH_KEY
 #undef SH_ELEMENT_TYPE
 #undef SH_HASH_KEY
+#undef SH_ENTRY_IS_EMPTY
+#undef SH_ENTRY_SET_EMPTY
+#undef SH_ENTRY_SET_IN_USE
 #undef SH_SCOPE
 #undef SH_DECLARE
 #undef SH_DEFINE
-- 
2.34.1

From c7bcdc8493137b15b3b8f7b9f9bbfe8b5e640e9e Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Fri, 15 Nov 2024 13:03:32 -0800
Subject: [PATCH v6 5/7] TupleHashTable: reduce overhead by removing "status"
 field.

Make use of the new simplehash API introduced in commit 1234567890.

Because a NULL firstTuple now means that the entry is empty, use a
unique dummy pointer to refer to the inputslot instead. This also
makes the code more readable.
---
 src/backend/executor/execGrouping.c | 33 +++++++++++++++++++----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index 90833216e3b..7f41b03324f 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -23,7 +23,6 @@
 struct TupleHashEntryData
 {
 	MinimalTuple firstTuple;	/* copy of first tuple in this group */
-	uint32		status;			/* hash status */
 	uint32		hash;			/* hash value (cached) */
 };
 
@@ -34,6 +33,14 @@ static inline TupleHashEntry LookupTupleHashEntry_internal(TupleHashTable hashta
 														   TupleTableSlot *slot,
 														   bool *isnew, uint32 hash);
 
+/*
+ * Create a dummy pointer used to mean the input slot. The pointer will only
+ * be used for comparison and never dereferenced, and must not be equal to a
+ * valid pointer. Distinct from NULL, because NULL means that the hash entry
+ * itself is empty.
+ */
+const static MinimalTuple FIRSTTUPLE_INPUTSLOT = (MinimalTuple) 0x1;
+
 /*
  * Define parameters for tuple hash table code generation. The interface is
  * *also* declared in execnodes.h (to generate the types, which are externally
@@ -48,6 +55,9 @@ static inline TupleHashEntry LookupTupleHashEntry_internal(TupleHashTable hashta
 #define SH_SCOPE extern
 #define SH_STORE_HASH
 #define SH_GET_HASH(tb, a) a->hash
+#define SH_ENTRY_IS_EMPTY(tb, a) ((a)->firstTuple == NULL)
+#define SH_ENTRY_SET_EMPTY(tb, a) do { (a)->firstTuple = NULL; } while (0)
+#define SH_ENTRY_SET_IN_USE(tb, a) Assert((a)->firstTuple != NULL)
 #define SH_DEFINE
 #include "lib/simplehash.h"
 
@@ -344,7 +354,7 @@ LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
 	hashtable->in_hash_expr = hashtable->tab_hash_expr;
 	hashtable->cur_eq_func = hashtable->tab_eq_func;
 
-	local_hash = TupleHashTableHash_internal(hashtable->hashtab, NULL);
+	local_hash = TupleHashTableHash_internal(hashtable->hashtab, FIRSTTUPLE_INPUTSLOT);
 	entry = LookupTupleHashEntry_internal(hashtable, slot, isnew, local_hash);
 
 	if (hash != NULL)
@@ -372,7 +382,8 @@ TupleHashTableHash(TupleHashTable hashtable, TupleTableSlot *slot)
 	/* Need to run the hash functions in short-lived context */
 	oldContext = MemoryContextSwitchTo(hashtable->tempcxt);
 
-	hash = TupleHashTableHash_internal(hashtable->hashtab, NULL);
+	hash = TupleHashTableHash_internal(hashtable->hashtab,
+									   FIRSTTUPLE_INPUTSLOT);
 
 	MemoryContextSwitchTo(oldContext);
 
@@ -433,7 +444,7 @@ FindTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
 	hashtable->cur_eq_func = eqcomp;
 
 	/* Search the hash table */
-	key = NULL;					/* flag to reference inputslot */
+	key = FIRSTTUPLE_INPUTSLOT;
 	entry = tuplehash_lookup(hashtable->hashtab, key);
 	MemoryContextSwitchTo(oldContext);
 
@@ -441,9 +452,9 @@ FindTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
 }
 
 /*
- * If tuple is NULL, use the input slot instead. This convention avoids the
- * need to materialize virtual input tuples unless they actually need to get
- * copied into the table.
+ * If tuple is FIRSTTUPLE_INPUTSLOT, use the input slot instead. This
+ * convention avoids the need to materialize virtual input tuples unless they
+ * actually need to get copied into the table.
  *
  * Also, the caller must select an appropriate memory context for running
  * the hash functions.
@@ -457,7 +468,7 @@ TupleHashTableHash_internal(struct tuplehash_hash *tb,
 	TupleTableSlot *slot;
 	bool		isnull;
 
-	if (tuple == NULL)
+	if (tuple == FIRSTTUPLE_INPUTSLOT)
 	{
 		/* Process the current input tuple for the table */
 		hashtable->exprcontext->ecxt_innertuple = hashtable->inputslot;
@@ -504,7 +515,7 @@ LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot,
 	bool		found;
 	MinimalTuple key;
 
-	key = NULL;					/* flag to reference inputslot */
+	key = FIRSTTUPLE_INPUTSLOT;
 
 	if (isnew)
 	{
@@ -560,10 +571,10 @@ TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const
 	 * LookupTupleHashEntry's dummy TupleHashEntryData.  The other direction
 	 * could be supported too, but is not currently required.
 	 */
-	Assert(tuple1 != NULL);
+	Assert(tuple1 && tuple1 != FIRSTTUPLE_INPUTSLOT);
 	slot1 = hashtable->tableslot;
 	ExecStoreMinimalTuple(tuple1, slot1, false);
-	Assert(tuple2 == NULL);
+	Assert(tuple2 == FIRSTTUPLE_INPUTSLOT);
 	slot2 = hashtable->inputslot;
 
 	/* For crosstype comparisons, the inputslot must be first */
-- 
2.34.1

From 64b049cd850c2e9c60574def618967eb7f0c0343 Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Fri, 15 Nov 2024 18:52:06 -0800
Subject: [PATCH v6 6/7] Pack TupleHashEntryData and AggStatePerGroupData.

---
 src/backend/executor/execGrouping.c | 6 +++++-
 src/include/executor/nodeAgg.h      | 6 +++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index 7f41b03324f..cea28ce4bd7 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -24,7 +24,11 @@ struct TupleHashEntryData
 {
 	MinimalTuple firstTuple;	/* copy of first tuple in this group */
 	uint32		hash;			/* hash value (cached) */
-};
+}
+#ifdef pg_attribute_packed
+			pg_attribute_packed()
+#endif
+		   ;
 
 static int	TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2);
 static inline uint32 TupleHashTableHash_internal(struct tuplehash_hash *tb,
diff --git a/src/include/executor/nodeAgg.h b/src/include/executor/nodeAgg.h
index 34b82d0f5d1..cb756a912a6 100644
--- a/src/include/executor/nodeAgg.h
+++ b/src/include/executor/nodeAgg.h
@@ -264,7 +264,11 @@ typedef struct AggStatePerGroupData
 	 * NULL and not auto-replace it with a later input value. Only the first
 	 * non-NULL input will be auto-substituted.
 	 */
-} AggStatePerGroupData;
+}
+#ifdef pg_attribute_packed
+			pg_attribute_packed()
+#endif
+AggStatePerGroupData;
 
 /*
  * AggStatePerPhaseData - per-grouping-set-phase state
-- 
2.34.1

From 7e324de479998b8b773185db946e8788bc6a9201 Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Wed, 8 Jan 2025 11:46:35 -0800
Subject: [PATCH v6 7/7] HashAgg: use Bump allocator for hash TupleHashTable
 entries.

The entries aren't freed until the entire hash table is destroyed, so
use the Bump allocator to improve allocation speed, avoid wasting
space on the chunk header, and avoid wasting space due to the
power-of-two allocations.

Discussion: https://postgr.es/m/caaphdvqv1anb4cm36fzrwivxrevbo_lsg_eq3nqdxtjecaa...@mail.gmail.com
Reviewed-by: David Rowley
---
 src/backend/executor/nodeAgg.c             | 110 +++++++++++++++++----
 src/include/nodes/execnodes.h              |   5 +-
 src/test/regress/expected/groupingsets.out |   2 +-
 src/test/regress/sql/groupingsets.sql      |   2 +-
 4 files changed, 97 insertions(+), 22 deletions(-)

diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 19640752967..d5bed9ec48e 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -405,6 +405,7 @@ static void build_hash_tables(AggState *aggstate);
 static void build_hash_table(AggState *aggstate, int setno, long nbuckets);
 static void hashagg_recompile_expressions(AggState *aggstate, bool minslot,
 										  bool nullcheck);
+static void hash_create_memory(AggState *aggstate);
 static long hash_choose_num_buckets(double hashentrysize,
 									long ngroups, Size memory);
 static int	hash_choose_num_partitions(double input_groups,
@@ -1503,7 +1504,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
 {
 	AggStatePerHash perhash = &aggstate->perhash[setno];
 	MemoryContext metacxt = aggstate->hash_metacxt;
-	MemoryContext hashcxt = aggstate->hashcontext->ecxt_per_tuple_memory;
+	MemoryContext tablecxt = aggstate->hash_tablecxt;
 	MemoryContext tmpcxt = aggstate->tmpcontext->ecxt_per_tuple_memory;
 	Size		additionalsize;
 
@@ -1529,7 +1530,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
 											 nbuckets,
 											 additionalsize,
 											 metacxt,
-											 hashcxt,
+											 tablecxt,
 											 tmpcxt,
 											 DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit));
 }
@@ -1700,15 +1701,24 @@ hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace)
 							 tupleWidth);
 	Size		pergroupSize = numTrans * sizeof(AggStatePerGroupData);
 
-	tupleChunkSize = CHUNKHDRSZ + tupleSize;
-
-	if (pergroupSize > 0)
-		pergroupChunkSize = CHUNKHDRSZ + pergroupSize;
-	else
-		pergroupChunkSize = 0;
+	/*
+	 * Entries use the Bump allocator, so the chunk sizes are the same as the
+	 * requested sizes.
+	 */
+	tupleChunkSize = tupleSize;
+	pergroupChunkSize = pergroupSize;
 
+	/*
+	 * Transition values use AllocSet, which has a chunk header and also uses
+	 * power-of-two allocations.
+	 */
 	if (transitionSpace > 0)
-		transitionChunkSize = CHUNKHDRSZ + transitionSpace;
+	{
+		Size		pow2 = (Size) 1 << my_log2(transitionSpace);
+
+		/* use Max to protect against overflow */
+		transitionChunkSize = Max(CHUNKHDRSZ + pow2, transitionSpace);
+	}
 	else
 		transitionChunkSize = 0;
 
@@ -1858,15 +1868,18 @@ hash_agg_check_limits(AggState *aggstate)
 	uint64		ngroups = aggstate->hash_ngroups_current;
 	Size		meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt,
 													 true);
-	Size		hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory,
-														true);
+	Size		entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt,
+													  true);
+	Size		tval_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory,
+													 true);
+	Size		total_mem = meta_mem + entry_mem + tval_mem;
 
 	/*
 	 * Don't spill unless there's at least one group in the hash table so we
 	 * can be sure to make progress even in edge cases.
 	 */
 	if (aggstate->hash_ngroups_current > 0 &&
-		(meta_mem + hashkey_mem > aggstate->hash_mem_limit ||
+		(total_mem > aggstate->hash_mem_limit ||
 		 ngroups > aggstate->hash_ngroups_limit))
 	{
 		hash_agg_enter_spill_mode(aggstate);
@@ -1917,6 +1930,7 @@ static void
 hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
 {
 	Size		meta_mem;
+	Size		entry_mem;
 	Size		hashkey_mem;
 	Size		buffer_mem;
 	Size		total_mem;
@@ -1928,7 +1942,10 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
 	/* memory for the hash table itself */
 	meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true);
 
-	/* memory for the group keys and transition states */
+	/* memory for hash entries */
+	entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt, true);
+
+	/* memory for byref transition states */
 	hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, true);
 
 	/* memory for read/write tape buffers, if spilled */
@@ -1937,7 +1954,7 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
 		buffer_mem += HASHAGG_READ_BUFFER_SIZE;
 
 	/* update peak mem */
-	total_mem = meta_mem + hashkey_mem + buffer_mem;
+	total_mem = meta_mem + entry_mem + hashkey_mem + buffer_mem;
 	if (total_mem > aggstate->hash_mem_peak)
 		aggstate->hash_mem_peak = total_mem;
 
@@ -1959,6 +1976,58 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
 	}
 }
 
+/*
+ * Create memory contexts used for hash aggregation.
+ */
+static void
+hash_create_memory(AggState *aggstate)
+{
+	Size		minContextSize = ALLOCSET_DEFAULT_MINSIZE;
+	Size		initBlockSize = ALLOCSET_DEFAULT_INITSIZE;
+	Size		maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE;
+
+	/*
+	 * The hashcontext's per-tuple memory will be used for byref transition
+	 * values and returned by AggCheckCallContext().
+	 */
+	aggstate->hashcontext = CreateWorkExprContext(aggstate->ss.ps.state);
+
+	/*
+	 * The meta context will be used for the bucket array of
+	 * TupleHashEntryData (or arrays, in the case of grouping sets). As the
+	 * hash table grows, the bucket array will double in size and the old one
+	 * will be freed, so an AllocSet is appropriate. For large bucket arrays,
+	 * the large allocation path will be used, so it's not worth worrying
+	 * about wasting space due to power-of-two allocations.
+	 */
+	aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt,
+												   "HashAgg meta context",
+												   ALLOCSET_DEFAULT_SIZES);
+
+	/*
+	 * The hash entries themselves, which include the grouping key
+	 * (firstTuple) and pergroup data, are stored in the table context. The
+	 * bump allocator can be used because the entries are not freed until the
+	 * entire hash table is reset. The bump allocator is faster for
+	 * allocations and avoids wasting space on the chunk header or
+	 * power-of-two allocations.
+	 *
+	 * Like CreateWorkExprContext(), use smaller sizings for smaller work_mem,
+	 * to avoid large jumps in memory usage.
+	 */
+	while (16 * maxBlockSize > work_mem * 1024L)
+		maxBlockSize >>= 1;
+
+	if (maxBlockSize < ALLOCSET_DEFAULT_INITSIZE)
+		maxBlockSize = ALLOCSET_DEFAULT_INITSIZE;
+
+	aggstate->hash_tablecxt = BumpContextCreate(aggstate->ss.ps.state->es_query_cxt,
+												"HashAgg table context",
+												minContextSize, initBlockSize,
+												maxBlockSize);
+
+}
+
 /*
  * Choose a reasonable number of buckets for the initial hash table size.
  */
@@ -2618,6 +2687,7 @@ agg_refill_hash_table(AggState *aggstate)
 
 	/* free memory and reset hash tables */
 	ReScanExprContext(aggstate->hashcontext);
+	MemoryContextReset(aggstate->hash_tablecxt);
 	for (int setno = 0; setno < aggstate->num_hashes; setno++)
 		ResetTupleHashTable(aggstate->perhash[setno].hashtable);
 
@@ -3285,7 +3355,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 	}
 
 	if (use_hashing)
-		aggstate->hashcontext = CreateWorkExprContext(estate);
+		hash_create_memory(aggstate);
 
 	ExecAssignExprContext(estate, &aggstate->ss.ps);
 
@@ -3580,9 +3650,6 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 		Plan	   *outerplan = outerPlan(node);
 		uint64		totalGroups = 0;
 
-		aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt,
-													   "HashAgg meta context",
-													   ALLOCSET_DEFAULT_SIZES);
 		aggstate->hash_spill_rslot = ExecInitExtraTupleSlot(estate, scanDesc,
 															&TTSOpsMinimalTuple);
 		aggstate->hash_spill_wslot = ExecInitExtraTupleSlot(estate, scanDesc,
@@ -4327,6 +4394,12 @@ ExecEndAgg(AggState *node)
 		MemoryContextDelete(node->hash_metacxt);
 		node->hash_metacxt = NULL;
 	}
+	if (node->hash_tablecxt != NULL)
+	{
+		MemoryContextDelete(node->hash_tablecxt);
+		node->hash_tablecxt = NULL;
+	}
+
 
 	for (transno = 0; transno < node->numtrans; transno++)
 	{
@@ -4443,6 +4516,7 @@ ExecReScanAgg(AggState *node)
 		node->hash_ngroups_current = 0;
 
 		ReScanExprContext(node->hashcontext);
+		MemoryContextReset(node->hash_tablecxt);
 		/* Rebuild an empty hash table */
 		build_hash_tables(node);
 		node->table_filled = false;
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index d46ce9b4996..999d06aa570 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -2558,7 +2558,8 @@ typedef struct AggState
 	/* these fields are used in AGG_HASHED and AGG_MIXED modes: */
 	bool		table_filled;	/* hash table filled yet? */
 	int			num_hashes;
-	MemoryContext hash_metacxt; /* memory for hash table itself */
+	MemoryContext hash_metacxt; /* memory for hash table bucket array */
+	MemoryContext hash_tablecxt;	/* memory for hash table entries */
 	struct LogicalTapeSet *hash_tapeset;	/* tape set for hash spill tapes */
 	struct HashAggSpill *hash_spills;	/* HashAggSpill for each grouping set,
 										 * exists only during first pass */
@@ -2584,7 +2585,7 @@ typedef struct AggState
 										 * per-group pointers */
 
 	/* support for evaluation of agg input expressions: */
-#define FIELDNO_AGGSTATE_ALL_PERGROUPS 53
+#define FIELDNO_AGGSTATE_ALL_PERGROUPS 54
 	AggStatePerGroup *all_pergroups;	/* array of first ->pergroups, than
 										 * ->hash_pergroup */
 	SharedAggInfo *shared_info; /* one entry per worker */
diff --git a/src/test/regress/expected/groupingsets.out b/src/test/regress/expected/groupingsets.out
index d7c9b44605d..0e86e31fe90 100644
--- a/src/test/regress/expected/groupingsets.out
+++ b/src/test/regress/expected/groupingsets.out
@@ -1711,7 +1711,7 @@ explain (costs off)
          ->  Seq Scan on tenk1
 (9 rows)
 
-set work_mem = '384kB';
+set work_mem = '256kB';
 explain (costs off)
   select unique1,
          count(two), count(four), count(ten),
diff --git a/src/test/regress/sql/groupingsets.sql b/src/test/regress/sql/groupingsets.sql
index 21cd3121940..10f3c36d5c6 100644
--- a/src/test/regress/sql/groupingsets.sql
+++ b/src/test/regress/sql/groupingsets.sql
@@ -467,7 +467,7 @@ explain (costs off)
          count(*)
     from tenk1 group by grouping sets (unique1,hundred,ten,four,two);
 
-set work_mem = '384kB';
+set work_mem = '256kB';
 explain (costs off)
   select unique1,
          count(two), count(four), count(ten),
-- 
2.34.1
