On Tue, 2025-03-04 at 17:28 -0800, Jeff Davis wrote:
> My results (with wide tables):
> 
>                         GROUP BY      EXCEPT
>    master:                  2151        1732
>    entire v8 series:        2054        1740

I'm not sure what I did with the EXCEPT test above; perhaps I filled the
test table twice by mistake or something.

New numbers below:

Data:

  create table c (a int not null);
  insert into c select a from generate_series(1,1000) a;

  create table big(i int, j int);
  insert into big select g, 1 from generate_series(1,10000000) g;

  create table wide (t text not null);
  insert into wide
    select repeat(md5(a::text),10)
    from generate_series(1,1000) a;

  create table wide_copy (t text not null);
  insert into wide_copy select * from wide;

  vacuum freeze analyze;
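
The attached scripts aren't reproduced inline, so for reference the
queries are roughly of this shape (my sketch based on the table
definitions above, not the exact attachment contents):

  -- c.sql / wide.sql: hash aggregation over the small tables
  select a, count(*) from c group by a;
  select t, count(*) from wide group by t;

  -- except.sql: hash-based set operation on the wide tables
  select t from wide except select t from wide_copy;

  -- big.sql: hash aggregation over 10 million integer rows
  select i, count(*) from big group by i;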


Results:

              c(tps) wide(tps) except(tps)  big(ms)  big(MiB)

91ecb5e0bc     4768      2091        2317     3853       768
master         4942      2063        2323     3831       768
0001           4932      2038        2361     3712       616
0002           4601      2034        2365     3753       616
0003           4850      2040        2418     3749       616
0004           4744      2065        2282     3665       488
0005           4630      1994        2307     3665       488
0006           4809      2063        2394     3646       488
0007           4752      2070        2479     3659       488

Note: c.sql, wide.sql, and except.sql are measured in tps (higher is
better); big.sql is measured in ms and MiB (lower is better).
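
For the big.sql memory column, one way to get a comparable figure (a
sketch of the method, not necessarily what the attached script does) is
the "Memory Usage" that EXPLAIN ANALYZE reports for the HashAggregate
node, with work_mem set high enough to avoid spilling:

  explain (analyze, costs off)
    select i, count(*) from big group by i;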

For some reason I'm getting a decline of about 3% in the c.sql test that
seems to be associated with the accessor functions, even when they are
inlined. I'm also not seeing as much benefit from inlining
MemoryContextMemAllocated(). But these mixed results are minor compared
with the 35% memory savings and the more consistent 5% performance
improvement on the larger test (which is also on integers), so I plan to
commit it.

Regards,
        Jeff Davis

From f19e8363d52755d54d31f34778c841f97e6edfe2 Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Wed, 8 Jan 2025 11:46:35 -0800
Subject: [PATCH v9 1/7] HashAgg: use Bump allocator for hash TupleHashTable
 entries.

The entries aren't freed until the entire hash table is destroyed, so
use the Bump allocator to improve allocation speed, avoid wasting
space on the chunk header, and avoid wasting space due to the
power-of-two allocations.

Discussion: https://postgr.es/m/caaphdvqv1anb4cm36fzrwivxrevbo_lsg_eq3nqdxtjecaa...@mail.gmail.com
Reviewed-by: David Rowley
---
 src/backend/executor/execUtils.c |  17 +++--
 src/backend/executor/nodeAgg.c   | 111 ++++++++++++++++++++++++++-----
 src/include/nodes/execnodes.h    |   5 +-
 3 files changed, 104 insertions(+), 29 deletions(-)

diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c
index 55ab18fb826..772c86e70e9 100644
--- a/src/backend/executor/execUtils.c
+++ b/src/backend/executor/execUtils.c
@@ -322,19 +322,18 @@ CreateExprContext(EState *estate)
 ExprContext *
 CreateWorkExprContext(EState *estate)
 {
-	Size		minContextSize = ALLOCSET_DEFAULT_MINSIZE;
-	Size		initBlockSize = ALLOCSET_DEFAULT_INITSIZE;
 	Size		maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE;
 
-	/* choose the maxBlockSize to be no larger than 1/16 of work_mem */
-	while (maxBlockSize > work_mem * (Size) 1024 / 16)
-		maxBlockSize >>= 1;
+	maxBlockSize = pg_prevpower2_size_t(work_mem * (Size) 1024 / 16);
 
-	if (maxBlockSize < ALLOCSET_DEFAULT_INITSIZE)
-		maxBlockSize = ALLOCSET_DEFAULT_INITSIZE;
+	/* But no bigger than ALLOCSET_DEFAULT_MAXSIZE */
+	maxBlockSize = Min(maxBlockSize, ALLOCSET_DEFAULT_MAXSIZE);
 
-	return CreateExprContextInternal(estate, minContextSize,
-									 initBlockSize, maxBlockSize);
+	/* and no smaller than ALLOCSET_DEFAULT_INITSIZE */
+	maxBlockSize = Max(maxBlockSize, ALLOCSET_DEFAULT_INITSIZE);
+
+	return CreateExprContextInternal(estate, ALLOCSET_DEFAULT_MINSIZE,
+									 ALLOCSET_DEFAULT_INITSIZE, maxBlockSize);
 }
 
 /* ----------------
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index b4a7698a0b3..fb73e6f991d 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -406,6 +406,7 @@ static void build_hash_tables(AggState *aggstate);
 static void build_hash_table(AggState *aggstate, int setno, long nbuckets);
 static void hashagg_recompile_expressions(AggState *aggstate, bool minslot,
 										  bool nullcheck);
+static void hash_create_memory(AggState *aggstate);
 static long hash_choose_num_buckets(double hashentrysize,
 									long ngroups, Size memory);
 static int	hash_choose_num_partitions(double input_groups,
@@ -1509,7 +1510,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
 {
 	AggStatePerHash perhash = &aggstate->perhash[setno];
 	MemoryContext metacxt = aggstate->hash_metacxt;
-	MemoryContext hashcxt = aggstate->hashcontext->ecxt_per_tuple_memory;
+	MemoryContext tablecxt = aggstate->hash_tablecxt;
 	MemoryContext tmpcxt = aggstate->tmpcontext->ecxt_per_tuple_memory;
 	Size		additionalsize;
 
@@ -1535,7 +1536,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
 											 nbuckets,
 											 additionalsize,
 											 metacxt,
-											 hashcxt,
+											 tablecxt,
 											 tmpcxt,
 											 DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit));
 }
@@ -1706,15 +1707,19 @@ hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace)
 							 tupleWidth);
 	Size		pergroupSize = numTrans * sizeof(AggStatePerGroupData);
 
-	tupleChunkSize = CHUNKHDRSZ + tupleSize;
-
-	if (pergroupSize > 0)
-		pergroupChunkSize = CHUNKHDRSZ + pergroupSize;
-	else
-		pergroupChunkSize = 0;
+	/*
+	 * Entries use the Bump allocator, so the chunk sizes are the same as the
+	 * requested sizes.
+	 */
+	tupleChunkSize = MAXALIGN(tupleSize);
+	pergroupChunkSize = pergroupSize;
 
+	/*
+	 * Transition values use AllocSet, which has a chunk header and also uses
+	 * power-of-two allocations.
+	 */
 	if (transitionSpace > 0)
-		transitionChunkSize = CHUNKHDRSZ + transitionSpace;
+		transitionChunkSize = pg_nextpower2_size_t(transitionSpace);
 	else
 		transitionChunkSize = 0;
 
@@ -1864,8 +1869,11 @@ hash_agg_check_limits(AggState *aggstate)
 	uint64		ngroups = aggstate->hash_ngroups_current;
 	Size		meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt,
 													 true);
-	Size		hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory,
-														true);
+	Size		entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt,
+													  true);
+	Size		tval_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory,
+													 true);
+	Size		total_mem = meta_mem + entry_mem + tval_mem;
 	bool		do_spill = false;
 
 #ifdef USE_INJECTION_POINTS
@@ -1884,7 +1892,7 @@ hash_agg_check_limits(AggState *aggstate)
 	 * can be sure to make progress even in edge cases.
 	 */
 	if (aggstate->hash_ngroups_current > 0 &&
-		(meta_mem + hashkey_mem > aggstate->hash_mem_limit ||
+		(total_mem > aggstate->hash_mem_limit ||
 		 ngroups > aggstate->hash_ngroups_limit))
 	{
 		do_spill = true;
@@ -1939,6 +1947,7 @@ static void
 hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
 {
 	Size		meta_mem;
+	Size		entry_mem;
 	Size		hashkey_mem;
 	Size		buffer_mem;
 	Size		total_mem;
@@ -1950,7 +1959,10 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
 	/* memory for the hash table itself */
 	meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true);
 
-	/* memory for the group keys and transition states */
+	/* memory for hash entries */
+	entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt, true);
+
+	/* memory for byref transition states */
 	hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, true);
 
 	/* memory for read/write tape buffers, if spilled */
@@ -1959,7 +1971,7 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
 		buffer_mem += HASHAGG_READ_BUFFER_SIZE;
 
 	/* update peak mem */
-	total_mem = meta_mem + hashkey_mem + buffer_mem;
+	total_mem = meta_mem + entry_mem + hashkey_mem + buffer_mem;
 	if (total_mem > aggstate->hash_mem_peak)
 		aggstate->hash_mem_peak = total_mem;
 
@@ -1981,6 +1993,64 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
 	}
 }
 
+/*
+ * Create memory contexts used for hash aggregation.
+ */
+static void
+hash_create_memory(AggState *aggstate)
+{
+	Size		maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE;
+
+	/*
+	 * The hashcontext's per-tuple memory will be used for byref transition
+	 * values and returned by AggCheckCallContext().
+	 */
+	aggstate->hashcontext = CreateWorkExprContext(aggstate->ss.ps.state);
+
+	/*
+	 * The meta context will be used for the bucket array of
+	 * TupleHashEntryData (or arrays, in the case of grouping sets). As the
+	 * hash table grows, the bucket array will double in size and the old one
+	 * will be freed, so an AllocSet is appropriate. For large bucket arrays,
+	 * the large allocation path will be used, so it's not worth worrying
+	 * about wasting space due to power-of-two allocations.
+	 */
+	aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt,
+												   "HashAgg meta context",
+												   ALLOCSET_DEFAULT_SIZES);
+
+	/*
+	 * The hash entries themselves, which include the grouping key
+	 * (firstTuple) and pergroup data, are stored in the table context. The
+	 * bump allocator can be used because the entries are not freed until the
+	 * entire hash table is reset. The bump allocator is faster for
+	 * allocations and avoids wasting space on the chunk header or
+	 * power-of-two allocations.
+	 *
+	 * Like CreateWorkExprContext(), use smaller sizings for smaller work_mem,
+	 * to avoid large jumps in memory usage.
+	 */
+
+	/*
+	 * Like CreateWorkExprContext(), use smaller sizings for smaller work_mem,
+	 * to avoid large jumps in memory usage.
+	 */
+	maxBlockSize = pg_prevpower2_size_t(work_mem * (Size) 1024 / 16);
+
+	/* But no bigger than ALLOCSET_DEFAULT_MAXSIZE */
+	maxBlockSize = Min(maxBlockSize, ALLOCSET_DEFAULT_MAXSIZE);
+
+	/* and no smaller than ALLOCSET_DEFAULT_INITSIZE */
+	maxBlockSize = Max(maxBlockSize, ALLOCSET_DEFAULT_INITSIZE);
+
+	aggstate->hash_tablecxt = BumpContextCreate(aggstate->ss.ps.state->es_query_cxt,
+												"HashAgg table context",
+												ALLOCSET_DEFAULT_MINSIZE,
+												ALLOCSET_DEFAULT_INITSIZE,
+												maxBlockSize);
+
+}
+
 /*
  * Choose a reasonable number of buckets for the initial hash table size.
  */
@@ -2642,6 +2712,7 @@ agg_refill_hash_table(AggState *aggstate)
 
 	/* free memory and reset hash tables */
 	ReScanExprContext(aggstate->hashcontext);
+	MemoryContextReset(aggstate->hash_tablecxt);
 	for (int setno = 0; setno < aggstate->num_hashes; setno++)
 		ResetTupleHashTable(aggstate->perhash[setno].hashtable);
 
@@ -3326,7 +3397,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 	}
 
 	if (use_hashing)
-		aggstate->hashcontext = CreateWorkExprContext(estate);
+		hash_create_memory(aggstate);
 
 	ExecAssignExprContext(estate, &aggstate->ss.ps);
 
@@ -3621,9 +3692,6 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 		Plan	   *outerplan = outerPlan(node);
 		uint64		totalGroups = 0;
 
-		aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt,
-													   "HashAgg meta context",
-													   ALLOCSET_DEFAULT_SIZES);
 		aggstate->hash_spill_rslot = ExecInitExtraTupleSlot(estate, scanDesc,
 															&TTSOpsMinimalTuple);
 		aggstate->hash_spill_wslot = ExecInitExtraTupleSlot(estate, scanDesc,
@@ -4368,6 +4436,12 @@ ExecEndAgg(AggState *node)
 		MemoryContextDelete(node->hash_metacxt);
 		node->hash_metacxt = NULL;
 	}
+	if (node->hash_tablecxt != NULL)
+	{
+		MemoryContextDelete(node->hash_tablecxt);
+		node->hash_tablecxt = NULL;
+	}
+
 
 	for (transno = 0; transno < node->numtrans; transno++)
 	{
@@ -4484,6 +4558,7 @@ ExecReScanAgg(AggState *node)
 		node->hash_ngroups_current = 0;
 
 		ReScanExprContext(node->hashcontext);
+		MemoryContextReset(node->hash_tablecxt);
 		/* Rebuild an empty hash table */
 		build_hash_tables(node);
 		node->table_filled = false;
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index d4d4e655180..b5539ddb41e 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -2560,7 +2560,8 @@ typedef struct AggState
 	/* these fields are used in AGG_HASHED and AGG_MIXED modes: */
 	bool		table_filled;	/* hash table filled yet? */
 	int			num_hashes;
-	MemoryContext hash_metacxt; /* memory for hash table itself */
+	MemoryContext hash_metacxt; /* memory for hash table bucket array */
+	MemoryContext hash_tablecxt;	/* memory for hash table entries */
 	struct LogicalTapeSet *hash_tapeset;	/* tape set for hash spill tapes */
 	struct HashAggSpill *hash_spills;	/* HashAggSpill for each grouping set,
 										 * exists only during first pass */
@@ -2586,7 +2587,7 @@ typedef struct AggState
 										 * per-group pointers */
 
 	/* support for evaluation of agg input expressions: */
-#define FIELDNO_AGGSTATE_ALL_PERGROUPS 53
+#define FIELDNO_AGGSTATE_ALL_PERGROUPS 54
 	AggStatePerGroup *all_pergroups;	/* array of first ->pergroups, than
 										 * ->hash_pergroup */
 	SharedAggInfo *shared_info; /* one entry per worker */
-- 
2.34.1

From 051a5574345af4d3d914dc314788f586d68e3bfc Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Mon, 13 Jan 2025 14:37:02 -0800
Subject: [PATCH v9 2/7] Create accessor functions for TupleHashEntry.

---
 src/backend/executor/execGrouping.c | 54 ++++++++++++++++++++++++++---
 src/backend/executor/nodeAgg.c      | 32 ++++++++---------
 src/backend/executor/nodeSetOp.c    | 33 ++++++++++++------
 src/backend/executor/nodeSubplan.c  |  2 +-
 src/include/executor/executor.h     |  4 +++
 src/include/nodes/execnodes.h       | 11 ++----
 6 files changed, 95 insertions(+), 41 deletions(-)

diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index 33b124fbb0a..4a90db94f8d 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -20,6 +20,14 @@
 #include "miscadmin.h"
 #include "utils/lsyscache.h"
 
+struct TupleHashEntryData
+{
+	MinimalTuple firstTuple;	/* copy of first tuple in this group */
+	void	   *additional;		/* user data */
+	uint32		status;			/* hash status */
+	uint32		hash;			/* hash value (cached) */
+};
+
 static int	TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2);
 static inline uint32 TupleHashTableHash_internal(struct tuplehash_hash *tb,
 												 const MinimalTuple tuple);
@@ -174,13 +182,15 @@ BuildTupleHashTable(PlanState *parent,
 					bool use_variable_hash_iv)
 {
 	TupleHashTable hashtable;
-	Size		entrysize = sizeof(TupleHashEntryData) + additionalsize;
+	Size		entrysize;
 	Size		hash_mem_limit;
 	MemoryContext oldcontext;
 	bool		allow_jit;
 	uint32		hash_iv = 0;
 
 	Assert(nbuckets > 0);
+	additionalsize = MAXALIGN(additionalsize);
+	entrysize = sizeof(TupleHashEntryData) + additionalsize;
 
 	/* Limit initial table size request to not more than hash_mem */
 	hash_mem_limit = get_hash_memory_limit() / entrysize;
@@ -196,6 +206,7 @@ BuildTupleHashTable(PlanState *parent,
 	hashtable->tab_collations = collations;
 	hashtable->tablecxt = tablecxt;
 	hashtable->tempcxt = tempcxt;
+	hashtable->additionalsize = additionalsize;
 	hashtable->tableslot = NULL;	/* will be made on first lookup */
 	hashtable->inputslot = NULL;
 	hashtable->in_hash_expr = NULL;
@@ -273,6 +284,38 @@ ResetTupleHashTable(TupleHashTable hashtable)
 	tuplehash_reset(hashtable->hashtab);
 }
 
+/*
+ * Return size of the hash bucket. Useful for estimating memory usage.
+ */
+size_t
+TupleHashEntrySize(void)
+{
+	return sizeof(TupleHashEntryData);
+}
+
+/*
+ * Return tuple from hash entry.
+ */
+MinimalTuple
+TupleHashEntryGetTuple(TupleHashEntry entry)
+{
+	return entry->firstTuple;
+}
+
+/*
+ * Get a pointer into the additional space allocated for this entry. The
+ * memory will be maxaligned and zeroed.
+ *
+ * The amount of space available is the additionalsize requested in the call
+ * to BuildTupleHashTable(). If additionalsize was specified as zero, return
+ * NULL.
+ */
+void *
+TupleHashEntryGetAdditional(TupleHashTable hashtable, TupleHashEntry entry)
+{
+	return entry->additional;
+}
+
 /*
  * Find or create a hashtable entry for the tuple group containing the
  * given tuple.  The tuple must be the same type as the hashtable entries.
@@ -479,11 +522,14 @@ LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot,
 		{
 			/* created new entry */
 			*isnew = true;
-			/* zero caller data */
-			entry->additional = NULL;
+
 			MemoryContextSwitchTo(hashtable->tablecxt);
-			/* Copy the first tuple into the table context */
+
 			entry->firstTuple = ExecCopySlotMinimalTuple(slot);
+			if (hashtable->additionalsize > 0)
+				entry->additional = palloc0(hashtable->additionalsize);
+			else
+				entry->additional = NULL;
 		}
 	}
 	else
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index fb73e6f991d..3d6a4f40ed5 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -1491,7 +1491,7 @@ build_hash_tables(AggState *aggstate)
 #ifdef USE_INJECTION_POINTS
 		if (IS_INJECTION_POINT_ATTACHED("hash-aggregate-oversize-table"))
 		{
-			nbuckets = memory / sizeof(TupleHashEntryData);
+			nbuckets = memory / TupleHashEntrySize();
 			INJECTION_POINT_CACHED("hash-aggregate-oversize-table");
 		}
 #endif
@@ -1724,7 +1724,7 @@ hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace)
 		transitionChunkSize = 0;
 
 	return
-		sizeof(TupleHashEntryData) +
+		TupleHashEntrySize() +
 		tupleChunkSize +
 		pergroupChunkSize +
 		transitionChunkSize;
@@ -1988,7 +1988,7 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
 	if (aggstate->hash_ngroups_current > 0)
 	{
 		aggstate->hashentrysize =
-			sizeof(TupleHashEntryData) +
+			TupleHashEntrySize() +
 			(hashkey_mem / (double) aggstate->hash_ngroups_current);
 	}
 }
@@ -2147,11 +2147,7 @@ initialize_hash_entry(AggState *aggstate, TupleHashTable hashtable,
 	if (aggstate->numtrans == 0)
 		return;
 
-	pergroup = (AggStatePerGroup)
-		MemoryContextAlloc(hashtable->tablecxt,
-						   sizeof(AggStatePerGroupData) * aggstate->numtrans);
-
-	entry->additional = pergroup;
+	pergroup = (AggStatePerGroup) TupleHashEntryGetAdditional(hashtable, entry);
 
 	/*
 	 * Initialize aggregates for new tuple group, lookup_hash_entries()
@@ -2213,7 +2209,7 @@ lookup_hash_entries(AggState *aggstate)
 		{
 			if (isnew)
 				initialize_hash_entry(aggstate, hashtable, entry);
-			pergroup[setno] = entry->additional;
+			pergroup[setno] = TupleHashEntryGetAdditional(hashtable, entry);
 		}
 		else
 		{
@@ -2748,6 +2744,7 @@ agg_refill_hash_table(AggState *aggstate)
 	{
 		TupleTableSlot *spillslot = aggstate->hash_spill_rslot;
 		TupleTableSlot *hashslot = perhash->hashslot;
+		TupleHashTable hashtable = perhash->hashtable;
 		TupleHashEntry entry;
 		MinimalTuple tuple;
 		uint32		hash;
@@ -2766,14 +2763,14 @@ agg_refill_hash_table(AggState *aggstate)
 		prepare_hash_slot(perhash,
 						  aggstate->tmpcontext->ecxt_outertuple,
 						  hashslot);
-		entry = LookupTupleHashEntryHash(perhash->hashtable, hashslot,
+		entry = LookupTupleHashEntryHash(hashtable, hashslot,
 										 p_isnew, hash);
 
 		if (entry != NULL)
 		{
 			if (isnew)
-				initialize_hash_entry(aggstate, perhash->hashtable, entry);
-			aggstate->hash_pergroup[batch->setno] = entry->additional;
+				initialize_hash_entry(aggstate, hashtable, entry);
+			aggstate->hash_pergroup[batch->setno] = TupleHashEntryGetAdditional(hashtable, entry);
 			advance_aggregates(aggstate);
 		}
 		else
@@ -2865,7 +2862,7 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate)
 	ExprContext *econtext;
 	AggStatePerAgg peragg;
 	AggStatePerGroup pergroup;
-	TupleHashEntryData *entry;
+	TupleHashEntry entry;
 	TupleTableSlot *firstSlot;
 	TupleTableSlot *result;
 	AggStatePerHash perhash;
@@ -2892,6 +2889,7 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate)
 	for (;;)
 	{
 		TupleTableSlot *hashslot = perhash->hashslot;
+		TupleHashTable hashtable = perhash->hashtable;
 		int			i;
 
 		CHECK_FOR_INTERRUPTS();
@@ -2899,7 +2897,7 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate)
 		/*
 		 * Find the next entry in the hash table
 		 */
-		entry = ScanTupleHashTable(perhash->hashtable, &perhash->hashiter);
+		entry = ScanTupleHashTable(hashtable, &perhash->hashiter);
 		if (entry == NULL)
 		{
 			int			nextset = aggstate->current_set + 1;
@@ -2914,7 +2912,7 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate)
 
 				perhash = &aggstate->perhash[aggstate->current_set];
 
-				ResetTupleHashIterator(perhash->hashtable, &perhash->hashiter);
+				ResetTupleHashIterator(hashtable, &perhash->hashiter);
 
 				continue;
 			}
@@ -2937,7 +2935,7 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate)
 		 * Transform representative tuple back into one with the right
 		 * columns.
 		 */
-		ExecStoreMinimalTuple(entry->firstTuple, hashslot, false);
+		ExecStoreMinimalTuple(TupleHashEntryGetTuple(entry), hashslot, false);
 		slot_getallattrs(hashslot);
 
 		ExecClearTuple(firstSlot);
@@ -2953,7 +2951,7 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate)
 		}
 		ExecStoreVirtualTuple(firstSlot);
 
-		pergroup = (AggStatePerGroup) entry->additional;
+		pergroup = (AggStatePerGroup) TupleHashEntryGetAdditional(hashtable, entry);
 
 		/*
 		 * Use the representative input tuple for any references to
diff --git a/src/backend/executor/nodeSetOp.c b/src/backend/executor/nodeSetOp.c
index 5b7ff9c3748..4068481a523 100644
--- a/src/backend/executor/nodeSetOp.c
+++ b/src/backend/executor/nodeSetOp.c
@@ -424,7 +424,9 @@ setop_fill_hash_table(SetOpState *setopstate)
 	for (;;)
 	{
 		TupleTableSlot *outerslot;
+		TupleHashTable hashtable = setopstate->hashtable;
 		TupleHashEntryData *entry;
+		SetOpStatePerGroup pergroup;
 		bool		isnew;
 
 		outerslot = ExecProcNode(outerPlan);
@@ -433,20 +435,20 @@ setop_fill_hash_table(SetOpState *setopstate)
 		have_tuples = true;
 
 		/* Find or build hashtable entry for this tuple's group */
-		entry = LookupTupleHashEntry(setopstate->hashtable,
+		entry = LookupTupleHashEntry(hashtable,
 									 outerslot,
 									 &isnew, NULL);
 
+		pergroup = TupleHashEntryGetAdditional(hashtable, entry);
 		/* If new tuple group, initialize counts to zero */
 		if (isnew)
 		{
-			entry->additional = (SetOpStatePerGroup)
-				MemoryContextAllocZero(setopstate->hashtable->tablecxt,
-									   sizeof(SetOpStatePerGroupData));
+			pergroup->numLeft = 0;
+			pergroup->numRight = 0;
 		}
 
 		/* Advance the counts */
-		((SetOpStatePerGroup) entry->additional)->numLeft++;
+		pergroup->numLeft++;
 
 		/* Must reset expression context after each hashtable lookup */
 		ResetExprContext(econtext);
@@ -465,6 +467,7 @@ setop_fill_hash_table(SetOpState *setopstate)
 		for (;;)
 		{
 			TupleTableSlot *innerslot;
+			TupleHashTable hashtable = setopstate->hashtable;
 			TupleHashEntryData *entry;
 
 			innerslot = ExecProcNode(innerPlan);
@@ -472,13 +475,17 @@ setop_fill_hash_table(SetOpState *setopstate)
 				break;
 
 			/* For tuples not seen previously, do not make hashtable entry */
-			entry = LookupTupleHashEntry(setopstate->hashtable,
+			entry = LookupTupleHashEntry(hashtable,
 										 innerslot,
 										 NULL, NULL);
 
 			/* Advance the counts if entry is already present */
 			if (entry)
-				((SetOpStatePerGroup) entry->additional)->numRight++;
+			{
+				SetOpStatePerGroup pergroup = TupleHashEntryGetAdditional(hashtable, entry);
+
+				pergroup->numRight++;
+			}
 
 			/* Must reset expression context after each hashtable lookup */
 			ResetExprContext(econtext);
@@ -496,7 +503,7 @@ setop_fill_hash_table(SetOpState *setopstate)
 static TupleTableSlot *
 setop_retrieve_hash_table(SetOpState *setopstate)
 {
-	TupleHashEntryData *entry;
+	TupleHashEntry entry;
 	TupleTableSlot *resultTupleSlot;
 
 	/*
@@ -509,12 +516,15 @@ setop_retrieve_hash_table(SetOpState *setopstate)
 	 */
 	while (!setopstate->setop_done)
 	{
+		TupleHashTable hashtable = setopstate->hashtable;
+		SetOpStatePerGroup pergroup;
+
 		CHECK_FOR_INTERRUPTS();
 
 		/*
 		 * Find the next entry in the hash table
 		 */
-		entry = ScanTupleHashTable(setopstate->hashtable, &setopstate->hashiter);
+		entry = ScanTupleHashTable(hashtable, &setopstate->hashiter);
 		if (entry == NULL)
 		{
 			/* No more entries in hashtable, so done */
@@ -526,12 +536,13 @@ setop_retrieve_hash_table(SetOpState *setopstate)
 		 * See if we should emit any copies of this tuple, and if so return
 		 * the first copy.
 		 */
-		set_output_count(setopstate, (SetOpStatePerGroup) entry->additional);
+		pergroup = TupleHashEntryGetAdditional(hashtable, entry);
+		set_output_count(setopstate, pergroup);
 
 		if (setopstate->numOutput > 0)
 		{
 			setopstate->numOutput--;
-			return ExecStoreMinimalTuple(entry->firstTuple,
+			return ExecStoreMinimalTuple(TupleHashEntryGetTuple(entry),
 										 resultTupleSlot,
 										 false);
 		}
diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c
index 49767ed6a52..f7f6fc2da0b 100644
--- a/src/backend/executor/nodeSubplan.c
+++ b/src/backend/executor/nodeSubplan.c
@@ -753,7 +753,7 @@ findPartialMatch(TupleHashTable hashtable, TupleTableSlot *slot,
 	{
 		CHECK_FOR_INTERRUPTS();
 
-		ExecStoreMinimalTuple(entry->firstTuple, hashtable->tableslot, false);
+		ExecStoreMinimalTuple(TupleHashEntryGetTuple(entry), hashtable->tableslot, false);
 		if (!execTuplesUnequal(slot, hashtable->tableslot,
 							   numCols, keyColIdx,
 							   eqfunctions,
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index 0db5d18ba22..aae49f57e5e 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -149,6 +149,10 @@ extern TupleHashEntry LookupTupleHashEntry(TupleHashTable hashtable,
 										   bool *isnew, uint32 *hash);
 extern uint32 TupleHashTableHash(TupleHashTable hashtable,
 								 TupleTableSlot *slot);
+extern size_t TupleHashEntrySize(void);
+extern MinimalTuple TupleHashEntryGetTuple(TupleHashEntry entry);
+extern void *TupleHashEntryGetAdditional(TupleHashTable hashtable,
+										 TupleHashEntry entry);
 extern TupleHashEntry LookupTupleHashEntryHash(TupleHashTable hashtable,
 											   TupleTableSlot *slot,
 											   bool *isnew, uint32 hash);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index b5539ddb41e..d2c03e0c50d 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -834,17 +834,11 @@ typedef struct ExecAuxRowMark
  * point to tab_hash_expr and tab_eq_func respectively.
  * ----------------------------------------------------------------
  */
+struct TupleHashEntryData;
+typedef struct TupleHashEntryData TupleHashEntryData;
 typedef struct TupleHashEntryData *TupleHashEntry;
 typedef struct TupleHashTableData *TupleHashTable;
 
-typedef struct TupleHashEntryData
-{
-	MinimalTuple firstTuple;	/* copy of first tuple in this group */
-	void	   *additional;		/* user data */
-	uint32		status;			/* hash status */
-	uint32		hash;			/* hash value (cached) */
-} TupleHashEntryData;
-
 /* define parameters necessary to generate the tuple hash table interface */
 #define SH_PREFIX tuplehash
 #define SH_ELEMENT_TYPE TupleHashEntryData
@@ -863,6 +857,7 @@ typedef struct TupleHashTableData
 	Oid		   *tab_collations; /* collations for hash and comparison */
 	MemoryContext tablecxt;		/* memory context containing table */
 	MemoryContext tempcxt;		/* context for function evaluations */
+	Size		additionalsize; /* size of additional data */
 	TupleTableSlot *tableslot;	/* slot for referencing table entries */
 	/* The following fields are set transiently for each table search: */
 	TupleTableSlot *inputslot;	/* current input tuple's slot */
-- 
2.34.1

From 6ae73d393c0f5f0a5355354d55bbd025e4c8965b Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Tue, 4 Mar 2025 15:51:19 -0800
Subject: [PATCH v9 3/7] Inline TupleHashEntryGetTuple() and
 ...GetAdditional().

---
 src/backend/executor/execGrouping.c | 40 -----------------------------
 src/include/executor/executor.h     | 38 ++++++++++++++++++++++++---
 src/include/nodes/execnodes.h       | 10 ++++++--
 3 files changed, 42 insertions(+), 46 deletions(-)

diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index 4a90db94f8d..a9d212aaec6 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -20,14 +20,6 @@
 #include "miscadmin.h"
 #include "utils/lsyscache.h"
 
-struct TupleHashEntryData
-{
-	MinimalTuple firstTuple;	/* copy of first tuple in this group */
-	void	   *additional;		/* user data */
-	uint32		status;			/* hash status */
-	uint32		hash;			/* hash value (cached) */
-};
-
 static int	TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2);
 static inline uint32 TupleHashTableHash_internal(struct tuplehash_hash *tb,
 												 const MinimalTuple tuple);
@@ -284,38 +276,6 @@ ResetTupleHashTable(TupleHashTable hashtable)
 	tuplehash_reset(hashtable->hashtab);
 }
 
-/*
- * Return size of the hash bucket. Useful for estimating memory usage.
- */
-size_t
-TupleHashEntrySize(void)
-{
-	return sizeof(TupleHashEntryData);
-}
-
-/*
- * Return tuple from hash entry.
- */
-MinimalTuple
-TupleHashEntryGetTuple(TupleHashEntry entry)
-{
-	return entry->firstTuple;
-}
-
-/*
- * Get a pointer into the additional space allocated for this entry. The
- * memory will be maxaligned and zeroed.
- *
- * The amount of space available is the additionalsize requested in the call
- * to BuildTupleHashTable(). If additionalsize was specified as zero, return
- * NULL.
- */
-void *
-TupleHashEntryGetAdditional(TupleHashTable hashtable, TupleHashEntry entry)
-{
-	return entry->additional;
-}
-
 /*
  * Find or create a hashtable entry for the tuple group containing the
  * given tuple.  The tuple must be the same type as the hashtable entries.
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index aae49f57e5e..69396a9d7f8 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -149,10 +149,6 @@ extern TupleHashEntry LookupTupleHashEntry(TupleHashTable hashtable,
 										   bool *isnew, uint32 *hash);
 extern uint32 TupleHashTableHash(TupleHashTable hashtable,
 								 TupleTableSlot *slot);
-extern size_t TupleHashEntrySize(void);
-extern MinimalTuple TupleHashEntryGetTuple(TupleHashEntry entry);
-extern void *TupleHashEntryGetAdditional(TupleHashTable hashtable,
-										 TupleHashEntry entry);
 extern TupleHashEntry LookupTupleHashEntryHash(TupleHashTable hashtable,
 											   TupleTableSlot *slot,
 											   bool *isnew, uint32 hash);
@@ -162,6 +158,40 @@ extern TupleHashEntry FindTupleHashEntry(TupleHashTable hashtable,
 										 ExprState *hashexpr);
 extern void ResetTupleHashTable(TupleHashTable hashtable);
 
+#ifndef FRONTEND
+/*
+ * Return size of the hash bucket. Useful for estimating memory usage.
+ */
+static inline size_t
+TupleHashEntrySize(void)
+{
+	return sizeof(TupleHashEntryData);
+}
+
+/*
+ * Return tuple from hash entry.
+ */
+static inline MinimalTuple
+TupleHashEntryGetTuple(TupleHashEntry entry)
+{
+	return entry->firstTuple;
+}
+
+/*
+ * Get a pointer into the additional space allocated for this entry. The
+ * memory will be maxaligned and zeroed.
+ *
+ * The amount of space available is the additionalsize requested in the call
+ * to BuildTupleHashTable(). If additionalsize was specified as zero, return
+ * NULL.
+ */
+static inline void *
+TupleHashEntryGetAdditional(TupleHashTable hashtable, TupleHashEntry entry)
+{
+	return entry->additional;
+}
+#endif
+
 /*
  * prototypes from functions in execJunk.c
  */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index d2c03e0c50d..7df25d7e64f 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -834,11 +834,17 @@ typedef struct ExecAuxRowMark
  * point to tab_hash_expr and tab_eq_func respectively.
  * ----------------------------------------------------------------
  */
-struct TupleHashEntryData;
-typedef struct TupleHashEntryData TupleHashEntryData;
 typedef struct TupleHashEntryData *TupleHashEntry;
 typedef struct TupleHashTableData *TupleHashTable;
 
+typedef struct TupleHashEntryData
+{
+	MinimalTuple firstTuple;	/* copy of first tuple in this group */
+	void	   *additional;		/* user data */
+	uint32		status;			/* hash status */
+	uint32		hash;			/* hash value (cached) */
+} TupleHashEntryData;
+
 /* define parameters necessary to generate the tuple hash table interface */
 #define SH_PREFIX tuplehash
 #define SH_ELEMENT_TYPE TupleHashEntryData
-- 
2.34.1

From 51ce93aea0653932433fab4cb8e2f4094c79989e Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Tue, 4 Mar 2025 15:13:22 -0800
Subject: [PATCH v9 4/7] Remove 'additional' pointer from TupleHashEntryData.

Reduces memory required for hash aggregation by avoiding an allocation
and a pointer in the TupleHashEntryData structure. That structure is
used for all buckets, whether occupied or not, so the savings is
substantial.

Discussion: https://postgr.es/m/AApHDvpN4v3t_sdz4dvrv1Fx_ZPw=twsnxuteytryp7lfz5...@mail.gmail.com
Reviewed-by: David Rowley <dgrowle...@gmail.com>
---
 src/backend/executor/execGrouping.c | 32 ++++++++++++++++++++++-------
 src/include/executor/executor.h     |  5 ++++-
 src/include/nodes/execnodes.h       |  1 -
 3 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index a9d212aaec6..5c06cc6da40 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -481,15 +481,33 @@ LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot,
 		else
 		{
 			/* created new entry */
-			*isnew = true;
+			bool		shouldFree;
+			char	   *mem;
+			MinimalTuple mtup;
 
-			MemoryContextSwitchTo(hashtable->tablecxt);
+			*isnew = true;
 
-			entry->firstTuple = ExecCopySlotMinimalTuple(slot);
-			if (hashtable->additionalsize > 0)
-				entry->additional = palloc0(hashtable->additionalsize);
-			else
-				entry->additional = NULL;
+			MemoryContextSwitchTo(hashtable->tempcxt);
+
+			/* ignore shouldFree, because we're in the temp context anyway */
+			mtup = ExecFetchSlotMinimalTuple(slot, &shouldFree);
+
+			/*
+			 * Allocate enough memory in the tablecxt for the additionalsize
+			 * followed by the tuple. Clear the additional data, and copy the
+			 * tuple.
+			 *
+			 * The caller can get a pointer to the additional data with
+			 * TupleHashEntryGetAdditional(), and store arbitrary data there.
+			 * Placing both the tuple and additional data in the same
+			 * allocation avoids the need to store an extra pointer in
+			 * TupleHashEntryData or allocate an additional chunk.
+			 */
+			mem = MemoryContextAlloc(hashtable->tablecxt,
+									 mtup->t_len + hashtable->additionalsize);
+			memset(mem, 0, hashtable->additionalsize);
+			entry->firstTuple = (MinimalTuple) (mem + hashtable->additionalsize);
+			memcpy(entry->firstTuple, mtup, mtup->t_len);
 		}
 	}
 	else
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index 69396a9d7f8..6a1fec88928 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -188,7 +188,10 @@ TupleHashEntryGetTuple(TupleHashEntry entry)
 static inline void *
 TupleHashEntryGetAdditional(TupleHashTable hashtable, TupleHashEntry entry)
 {
-	return entry->additional;
+	if (hashtable->additionalsize > 0)
+		return (char *) entry->firstTuple - hashtable->additionalsize;
+	else
+		return NULL;
 }
 #endif
 
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 7df25d7e64f..e42f9f9f957 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -840,7 +840,6 @@ typedef struct TupleHashTableData *TupleHashTable;
 typedef struct TupleHashEntryData
 {
 	MinimalTuple firstTuple;	/* copy of first tuple in this group */
-	void	   *additional;		/* user data */
 	uint32		status;			/* hash status */
 	uint32		hash;			/* hash value (cached) */
 } TupleHashEntryData;
-- 
2.34.1

From 23a982a87c9f9396d7b53786cd4a70e9e855633d Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Fri, 7 Feb 2025 13:14:50 -0800
Subject: [PATCH v9 5/7] Add ExecCopySlotMinimalTupleExtra().

Allows an "extra" argument that allocates extra memory at the end of
the MinimalTuple. This is important for callers that need to store
additional data, but do not want to perform an additional allocation.
---
 src/backend/access/common/heaptuple.c      | 27 +++++++++++++++++-----
 src/backend/executor/execTuples.c          | 24 ++++++++++---------
 src/backend/executor/nodeGatherMerge.c     |  2 +-
 src/backend/utils/sort/tuplesortvariants.c |  2 +-
 src/backend/utils/sort/tuplestore.c        |  6 ++---
 src/include/access/htup_details.h          |  7 +++---
 src/include/executor/tuptable.h            | 20 ++++++++++++++--
 7 files changed, 61 insertions(+), 27 deletions(-)

diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c
index acd5da4ccf8..969d1028cae 100644
--- a/src/backend/access/common/heaptuple.c
+++ b/src/backend/access/common/heaptuple.c
@@ -1452,9 +1452,11 @@ heap_freetuple(HeapTuple htup)
 MinimalTuple
 heap_form_minimal_tuple(TupleDesc tupleDescriptor,
 						const Datum *values,
-						const bool *isnull)
+						const bool *isnull,
+						Size extra)
 {
 	MinimalTuple tuple;			/* return tuple */
+	char	   *mem;
 	Size		len,
 				data_len;
 	int			hoff;
@@ -1462,6 +1464,8 @@ heap_form_minimal_tuple(TupleDesc tupleDescriptor,
 	int			numberOfAttributes = tupleDescriptor->natts;
 	int			i;
 
+	Assert(extra == MAXALIGN(extra));
+
 	if (numberOfAttributes > MaxTupleAttributeNumber)
 		ereport(ERROR,
 				(errcode(ERRCODE_TOO_MANY_COLUMNS),
@@ -1497,7 +1501,9 @@ heap_form_minimal_tuple(TupleDesc tupleDescriptor,
 	/*
 	 * Allocate and zero the space needed.
 	 */
-	tuple = (MinimalTuple) palloc0(len);
+	mem = palloc0(len + extra);
+	memset(mem, 0, extra);
+	tuple = (MinimalTuple) (mem + extra);
 
 	/*
 	 * And fill in the information.
@@ -1533,11 +1539,15 @@ heap_free_minimal_tuple(MinimalTuple mtup)
  * The result is allocated in the current memory context.
  */
 MinimalTuple
-heap_copy_minimal_tuple(MinimalTuple mtup)
+heap_copy_minimal_tuple(MinimalTuple mtup, Size extra)
 {
 	MinimalTuple result;
+	char	   *mem;
 
-	result = (MinimalTuple) palloc(mtup->t_len);
+	Assert(extra == MAXALIGN(extra));
+	mem = palloc(mtup->t_len + extra);
+	memset(mem, 0, extra);
+	result = (MinimalTuple) (mem + extra);
 	memcpy(result, mtup, mtup->t_len);
 	return result;
 }
@@ -1574,15 +1584,20 @@ heap_tuple_from_minimal_tuple(MinimalTuple mtup)
  * The result is allocated in the current memory context.
  */
 MinimalTuple
-minimal_tuple_from_heap_tuple(HeapTuple htup)
+minimal_tuple_from_heap_tuple(HeapTuple htup, Size extra)
 {
 	MinimalTuple result;
+	char	   *mem;
 	uint32		len;
 
+	Assert(extra == MAXALIGN(extra));
 	Assert(htup->t_len > MINIMAL_TUPLE_OFFSET);
 	len = htup->t_len - MINIMAL_TUPLE_OFFSET;
-	result = (MinimalTuple) palloc(len);
+	mem = palloc(len + extra);
+	memset(mem, 0, extra);
+	result = (MinimalTuple) (mem + extra);
 	memcpy(result, (char *) htup->t_data + MINIMAL_TUPLE_OFFSET, len);
+
 	result->t_len = len;
 	return result;
 }
diff --git a/src/backend/executor/execTuples.c b/src/backend/executor/execTuples.c
index 7de490462d4..8e02d68824f 100644
--- a/src/backend/executor/execTuples.c
+++ b/src/backend/executor/execTuples.c
@@ -298,13 +298,14 @@ tts_virtual_copy_heap_tuple(TupleTableSlot *slot)
 }
 
 static MinimalTuple
-tts_virtual_copy_minimal_tuple(TupleTableSlot *slot)
+tts_virtual_copy_minimal_tuple(TupleTableSlot *slot, Size extra)
 {
 	Assert(!TTS_EMPTY(slot));
 
 	return heap_form_minimal_tuple(slot->tts_tupleDescriptor,
 								   slot->tts_values,
-								   slot->tts_isnull);
+								   slot->tts_isnull,
+								   extra);
 }
 
 
@@ -472,14 +473,14 @@ tts_heap_copy_heap_tuple(TupleTableSlot *slot)
 }
 
 static MinimalTuple
-tts_heap_copy_minimal_tuple(TupleTableSlot *slot)
+tts_heap_copy_minimal_tuple(TupleTableSlot *slot, Size extra)
 {
 	HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) slot;
 
 	if (!hslot->tuple)
 		tts_heap_materialize(slot);
 
-	return minimal_tuple_from_heap_tuple(hslot->tuple);
+	return minimal_tuple_from_heap_tuple(hslot->tuple, extra);
 }
 
 static void
@@ -607,7 +608,8 @@ tts_minimal_materialize(TupleTableSlot *slot)
 	{
 		mslot->mintuple = heap_form_minimal_tuple(slot->tts_tupleDescriptor,
 												  slot->tts_values,
-												  slot->tts_isnull);
+												  slot->tts_isnull,
+												  0);
 	}
 	else
 	{
@@ -617,7 +619,7 @@ tts_minimal_materialize(TupleTableSlot *slot)
 		 * TTS_FLAG_SHOULDFREE set).  Copy the minimal tuple into the given
 		 * slot's memory context.
 		 */
-		mslot->mintuple = heap_copy_minimal_tuple(mslot->mintuple);
+		mslot->mintuple = heap_copy_minimal_tuple(mslot->mintuple, 0);
 	}
 
 	slot->tts_flags |= TTS_FLAG_SHOULDFREE;
@@ -666,14 +668,14 @@ tts_minimal_copy_heap_tuple(TupleTableSlot *slot)
 }
 
 static MinimalTuple
-tts_minimal_copy_minimal_tuple(TupleTableSlot *slot)
+tts_minimal_copy_minimal_tuple(TupleTableSlot *slot, Size extra)
 {
 	MinimalTupleTableSlot *mslot = (MinimalTupleTableSlot *) slot;
 
 	if (!mslot->mintuple)
 		tts_minimal_materialize(slot);
 
-	return heap_copy_minimal_tuple(mslot->mintuple);
+	return heap_copy_minimal_tuple(mslot->mintuple, extra);
 }
 
 static void
@@ -926,7 +928,7 @@ tts_buffer_heap_copy_heap_tuple(TupleTableSlot *slot)
 }
 
 static MinimalTuple
-tts_buffer_heap_copy_minimal_tuple(TupleTableSlot *slot)
+tts_buffer_heap_copy_minimal_tuple(TupleTableSlot *slot, Size extra)
 {
 	BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
 
@@ -935,7 +937,7 @@ tts_buffer_heap_copy_minimal_tuple(TupleTableSlot *slot)
 	if (!bslot->base.tuple)
 		tts_buffer_heap_materialize(slot);
 
-	return minimal_tuple_from_heap_tuple(bslot->base.tuple);
+	return minimal_tuple_from_heap_tuple(bslot->base.tuple, extra);
 }
 
 static inline void
@@ -1895,7 +1897,7 @@ ExecFetchSlotMinimalTuple(TupleTableSlot *slot,
 	{
 		if (shouldFree)
 			*shouldFree = true;
-		return slot->tts_ops->copy_minimal_tuple(slot);
+		return slot->tts_ops->copy_minimal_tuple(slot, 0);
 	}
 }
 
diff --git a/src/backend/executor/nodeGatherMerge.c b/src/backend/executor/nodeGatherMerge.c
index 01a6e3a8553..15f84597067 100644
--- a/src/backend/executor/nodeGatherMerge.c
+++ b/src/backend/executor/nodeGatherMerge.c
@@ -735,7 +735,7 @@ gm_readnext_tuple(GatherMergeState *gm_state, int nreader, bool nowait,
 	 * Since we'll be buffering these across multiple calls, we need to make a
 	 * copy.
 	 */
-	return tup ? heap_copy_minimal_tuple(tup) : NULL;
+	return tup ? heap_copy_minimal_tuple(tup, 0) : NULL;
 }
 
 /*
diff --git a/src/backend/utils/sort/tuplesortvariants.c b/src/backend/utils/sort/tuplesortvariants.c
index 4059af5bb71..471d1197060 100644
--- a/src/backend/utils/sort/tuplesortvariants.c
+++ b/src/backend/utils/sort/tuplesortvariants.c
@@ -1002,7 +1002,7 @@ tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy,
 			*abbrev = stup.datum1;
 
 		if (copy)
-			stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple);
+			stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple, 0);
 
 		ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, copy);
 		return true;
diff --git a/src/backend/utils/sort/tuplestore.c b/src/backend/utils/sort/tuplestore.c
index d61b601053c..c9aecab8d66 100644
--- a/src/backend/utils/sort/tuplestore.c
+++ b/src/backend/utils/sort/tuplestore.c
@@ -787,7 +787,7 @@ tuplestore_putvalues(Tuplestorestate *state, TupleDesc tdesc,
 	MinimalTuple tuple;
 	MemoryContext oldcxt = MemoryContextSwitchTo(state->context);
 
-	tuple = heap_form_minimal_tuple(tdesc, values, isnull);
+	tuple = heap_form_minimal_tuple(tdesc, values, isnull, 0);
 	USEMEM(state, GetMemoryChunkSpace(tuple));
 
 	tuplestore_puttuple_common(state, tuple);
@@ -1139,7 +1139,7 @@ tuplestore_gettupleslot(Tuplestorestate *state, bool forward,
 	{
 		if (copy && !should_free)
 		{
-			tuple = heap_copy_minimal_tuple(tuple);
+			tuple = heap_copy_minimal_tuple(tuple, 0);
 			should_free = true;
 		}
 		ExecStoreMinimalTuple(tuple, slot, should_free);
@@ -1590,7 +1590,7 @@ copytup_heap(Tuplestorestate *state, void *tup)
 {
 	MinimalTuple tuple;
 
-	tuple = minimal_tuple_from_heap_tuple((HeapTuple) tup);
+	tuple = minimal_tuple_from_heap_tuple((HeapTuple) tup, 0);
 	USEMEM(state, GetMemoryChunkSpace(tuple));
 	return tuple;
 }
diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h
index 6cd4b95bfdb..aa957cf3b01 100644
--- a/src/include/access/htup_details.h
+++ b/src/include/access/htup_details.h
@@ -839,11 +839,12 @@ extern void heap_deform_tuple(HeapTuple tuple, TupleDesc tupleDesc,
 							  Datum *values, bool *isnull);
 extern void heap_freetuple(HeapTuple htup);
 extern MinimalTuple heap_form_minimal_tuple(TupleDesc tupleDescriptor,
-											const Datum *values, const bool *isnull);
+											const Datum *values, const bool *isnull,
+											Size extra);
 extern void heap_free_minimal_tuple(MinimalTuple mtup);
-extern MinimalTuple heap_copy_minimal_tuple(MinimalTuple mtup);
+extern MinimalTuple heap_copy_minimal_tuple(MinimalTuple mtup, Size extra);
 extern HeapTuple heap_tuple_from_minimal_tuple(MinimalTuple mtup);
-extern MinimalTuple minimal_tuple_from_heap_tuple(HeapTuple htup);
+extern MinimalTuple minimal_tuple_from_heap_tuple(HeapTuple htup, Size extra);
 extern size_t varsize_any(void *p);
 extern HeapTuple heap_expand_tuple(HeapTuple sourceTuple, TupleDesc tupleDesc);
 extern MinimalTuple minimal_expand_tuple(HeapTuple sourceTuple, TupleDesc tupleDesc);
diff --git a/src/include/executor/tuptable.h b/src/include/executor/tuptable.h
index a044d78e4d0..095e4cc82e3 100644
--- a/src/include/executor/tuptable.h
+++ b/src/include/executor/tuptable.h
@@ -218,8 +218,12 @@ struct TupleTableSlotOps
 	 * meaningful "system columns" in the copy. The copy is not be "owned" by
 	 * the slot i.e. the caller has to take responsibility to free memory
 	 * consumed by the slot.
+	 *
+	 * The copy has "extra" bytes (maxaligned and zeroed) available before the
+	 * tuple, which is useful so that some callers may store extra data along
+	 * with the minimal tuple without the need for an additional allocation.
 	 */
-	MinimalTuple (*copy_minimal_tuple) (TupleTableSlot *slot);
+	MinimalTuple (*copy_minimal_tuple) (TupleTableSlot *slot, Size extra);
 };
 
 /*
@@ -491,7 +495,19 @@ ExecCopySlotHeapTuple(TupleTableSlot *slot)
 static inline MinimalTuple
 ExecCopySlotMinimalTuple(TupleTableSlot *slot)
 {
-	return slot->tts_ops->copy_minimal_tuple(slot);
+	return slot->tts_ops->copy_minimal_tuple(slot, 0);
+}
+
+/*
+ * ExecCopySlotMinimalTupleExtra - return MinimalTuple allocated in caller's
+ * context, with extra bytes (maxaligned and zeroed) before the tuple for data
+ * the caller wishes to store along with the tuple (without requiring the
+ * caller to make an additional allocation).
+ */
+static inline MinimalTuple
+ExecCopySlotMinimalTupleExtra(TupleTableSlot *slot, Size extra)
+{
+	return slot->tts_ops->copy_minimal_tuple(slot, extra);
 }
 
 /*
-- 
2.34.1

From d1f8a7c48e0463216e6cde3b7998b51b1cde93f0 Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Tue, 4 Mar 2025 12:46:48 -0800
Subject: [PATCH v9 6/7] Use ExecCopySlotMinimalTupleExtra() to avoid a memcpy.

Previously, ExecFetchSlotMinimalTuple() could cause an unnecessary
copy for hash-based set operations.

Use the new API ExecCopySlotMinimalTupleExtra() to perform the copy
while also allocating the extra space needed for the 'additional'
data.

Discussion: https://postgr.es/m/AApHDvpN4v3t_sdz4dvrv1Fx_ZPw=twsnxuteytryp7lfz5...@mail.gmail.com
Reviewed-by: David Rowley <dgrowle...@gmail.com>
---
 src/backend/executor/execGrouping.c | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index 5c06cc6da40..255bd795361 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -481,21 +481,13 @@ LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot,
 		else
 		{
 			/* created new entry */
-			bool		shouldFree;
-			char	   *mem;
-			MinimalTuple mtup;
-
 			*isnew = true;
 
-			MemoryContextSwitchTo(hashtable->tempcxt);
-
-			/* ignore shouldFree, because we're in the temp context anyway */
-			mtup = ExecFetchSlotMinimalTuple(slot, &shouldFree);
+			MemoryContextSwitchTo(hashtable->tablecxt);
 
 			/*
-			 * Allocate enough memory in the tablecxt for the additionalsize
-			 * followed by the tuple. Clear the additional data, and copy the
-			 * tuple.
+			 * Copy the first tuple into the table context, and request
+			 * additionalsize extra bytes before the allocation.
 			 *
 			 * The caller can get a pointer to the additional data with
 			 * TupleHashEntryGetAdditional(), and store arbitrary data there.
@@ -503,11 +495,8 @@ LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot,
 			 * allocation avoids the need to store an extra pointer in
 			 * TupleHashEntryData or allocate an additional chunk.
 			 */
-			mem = MemoryContextAlloc(hashtable->tablecxt,
-									 mtup->t_len + hashtable->additionalsize);
-			memset(mem, 0, hashtable->additionalsize);
-			entry->firstTuple = (MinimalTuple) (mem + hashtable->additionalsize);
-			memcpy(entry->firstTuple, mtup, mtup->t_len);
+			entry->firstTuple = ExecCopySlotMinimalTupleExtra(slot,
+															  hashtable->additionalsize);
 		}
 	}
 	else
-- 
2.34.1

From 44ede0e8f8cc86d37decf054b46e2bf5e3b4734d Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Tue, 4 Mar 2025 13:21:35 -0800
Subject: [PATCH v9 7/7] Inline MemoryContextMemAllocated().

Speeds up the repeated calls during hash aggregation.

Discussion: https://postgr.es/m/CAApHDvpN4v3t_sdz4dvrv1Fx_ZPw=twsnxuteytryp7lfz5...@mail.gmail.com
---
 src/backend/utils/mmgr/mcxt.c | 68 -----------------------------------
 src/include/nodes/memnodes.h  | 68 +++++++++++++++++++++++++++++++++++
 src/include/utils/memutils.h  |  1 -
 3 files changed, 68 insertions(+), 69 deletions(-)

diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c
index 91060de0ab7..ea08bcf8ed2 100644
--- a/src/backend/utils/mmgr/mcxt.c
+++ b/src/backend/utils/mmgr/mcxt.c
@@ -232,50 +232,6 @@ GetMemoryChunkHeader(const void *pointer)
 	return header;
 }
 
-/*
- * MemoryContextTraverseNext
- *		Helper function to traverse all descendants of a memory context
- *		without recursion.
- *
- * Recursion could lead to out-of-stack errors with deep context hierarchies,
- * which would be unpleasant in error cleanup code paths.
- *
- * To process 'context' and all its descendants, use a loop like this:
- *
- *     <process 'context'>
- *     for (MemoryContext curr = context->firstchild;
- *          curr != NULL;
- *          curr = MemoryContextTraverseNext(curr, context))
- *     {
- *         <process 'curr'>
- *     }
- *
- * This visits all the contexts in pre-order, that is a node is visited
- * before its children.
- */
-static MemoryContext
-MemoryContextTraverseNext(MemoryContext curr, MemoryContext top)
-{
-	/* After processing a node, traverse to its first child if any */
-	if (curr->firstchild != NULL)
-		return curr->firstchild;
-
-	/*
-	 * After processing a childless node, traverse to its next sibling if
-	 * there is one.  If there isn't, traverse back up to the parent (which
-	 * has already been visited, and now so have all its descendants).  We're
-	 * done if that is "top", otherwise traverse to its next sibling if any,
-	 * otherwise repeat moving up.
-	 */
-	while (curr->nextchild == NULL)
-	{
-		curr = curr->parent;
-		if (curr == top)
-			return NULL;
-	}
-	return curr->nextchild;
-}
-
 /*
  * Support routines to trap use of invalid memory context method IDs
  * (from calling pfree or the like on a bogus pointer).  As a possible
@@ -754,30 +710,6 @@ MemoryContextIsEmpty(MemoryContext context)
 	return context->methods->is_empty(context);
 }
 
-/*
- * Find the memory allocated to blocks for this memory context. If recurse is
- * true, also include children.
- */
-Size
-MemoryContextMemAllocated(MemoryContext context, bool recurse)
-{
-	Size		total = context->mem_allocated;
-
-	Assert(MemoryContextIsValid(context));
-
-	if (recurse)
-	{
-		for (MemoryContext curr = context->firstchild;
-			 curr != NULL;
-			 curr = MemoryContextTraverseNext(curr, context))
-		{
-			total += curr->mem_allocated;
-		}
-	}
-
-	return total;
-}
-
 /*
  * Return the memory consumption statistics about the given context and its
  * children.
diff --git a/src/include/nodes/memnodes.h b/src/include/nodes/memnodes.h
index 5807ef805bd..654c52448b5 100644
--- a/src/include/nodes/memnodes.h
+++ b/src/include/nodes/memnodes.h
@@ -149,4 +149,72 @@ typedef struct MemoryContextData
 	  IsA((context), GenerationContext) || \
 	  IsA((context), BumpContext)))
 
+/*
+ * MemoryContextTraverseNext
+ *		Helper function to traverse all descendants of a memory context
+ *		without recursion.
+ *
+ * Recursion could lead to out-of-stack errors with deep context hierarchies,
+ * which would be unpleasant in error cleanup code paths.
+ *
+ * To process 'context' and all its descendants, use a loop like this:
+ *
+ *     <process 'context'>
+ *     for (MemoryContext curr = context->firstchild;
+ *          curr != NULL;
+ *          curr = MemoryContextTraverseNext(curr, context))
+ *     {
+ *         <process 'curr'>
+ *     }
+ *
+ * This visits all the contexts in pre-order, that is a node is visited
+ * before its children.
+ */
+static MemoryContext
+MemoryContextTraverseNext(MemoryContext curr, MemoryContext top)
+{
+	/* After processing a node, traverse to its first child if any */
+	if (curr->firstchild != NULL)
+		return curr->firstchild;
+
+	/*
+	 * After processing a childless node, traverse to its next sibling if
+	 * there is one.  If there isn't, traverse back up to the parent (which
+	 * has already been visited, and now so have all its descendants).  We're
+	 * done if that is "top", otherwise traverse to its next sibling if any,
+	 * otherwise repeat moving up.
+	 */
+	while (curr->nextchild == NULL)
+	{
+		curr = curr->parent;
+		if (curr == top)
+			return NULL;
+	}
+	return curr->nextchild;
+}
+
+/*
+ * Find the memory allocated to blocks for this memory context. If recurse is
+ * true, also include children.
+ */
+static inline Size
+MemoryContextMemAllocated(MemoryContext context, bool recurse)
+{
+	Size		total = context->mem_allocated;
+
+	Assert(MemoryContextIsValid(context));
+
+	if (recurse)
+	{
+		for (MemoryContext curr = context->firstchild;
+			 curr != NULL;
+			 curr = MemoryContextTraverseNext(curr, context))
+		{
+			total += curr->mem_allocated;
+		}
+	}
+
+	return total;
+}
+
 #endif							/* MEMNODES_H */
diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h
index 8abc26abce2..4f153d0e067 100644
--- a/src/include/utils/memutils.h
+++ b/src/include/utils/memutils.h
@@ -83,7 +83,6 @@ extern MemoryContext GetMemoryChunkContext(void *pointer);
 extern Size GetMemoryChunkSpace(void *pointer);
 extern MemoryContext MemoryContextGetParent(MemoryContext context);
 extern bool MemoryContextIsEmpty(MemoryContext context);
-extern Size MemoryContextMemAllocated(MemoryContext context, bool recurse);
 extern void MemoryContextMemConsumed(MemoryContext context,
 									 MemoryContextCounters *consumed);
 extern void MemoryContextStats(MemoryContext context);
-- 
2.34.1

Attachment: c.sql (application/sql)
Attachment: except.sql (application/sql)
Attachment: wide.sql (application/sql)
