On Fri, 2025-02-07 at 17:13 -0800, Jeff Davis wrote:
> On Sun, 2025-01-12 at 14:54 +1300, David Rowley wrote:
> > I wonder if there's some other better way of doing this. Would it
> > be worth having some function like ExecCopySlotMinimalTuple() that
> > accepts an additional parameter so that the palloc allocates N more
> > bytes at the end?
>
> Attached a new series that implements this idea.
Attached a new patchset that doesn't change the API for
ExecCopySlotMinimalTuple(). Instead, I'm using
ExecFetchSlotMinimalTuple(), which avoids the extra memcpy if the slot
is TTSOpsMinimalTuple, which is what HashAgg uses.

I separated out the change to use ExecFetchSlotMinimalTuple() into 0004
to illustrate the performance impact.

DATA:

  create table a (a text not null);
  insert into a select repeat(md5(a::text),10) from generate_Series(1,1000) a;
  vacuum freeze analyze a;
  create table b (b text not null);
  insert into b select repeat(md5(b::text),10) from generate_Series(1001,2000) b;
  vacuum freeze analyze b;

QUERY: select a,count(*) from a group by a;

  master: tps = 2054.742658 (without initial connection time)
  0001:   tps = 2068.620429 (without initial connection time)
  0002:   tps = 2027.046422 (without initial connection time)
  0003:   tps = 1951.392904 (without initial connection time)
  0004:   tps = 2071.690037 (without initial connection time)

QUERY: select * from a except select * from b;

  master: tps = 1720.168862 (without initial connection time)
  0001:   tps = 1703.040810 (without initial connection time)
  0002:   tps = 1687.191715 (without initial connection time)
  0003:   tps = 1579.975535 (without initial connection time)
  0004:   tps = 1616.741447 (without initial connection time)

So the GROUP BY query shows no regression (because there's no additional
copy), and the EXCEPT query shows about a 6% regression.

The v6 series doesn't have that regression for set operations, but it
requires the somewhat-messy ExecCopySlotMinimalTupleExtra() API. If you
think you'll use that in nodeMemoize, then the API change is worth it.
If not, it feels a bit like a hack.

Another thing I did in the v7 series is that I stored the additional
data before the firstTuple. It seems cleaner to store the fixed-length
data first -- I could do the same thing if we go with
ExecCopySlotMinimalTupleExtra().

In any case, it seems like we have agreement to switch to the Bump
context, so I'll do another round of tests to see if there are any
downsides, then clean it up and commit v7-0001.

Regards,
	Jeff Davis
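As an illustration of the layout change described above (the fixed-length
additional data stored immediately before firstTuple in a single
allocation, as in v7-0003), here is a minimal standalone C sketch. The
types and names (FakeMinimalTuple, FakeHashEntry, entry_get_additional)
are simplified stand-ins for illustration only, not the real PostgreSQL
structs or functions:

/*
 * Standalone sketch of the single-allocation entry layout:
 *   [ additional data (zeroed, MAXALIGNed) ][ copy of the minimal tuple ]
 * The entry keeps only the firstTuple pointer; the additional-data pointer
 * is recovered by subtracting additionalsize.  Assumes 8-byte max alignment
 * for simplicity.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

#define MAXALIGN(len) (((uintptr_t) (len) + 7) & ~(uintptr_t) 7)

typedef struct FakeMinimalTuple		/* stand-in for MinimalTuple */
{
	uint32_t	t_len;				/* total length of the tuple */
	char		data[];				/* tuple body */
} FakeMinimalTuple;

typedef struct FakeHashEntry		/* stand-in for TupleHashEntryData */
{
	FakeMinimalTuple *firstTuple;	/* additional data lives just before this */
} FakeHashEntry;

/* analogue of TupleHashEntryGetAdditional() after the patch */
static void *
entry_get_additional(FakeHashEntry *entry, size_t additionalsize)
{
	return (char *) entry->firstTuple - additionalsize;
}

int
main(void)
{
	size_t		additionalsize = MAXALIGN(sizeof(int64_t));	/* e.g. one counter */
	const char *key = "example group key";
	FakeHashEntry entry;
	FakeMinimalTuple *tmp;
	int64_t    *counter;
	char	   *data;
	size_t		totalsize;

	/* build a throwaway "minimal tuple" (would live in the temp context) */
	tmp = malloc(sizeof(FakeMinimalTuple) + strlen(key) + 1);
	tmp->t_len = (uint32_t) (sizeof(FakeMinimalTuple) + strlen(key) + 1);
	memcpy(tmp->data, key, strlen(key) + 1);

	/* one allocation: additional data first, then the tuple copy */
	totalsize = additionalsize + tmp->t_len;
	data = malloc(totalsize);
	memset(data, 0, additionalsize);

	entry.firstTuple = (FakeMinimalTuple *) (data + additionalsize);
	memcpy(entry.firstTuple, tmp, tmp->t_len);
	free(tmp);

	/* recover the additional data by pointer arithmetic and use it */
	counter = entry_get_additional(&entry, additionalsize);
	(*counter)++;

	printf("group key: %s, pergroup counter: %lld\n",
		   entry.firstTuple->data, (long long) *counter);
	free(data);
	return 0;
}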
From b0023cde3798a481c7a7f1f453be3d037ef4c2f5 Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Wed, 8 Jan 2025 11:46:35 -0800 Subject: [PATCH v7 1/4] HashAgg: use Bump allocator for hash TupleHashTable entries. The entries aren't freed until the entire hash table is destroyed, so use the Bump allocator to improve allocation speed, avoid wasting space on the chunk header, and avoid wasting space due to the power-of-two allocations. Discussion: https://postgr.es/m/caaphdvqv1anb4cm36fzrwivxrevbo_lsg_eq3nqdxtjecaa...@mail.gmail.com Reviewed-by: David Rowley --- src/backend/executor/nodeAgg.c | 110 +++++++++++++++++---- src/include/nodes/execnodes.h | 5 +- src/test/regress/expected/groupingsets.out | 2 +- src/test/regress/sql/groupingsets.sql | 2 +- 4 files changed, 97 insertions(+), 22 deletions(-) diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index ae9cc256b8c..340a2010101 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -406,6 +406,7 @@ static void build_hash_tables(AggState *aggstate); static void build_hash_table(AggState *aggstate, int setno, long nbuckets); static void hashagg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck); +static void hash_create_memory(AggState *aggstate); static long hash_choose_num_buckets(double hashentrysize, long ngroups, Size memory); static int hash_choose_num_partitions(double input_groups, @@ -1512,7 +1513,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets) { AggStatePerHash perhash = &aggstate->perhash[setno]; MemoryContext metacxt = aggstate->hash_metacxt; - MemoryContext hashcxt = aggstate->hashcontext->ecxt_per_tuple_memory; + MemoryContext tablecxt = aggstate->hash_tablecxt; MemoryContext tmpcxt = aggstate->tmpcontext->ecxt_per_tuple_memory; Size additionalsize; @@ -1538,7 +1539,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets) nbuckets, additionalsize, metacxt, - hashcxt, + tablecxt, tmpcxt, DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit)); } @@ -1709,15 +1710,24 @@ hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace) tupleWidth); Size pergroupSize = numTrans * sizeof(AggStatePerGroupData); - tupleChunkSize = CHUNKHDRSZ + tupleSize; - - if (pergroupSize > 0) - pergroupChunkSize = CHUNKHDRSZ + pergroupSize; - else - pergroupChunkSize = 0; + /* + * Entries use the Bump allocator, so the chunk sizes are the same as the + * requested sizes. + */ + tupleChunkSize = tupleSize; + pergroupChunkSize = pergroupSize; + /* + * Transition values use AllocSet, which has a chunk header and also uses + * power-of-two allocations. 
+ */ if (transitionSpace > 0) - transitionChunkSize = CHUNKHDRSZ + transitionSpace; + { + Size pow2 = (Size) 1 << my_log2(transitionSpace); + + /* use Max to protect against overflow */ + transitionChunkSize = Max(CHUNKHDRSZ + pow2, transitionSpace); + } else transitionChunkSize = 0; @@ -1867,8 +1877,11 @@ hash_agg_check_limits(AggState *aggstate) uint64 ngroups = aggstate->hash_ngroups_current; Size meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true); - Size hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, - true); + Size entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt, + true); + Size tval_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, + true); + Size total_mem = meta_mem + entry_mem + tval_mem; bool do_spill = false; #ifdef USE_INJECTION_POINTS @@ -1887,7 +1900,7 @@ hash_agg_check_limits(AggState *aggstate) * can be sure to make progress even in edge cases. */ if (aggstate->hash_ngroups_current > 0 && - (meta_mem + hashkey_mem > aggstate->hash_mem_limit || + (total_mem > aggstate->hash_mem_limit || ngroups > aggstate->hash_ngroups_limit)) { do_spill = true; @@ -1942,6 +1955,7 @@ static void hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) { Size meta_mem; + Size entry_mem; Size hashkey_mem; Size buffer_mem; Size total_mem; @@ -1953,7 +1967,10 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) /* memory for the hash table itself */ meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true); - /* memory for the group keys and transition states */ + /* memory for hash entries */ + entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt, true); + + /* memory for byref transition states */ hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, true); /* memory for read/write tape buffers, if spilled */ @@ -1962,7 +1979,7 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) buffer_mem += HASHAGG_READ_BUFFER_SIZE; /* update peak mem */ - total_mem = meta_mem + hashkey_mem + buffer_mem; + total_mem = meta_mem + entry_mem + hashkey_mem + buffer_mem; if (total_mem > aggstate->hash_mem_peak) aggstate->hash_mem_peak = total_mem; @@ -1984,6 +2001,58 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) } } +/* + * Create memory contexts used for hash aggregation. + */ +static void +hash_create_memory(AggState *aggstate) +{ + Size minContextSize = ALLOCSET_DEFAULT_MINSIZE; + Size initBlockSize = ALLOCSET_DEFAULT_INITSIZE; + Size maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE; + + /* + * The hashcontext's per-tuple memory will be used for byref transition + * values and returned by AggCheckCallContext(). + */ + aggstate->hashcontext = CreateWorkExprContext(aggstate->ss.ps.state); + + /* + * The meta context will be used for the bucket array of + * TupleHashEntryData (or arrays, in the case of grouping sets). As the + * hash table grows, the bucket array will double in size and the old one + * will be freed, so an AllocSet is appropriate. For large bucket arrays, + * the large allocation path will be used, so it's not worth worrying + * about wasting space due to power-of-two allocations. 
+ */ + aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt, + "HashAgg meta context", + ALLOCSET_DEFAULT_SIZES); + + /* + * The hash entries themselves, which include the grouping key + * (firstTuple) and pergroup data, are stored in the table context. The + * bump allocator can be used because the entries are not freed until the + * entire hash table is reset. The bump allocator is faster for + * allocations and avoids wasting space on the chunk header or + * power-of-two allocations. + * + * Like CreateWorkExprContext(), use smaller sizings for smaller work_mem, + * to avoid large jumps in memory usage. + */ + while (16 * maxBlockSize > work_mem * 1024L) + maxBlockSize >>= 1; + + if (maxBlockSize < ALLOCSET_DEFAULT_INITSIZE) + maxBlockSize = ALLOCSET_DEFAULT_INITSIZE; + + aggstate->hash_tablecxt = BumpContextCreate(aggstate->ss.ps.state->es_query_cxt, + "HashAgg table context", + minContextSize, initBlockSize, + maxBlockSize); + +} + /* * Choose a reasonable number of buckets for the initial hash table size. */ @@ -2647,6 +2716,7 @@ agg_refill_hash_table(AggState *aggstate) /* free memory and reset hash tables */ ReScanExprContext(aggstate->hashcontext); + MemoryContextReset(aggstate->hash_tablecxt); for (int setno = 0; setno < aggstate->num_hashes; setno++) ResetTupleHashTable(aggstate->perhash[setno].hashtable); @@ -3331,7 +3401,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) } if (use_hashing) - aggstate->hashcontext = CreateWorkExprContext(estate); + hash_create_memory(aggstate); ExecAssignExprContext(estate, &aggstate->ss.ps); @@ -3626,9 +3696,6 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) Plan *outerplan = outerPlan(node); uint64 totalGroups = 0; - aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt, - "HashAgg meta context", - ALLOCSET_DEFAULT_SIZES); aggstate->hash_spill_rslot = ExecInitExtraTupleSlot(estate, scanDesc, &TTSOpsMinimalTuple); aggstate->hash_spill_wslot = ExecInitExtraTupleSlot(estate, scanDesc, @@ -4373,6 +4440,12 @@ ExecEndAgg(AggState *node) MemoryContextDelete(node->hash_metacxt); node->hash_metacxt = NULL; } + if (node->hash_tablecxt != NULL) + { + MemoryContextDelete(node->hash_tablecxt); + node->hash_tablecxt = NULL; + } + for (transno = 0; transno < node->numtrans; transno++) { @@ -4489,6 +4562,7 @@ ExecReScanAgg(AggState *node) node->hash_ngroups_current = 0; ReScanExprContext(node->hashcontext); + MemoryContextReset(node->hash_tablecxt); /* Rebuild an empty hash table */ build_hash_tables(node); node->table_filled = false; diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index e2d1dc1e067..1aacad936af 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -2563,7 +2563,8 @@ typedef struct AggState /* these fields are used in AGG_HASHED and AGG_MIXED modes: */ bool table_filled; /* hash table filled yet? 
*/ int num_hashes; - MemoryContext hash_metacxt; /* memory for hash table itself */ + MemoryContext hash_metacxt; /* memory for hash table bucket array */ + MemoryContext hash_tablecxt; /* memory for hash table entries */ struct LogicalTapeSet *hash_tapeset; /* tape set for hash spill tapes */ struct HashAggSpill *hash_spills; /* HashAggSpill for each grouping set, * exists only during first pass */ @@ -2589,7 +2590,7 @@ typedef struct AggState * per-group pointers */ /* support for evaluation of agg input expressions: */ -#define FIELDNO_AGGSTATE_ALL_PERGROUPS 53 +#define FIELDNO_AGGSTATE_ALL_PERGROUPS 54 AggStatePerGroup *all_pergroups; /* array of first ->pergroups, than * ->hash_pergroup */ SharedAggInfo *shared_info; /* one entry per worker */ diff --git a/src/test/regress/expected/groupingsets.out b/src/test/regress/expected/groupingsets.out index d7c9b44605d..0e86e31fe90 100644 --- a/src/test/regress/expected/groupingsets.out +++ b/src/test/regress/expected/groupingsets.out @@ -1711,7 +1711,7 @@ explain (costs off) -> Seq Scan on tenk1 (9 rows) -set work_mem = '384kB'; +set work_mem = '256kB'; explain (costs off) select unique1, count(two), count(four), count(ten), diff --git a/src/test/regress/sql/groupingsets.sql b/src/test/regress/sql/groupingsets.sql index 21cd3121940..10f3c36d5c6 100644 --- a/src/test/regress/sql/groupingsets.sql +++ b/src/test/regress/sql/groupingsets.sql @@ -467,7 +467,7 @@ explain (costs off) count(*) from tenk1 group by grouping sets (unique1,hundred,ten,four,two); -set work_mem = '384kB'; +set work_mem = '256kB'; explain (costs off) select unique1, count(two), count(four), count(ten), -- 2.34.1
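For the block-size scaling in hash_create_memory() above, here is a
standalone arithmetic sketch (ordinary C, not part of the patch;
table_max_block_size is a hypothetical helper) showing how the loop caps
a single bump block at roughly 1/16 of the hash memory limit, so small
work_mem settings get small blocks:

/*
 * Sketch of the maxBlockSize scaling used for the HashAgg table context.
 * work_mem is in kB; block sizes are in bytes.  Constants match the
 * AllocSet defaults (8kB initial, 8MB max).
 */
#include <stdio.h>

#define ALLOCSET_DEFAULT_INITSIZE	(8 * 1024)
#define ALLOCSET_DEFAULT_MAXSIZE	(8 * 1024 * 1024)

static long
table_max_block_size(long work_mem_kb)
{
	long		maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE;

	/* halve until a single block is no more than ~1/16 of the limit */
	while (16 * maxBlockSize > work_mem_kb * 1024L)
		maxBlockSize >>= 1;

	/* but never go below the default initial block size */
	if (maxBlockSize < ALLOCSET_DEFAULT_INITSIZE)
		maxBlockSize = ALLOCSET_DEFAULT_INITSIZE;

	return maxBlockSize;
}

int
main(void)
{
	long		settings[] = {64, 256, 4096, 65536};	/* work_mem in kB */

	for (int i = 0; i < 4; i++)
		printf("work_mem = %ldkB -> maxBlockSize = %ldkB\n",
			   settings[i], table_max_block_size(settings[i]) / 1024);
	return 0;
}

With the defaults this gives 8kB blocks at work_mem = 64kB (clamped to the
floor), 16kB at 256kB, 256kB at 4MB, and 4MB at 64MB; the full 8MB maximum
is only reached once work_mem is 128MB or more.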
From d76e51114cb4e50fd36905ffcb446652fa64a62c Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Mon, 13 Jan 2025 14:37:02 -0800 Subject: [PATCH v7 2/4] Refactor: accessors for TupleHashEntryData. Previously, callers were accessing fields directly, making it more difficult to optimize. A subsequent patch will combine firstTuple and the additionalsize into a single allocation. Discussion: https://postgr.es/m/817d244237878cebdff0bc363718feaf49a1ea7d.camel%40j-davis.com --- src/backend/executor/execGrouping.c | 7 ++++-- src/backend/executor/nodeAgg.c | 32 +++++++++++++--------------- src/backend/executor/nodeSetOp.c | 23 +++++++++++--------- src/backend/executor/nodeSubplan.c | 2 +- src/include/executor/executor.h | 33 +++++++++++++++++++++++++++++ src/include/nodes/execnodes.h | 1 + 6 files changed, 68 insertions(+), 30 deletions(-) diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c index 33b124fbb0a..450fe74bec5 100644 --- a/src/backend/executor/execGrouping.c +++ b/src/backend/executor/execGrouping.c @@ -196,6 +196,7 @@ BuildTupleHashTable(PlanState *parent, hashtable->tab_collations = collations; hashtable->tablecxt = tablecxt; hashtable->tempcxt = tempcxt; + hashtable->additionalsize = additionalsize; hashtable->tableslot = NULL; /* will be made on first lookup */ hashtable->inputslot = NULL; hashtable->in_hash_expr = NULL; @@ -479,11 +480,13 @@ LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot, { /* created new entry */ *isnew = true; - /* zero caller data */ - entry->additional = NULL; + MemoryContextSwitchTo(hashtable->tablecxt); + /* Copy the first tuple into the table context */ entry->firstTuple = ExecCopySlotMinimalTuple(slot); + + entry->additional = palloc0(hashtable->additionalsize); } } else diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index 340a2010101..af0b7b27383 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -1494,7 +1494,7 @@ build_hash_tables(AggState *aggstate) #ifdef USE_INJECTION_POINTS if (IS_INJECTION_POINT_ATTACHED("hash-aggregate-oversize-table")) { - nbuckets = memory / sizeof(TupleHashEntryData); + nbuckets = memory / TupleHashEntrySize(); INJECTION_POINT_CACHED("hash-aggregate-oversize-table"); } #endif @@ -1732,7 +1732,7 @@ hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace) transitionChunkSize = 0; return - sizeof(TupleHashEntryData) + + TupleHashEntrySize() + tupleChunkSize + pergroupChunkSize + transitionChunkSize; @@ -1996,7 +1996,7 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) if (aggstate->hash_ngroups_current > 0) { aggstate->hashentrysize = - sizeof(TupleHashEntryData) + + TupleHashEntrySize() + (hashkey_mem / (double) aggstate->hash_ngroups_current); } } @@ -2149,11 +2149,7 @@ initialize_hash_entry(AggState *aggstate, TupleHashTable hashtable, if (aggstate->numtrans == 0) return; - pergroup = (AggStatePerGroup) - MemoryContextAlloc(hashtable->tablecxt, - sizeof(AggStatePerGroupData) * aggstate->numtrans); - - entry->additional = pergroup; + pergroup = (AggStatePerGroup) TupleHashEntryGetAdditional(hashtable, entry); /* * Initialize aggregates for new tuple group, lookup_hash_entries() @@ -2217,7 +2213,7 @@ lookup_hash_entries(AggState *aggstate) { if (isnew) initialize_hash_entry(aggstate, hashtable, entry); - pergroup[setno] = entry->additional; + pergroup[setno] = TupleHashEntryGetAdditional(hashtable, entry); } else { @@ -2750,6 +2746,7 @@ 
agg_refill_hash_table(AggState *aggstate) INJECTION_POINT("hash-aggregate-process-batch"); for (;;) { + TupleHashTable hashtable = perhash->hashtable; TupleTableSlot *spillslot = aggstate->hash_spill_rslot; TupleTableSlot *hashslot = perhash->hashslot; TupleHashEntry entry; @@ -2770,14 +2767,14 @@ agg_refill_hash_table(AggState *aggstate) prepare_hash_slot(perhash, aggstate->tmpcontext->ecxt_outertuple, hashslot); - entry = LookupTupleHashEntryHash(perhash->hashtable, hashslot, + entry = LookupTupleHashEntryHash(hashtable, hashslot, p_isnew, hash); if (entry != NULL) { if (isnew) - initialize_hash_entry(aggstate, perhash->hashtable, entry); - aggstate->hash_pergroup[batch->setno] = entry->additional; + initialize_hash_entry(aggstate, hashtable, entry); + aggstate->hash_pergroup[batch->setno] = TupleHashEntryGetAdditional(hashtable, entry); advance_aggregates(aggstate); } else @@ -2869,7 +2866,7 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate) ExprContext *econtext; AggStatePerAgg peragg; AggStatePerGroup pergroup; - TupleHashEntryData *entry; + TupleHashEntry entry; TupleTableSlot *firstSlot; TupleTableSlot *result; AggStatePerHash perhash; @@ -2895,6 +2892,7 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate) */ for (;;) { + TupleHashTable hashtable = perhash->hashtable; TupleTableSlot *hashslot = perhash->hashslot; int i; @@ -2903,7 +2901,7 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate) /* * Find the next entry in the hash table */ - entry = ScanTupleHashTable(perhash->hashtable, &perhash->hashiter); + entry = ScanTupleHashTable(hashtable, &perhash->hashiter); if (entry == NULL) { int nextset = aggstate->current_set + 1; @@ -2918,7 +2916,7 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate) perhash = &aggstate->perhash[aggstate->current_set]; - ResetTupleHashIterator(perhash->hashtable, &perhash->hashiter); + ResetTupleHashIterator(hashtable, &perhash->hashiter); continue; } @@ -2941,7 +2939,7 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate) * Transform representative tuple back into one with the right * columns. 
*/ - ExecStoreMinimalTuple(entry->firstTuple, hashslot, false); + ExecStoreMinimalTuple(TupleHashEntryGetTuple(entry), hashslot, false); slot_getallattrs(hashslot); ExecClearTuple(firstSlot); @@ -2957,7 +2955,7 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate) } ExecStoreVirtualTuple(firstSlot); - pergroup = (AggStatePerGroup) entry->additional; + pergroup = (AggStatePerGroup) TupleHashEntryGetAdditional(hashtable, entry); /* * Use the representative input tuple for any references to diff --git a/src/backend/executor/nodeSetOp.c b/src/backend/executor/nodeSetOp.c index 5b7ff9c3748..cbdc717850c 100644 --- a/src/backend/executor/nodeSetOp.c +++ b/src/backend/executor/nodeSetOp.c @@ -424,7 +424,9 @@ setop_fill_hash_table(SetOpState *setopstate) for (;;) { TupleTableSlot *outerslot; + TupleHashTable hashtable = setopstate->hashtable; TupleHashEntryData *entry; + SetOpStatePerGroup pergroup; bool isnew; outerslot = ExecProcNode(outerPlan); @@ -433,20 +435,20 @@ setop_fill_hash_table(SetOpState *setopstate) have_tuples = true; /* Find or build hashtable entry for this tuple's group */ - entry = LookupTupleHashEntry(setopstate->hashtable, + entry = LookupTupleHashEntry(hashtable, outerslot, &isnew, NULL); + pergroup = TupleHashEntryGetAdditional(hashtable, entry); /* If new tuple group, initialize counts to zero */ if (isnew) { - entry->additional = (SetOpStatePerGroup) - MemoryContextAllocZero(setopstate->hashtable->tablecxt, - sizeof(SetOpStatePerGroupData)); + pergroup->numLeft = 0; + pergroup->numRight = 0; } /* Advance the counts */ - ((SetOpStatePerGroup) entry->additional)->numLeft++; + pergroup->numLeft++; /* Must reset expression context after each hashtable lookup */ ResetExprContext(econtext); @@ -464,6 +466,7 @@ setop_fill_hash_table(SetOpState *setopstate) */ for (;;) { + TupleHashTable hashtable = setopstate->hashtable; TupleTableSlot *innerslot; TupleHashEntryData *entry; @@ -472,13 +475,13 @@ setop_fill_hash_table(SetOpState *setopstate) break; /* For tuples not seen previously, do not make hashtable entry */ - entry = LookupTupleHashEntry(setopstate->hashtable, + entry = LookupTupleHashEntry(hashtable, innerslot, NULL, NULL); /* Advance the counts if entry is already present */ if (entry) - ((SetOpStatePerGroup) entry->additional)->numRight++; + ((SetOpStatePerGroup) TupleHashEntryGetAdditional(hashtable, entry))->numRight++; /* Must reset expression context after each hashtable lookup */ ResetExprContext(econtext); @@ -496,7 +499,7 @@ setop_fill_hash_table(SetOpState *setopstate) static TupleTableSlot * setop_retrieve_hash_table(SetOpState *setopstate) { - TupleHashEntryData *entry; + TupleHashEntry entry; TupleTableSlot *resultTupleSlot; /* @@ -526,12 +529,12 @@ setop_retrieve_hash_table(SetOpState *setopstate) * See if we should emit any copies of this tuple, and if so return * the first copy. 
*/ - set_output_count(setopstate, (SetOpStatePerGroup) entry->additional); + set_output_count(setopstate, (SetOpStatePerGroup) TupleHashEntryGetAdditional(setopstate->hashtable, entry)); if (setopstate->numOutput > 0) { setopstate->numOutput--; - return ExecStoreMinimalTuple(entry->firstTuple, + return ExecStoreMinimalTuple(TupleHashEntryGetTuple(entry), resultTupleSlot, false); } diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c index 49767ed6a52..f7f6fc2da0b 100644 --- a/src/backend/executor/nodeSubplan.c +++ b/src/backend/executor/nodeSubplan.c @@ -753,7 +753,7 @@ findPartialMatch(TupleHashTable hashtable, TupleTableSlot *slot, { CHECK_FOR_INTERRUPTS(); - ExecStoreMinimalTuple(entry->firstTuple, hashtable->tableslot, false); + ExecStoreMinimalTuple(TupleHashEntryGetTuple(entry), hashtable->tableslot, false); if (!execTuplesUnequal(slot, hashtable->tableslot, numCols, keyColIdx, eqfunctions, diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 30e2a82346f..ee93b479c92 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -157,6 +157,39 @@ extern TupleHashEntry FindTupleHashEntry(TupleHashTable hashtable, ExprState *hashexpr); extern void ResetTupleHashTable(TupleHashTable hashtable); +#ifndef FRONTEND +/* + * Return size of the hash bucket. Useful for estimating memory usage. + */ +static inline size_t +TupleHashEntrySize(void) +{ + return sizeof(TupleHashEntryData); +} + +/* + * Return tuple from hash entry. + */ +static inline MinimalTuple +TupleHashEntryGetTuple(TupleHashEntry entry) +{ + return entry->firstTuple; +} + +/* + * Get a pointer into the additional space allocated for this entry. The + * amount of space available is the additionalsize specified to + * BuildTupleHashTable(). If additionalsize was specified as zero, no + * additional space is available and this function should not be called. + */ +static inline void * +TupleHashEntryGetAdditional(TupleHashTable hashtable, TupleHashEntry entry) +{ + Assert(entry->additional != NULL); + return entry->additional; +} +#endif + /* * prototypes from functions in execJunk.c */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 1aacad936af..11aa110cf9c 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -860,6 +860,7 @@ typedef struct TupleHashTableData Oid *tab_collations; /* collations for hash and comparison */ MemoryContext tablecxt; /* memory context containing table */ MemoryContext tempcxt; /* context for function evaluations */ + Size additionalsize; /* size of additional data */ TupleTableSlot *tableslot; /* slot for referencing table entries */ /* The following fields are set transiently for each table search: */ TupleTableSlot *inputslot; /* current input tuple's slot */ -- 2.34.1
From 3e21d4328af39e52df0e1ec49a8234a328506f75 Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Tue, 14 Jan 2025 16:17:00 -0800 Subject: [PATCH v7 3/4] Combine firstTuple and additionalsize into a single allocation. Saves space in TupleHashEntryData by removing the "additional" pointer. TupleHashEntryData is required for each bucket, whether occupied or not, so that space is more valuable than other hash table allocations. Discussion: https://postgr.es/m/817d244237878cebdff0bc363718feaf49a1ea7d.camel%40j-davis.com --- src/backend/executor/execGrouping.c | 33 +++++++++++++++++++++++------ src/include/executor/executor.h | 9 ++++---- src/include/nodes/execnodes.h | 1 - 3 files changed, 31 insertions(+), 12 deletions(-) diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c index 450fe74bec5..76b16101dbd 100644 --- a/src/backend/executor/execGrouping.c +++ b/src/backend/executor/execGrouping.c @@ -196,7 +196,7 @@ BuildTupleHashTable(PlanState *parent, hashtable->tab_collations = collations; hashtable->tablecxt = tablecxt; hashtable->tempcxt = tempcxt; - hashtable->additionalsize = additionalsize; + hashtable->additionalsize = MAXALIGN(additionalsize); hashtable->tableslot = NULL; /* will be made on first lookup */ hashtable->inputslot = NULL; hashtable->in_hash_expr = NULL; @@ -478,15 +478,34 @@ LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot, } else { + char *data; + MinimalTuple tmpTuple; + Size totalsize; + /* created new entry */ *isnew = true; - MemoryContextSwitchTo(hashtable->tablecxt); - - /* Copy the first tuple into the table context */ - entry->firstTuple = ExecCopySlotMinimalTuple(slot); - - entry->additional = palloc0(hashtable->additionalsize); + /* get minimal tuple in temp context */ + MemoryContextSwitchTo(hashtable->tempcxt); + tmpTuple = ExecCopySlotMinimalTuple(slot); + + /* + * Allocate space for additionalsize followed by the MinimalTuple + * in the table context, and copy the tuple. A single allocation + * avoids the need to store two pointers in TupleHashEntryData. + */ + totalsize = hashtable->additionalsize + tmpTuple->t_len; + data = MemoryContextAlloc(hashtable->tablecxt, totalsize); + memset(data, 0, hashtable->additionalsize); + + /* + * firstTuple points in the middle of the allocated space. The + * additional data pointer can be computed by subtracting + * additionalsize from firstTuple. + */ + data += hashtable->additionalsize; + entry->firstTuple = (MinimalTuple) data; + memcpy(entry->firstTuple, tmpTuple, tmpTuple->t_len); } } else diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index ee93b479c92..344c5f8a54b 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -178,15 +178,16 @@ TupleHashEntryGetTuple(TupleHashEntry entry) /* * Get a pointer into the additional space allocated for this entry. The - * amount of space available is the additionalsize specified to - * BuildTupleHashTable(). If additionalsize was specified as zero, no + * memory will be maxaligned and zeroed. + * + * The amount of space available is the additionalsize requested in the call + * to BuildTupleHashTable(). If additionalsize was specified as zero, no * additional space is available and this function should not be called. 
*/ static inline void * TupleHashEntryGetAdditional(TupleHashTable hashtable, TupleHashEntry entry) { - Assert(entry->additional != NULL); - return entry->additional; + return (char *) entry->firstTuple - hashtable->additionalsize; } #endif diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 11aa110cf9c..a875335cb09 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -837,7 +837,6 @@ typedef struct TupleHashTableData *TupleHashTable; typedef struct TupleHashEntryData { MinimalTuple firstTuple; /* copy of first tuple in this group */ - void *additional; /* user data */ uint32 status; /* hash status */ uint32 hash; /* hash value (cached) */ } TupleHashEntryData; -- 2.34.1
From c8f534aa7954fa30ee3271a1cf157652f086b559 Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Wed, 12 Feb 2025 13:05:26 -0800
Subject: [PATCH v7 4/4] Use ExecFetchSlotMinimalTuple().

---
 src/backend/executor/execGrouping.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index 76b16101dbd..3712bcdf6f5 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -481,13 +481,14 @@ LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot,
 		char	   *data;
 		MinimalTuple tmpTuple;
 		Size		totalsize;
+		bool		shouldFree;
 
 		/* created new entry */
 		*isnew = true;
 
 		/* get minimal tuple in temp context */
 		MemoryContextSwitchTo(hashtable->tempcxt);
-		tmpTuple = ExecCopySlotMinimalTuple(slot);
+		tmpTuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);
 
 		/*
 		 * Allocate space for additionalsize followed by the MinimalTuple
-- 
2.34.1