On Mon, 2025-01-06 at 12:34 -0800, Jeff Davis wrote: > If we separate out 4, we can use the Bump allocator for 2 & 3.
Attached POC patch, which reduces memory usage by ~15% for a simple distinct query on an integer key. Performance is the same or perhaps a hair faster. It's not many lines of code, but the surrounding code might benefit from some refactoring which would make it a bit simpler. Regards, Jeff Davis
From 3ed8fcf2db77b51cf665125ff5c83acd3e87a816 Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Wed, 8 Jan 2025 11:46:35 -0800 Subject: [PATCH v1] HashAgg: use Bump context for hash table entries. --- src/backend/executor/nodeAgg.c | 32 +++++++++++++++++++++++++------- src/include/nodes/execnodes.h | 3 ++- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index 3005b5c0e3b..695091c7c46 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -1503,7 +1503,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets) { AggStatePerHash perhash = &aggstate->perhash[setno]; MemoryContext metacxt = aggstate->hash_metacxt; - MemoryContext hashcxt = aggstate->hashcontext->ecxt_per_tuple_memory; + MemoryContext tablecxt = aggstate->hash_tablecxt; MemoryContext tmpcxt = aggstate->tmpcontext->ecxt_per_tuple_memory; Size additionalsize; @@ -1529,7 +1529,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets) nbuckets, additionalsize, metacxt, - hashcxt, + tablecxt, tmpcxt, DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit)); } @@ -1858,15 +1858,18 @@ hash_agg_check_limits(AggState *aggstate) uint64 ngroups = aggstate->hash_ngroups_current; Size meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true); - Size hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, - true); + Size entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt, + true); + Size tval_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, + true); + Size total_mem = meta_mem + entry_mem + tval_mem; /* * Don't spill unless there's at least one group in the hash table so we * can be sure to make progress even in edge cases. */ if (aggstate->hash_ngroups_current > 0 && - (meta_mem + hashkey_mem > aggstate->hash_mem_limit || + (total_mem > aggstate->hash_mem_limit || ngroups > aggstate->hash_ngroups_limit)) { hash_agg_enter_spill_mode(aggstate); @@ -1917,6 +1920,7 @@ static void hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) { Size meta_mem; + Size entry_mem; Size hashkey_mem; Size buffer_mem; Size total_mem; @@ -1928,7 +1932,10 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) /* memory for the hash table itself */ meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true); - /* memory for the group keys and transition states */ + /* memory for hash entries */ + entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt, true); + + /* memory for byref transition states */ hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, true); /* memory for read/write tape buffers, if spilled */ @@ -1937,7 +1944,7 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) buffer_mem += HASHAGG_READ_BUFFER_SIZE; /* update peak mem */ - total_mem = meta_mem + hashkey_mem + buffer_mem; + total_mem = meta_mem + entry_mem + hashkey_mem + buffer_mem; if (total_mem > aggstate->hash_mem_peak) aggstate->hash_mem_peak = total_mem; @@ -2622,6 +2629,7 @@ agg_refill_hash_table(AggState *aggstate) /* free memory and reset hash tables */ ReScanExprContext(aggstate->hashcontext); + MemoryContextReset(aggstate->hash_tablecxt); for (int setno = 0; setno < aggstate->num_hashes; setno++) ResetTupleHashTable(aggstate->perhash[setno].hashtable); @@ -3587,6 +3595,9 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt, "HashAgg meta context", ALLOCSET_DEFAULT_SIZES); + aggstate->hash_tablecxt = BumpContextCreate(aggstate->ss.ps.state->es_query_cxt, + "HashAgg table context", + ALLOCSET_DEFAULT_SIZES); aggstate->hash_spill_rslot = ExecInitExtraTupleSlot(estate, scanDesc, &TTSOpsMinimalTuple); aggstate->hash_spill_wslot = ExecInitExtraTupleSlot(estate, scanDesc, @@ -4331,6 +4342,12 @@ ExecEndAgg(AggState *node) MemoryContextDelete(node->hash_metacxt); node->hash_metacxt = NULL; } + if (node->hash_tablecxt != NULL) + { + MemoryContextDelete(node->hash_tablecxt); + node->hash_tablecxt = NULL; + } + for (transno = 0; transno < node->numtrans; transno++) { @@ -4447,6 +4464,7 @@ ExecReScanAgg(AggState *node) node->hash_ngroups_current = 0; ReScanExprContext(node->hashcontext); + MemoryContextReset(node->hash_tablecxt); /* Rebuild an empty hash table */ build_hash_tables(node); node->table_filled = false; diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index b3f7aa299f5..434adf2300c 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -2532,7 +2532,8 @@ typedef struct AggState /* these fields are used in AGG_HASHED and AGG_MIXED modes: */ bool table_filled; /* hash table filled yet? */ int num_hashes; - MemoryContext hash_metacxt; /* memory for hash table itself */ + MemoryContext hash_metacxt; /* memory for hash table bucket array */ + MemoryContext hash_tablecxt; /* memory for hash table entries */ struct LogicalTapeSet *hash_tapeset; /* tape set for hash spill tapes */ struct HashAggSpill *hash_spills; /* HashAggSpill for each grouping set, * exists only during first pass */ -- 2.34.1