Currently, an executor node can hold onto large data structures until ExecEndNode(). For complex plans, that means we can use a lot more memory than we need to. For instance, see the SQL at the end of this email.
(Note: this work is not intended to implement any kind of query-level
work_mem enforcement. It reduces the maximum number of times that
work_mem is used in certain kinds of complex plans, which may help with
that problem, but enforcing that is not my goal right now.)

The attached proof-of-concept patch (0001) makes the Hash node call
ExecShutdownNode() on the subplan when it's exhausted, and extends
ExecShutdownHashJoin() to release memory. I added some code to measure
the peak memory usage (0002), and it went from about 200MB to about
150MB. The savings roughly correspond to the memory used by the first
Hash node.

I'm not 100% sure that ExecShutdownNode() is the right place to do
this, but it seems better than introducing yet another executor API.
Thoughts?

We'd need to figure out what to do about rescan. One option is to just
say that if EXEC_FLAG_REWIND is used, then the node would never free
its resources eagerly (a rough sketch of such a guard follows the
example below). But if the node is under a Memoize, it's unlikely to be
rescanned with the same parameters, so whatever state it already has
might be useless anyway.

Also, are there any major challenges to making this work with parallel
query?

Regards,
	Jeff Davis

Example:

CREATE TABLE t1(id1 int8 primary key);
CREATE TABLE t2(id2 int8 primary key, id1 int8);
CREATE TABLE t3(id3 int8, id2 int8, n NUMERIC);

INSERT INTO t1 SELECT g FROM generate_series(1, 1000000) g;
INSERT INTO t2 SELECT g+1000000000, g FROM generate_series(1, 1000000) g;
INSERT INTO t3 SELECT g+2000000000, g+1000000000, 3.14159
  FROM generate_series(1, 1000000) g;
INSERT INTO t3 SELECT random(1,1000000) + 3000000000,
                      random(1,1000000) + 3000000000, -1
  FROM generate_series(1, 10000000) g;
VACUUM ANALYZE;

SET work_mem = '1GB';
SET from_collapse_limit = 1;
SET enable_sort = off;
SET enable_nestloop = off;
SET max_parallel_workers = 0;
SET max_parallel_workers_per_gather = 0;

EXPLAIN (COSTS OFF)
SELECT id1, COUNT(*)
  FROM t3,
       (SELECT t1.id1, t2.id2 FROM t1, t2 WHERE t2.id1 = t1.id1) s
  WHERE t3.id2 = s.id2
  GROUP BY id1;

                    QUERY PLAN
--------------------------------------------------
 HashAggregate
   Group Key: t1.id1
   ->  Hash Join
         Hash Cond: (t3.id2 = t2.id2)
         ->  Seq Scan on t3
         ->  Hash
               ->  Hash Join
                     Hash Cond: (t2.id1 = t1.id1)
                     ->  Seq Scan on t2
                     ->  Hash
                           ->  Seq Scan on t1
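For illustration, here's a rough sketch of the EXEC_FLAG_REWIND guard
mentioned above. It is not part of the attached patches, and the
eager_free field is a made-up name:

	/* Sketch only -- assumes a new HashState.eager_free flag. */

	/* In ExecInitHash(): eager freeing is safe only if no rewind
	 * is expected. */
	hashstate->eager_free = (eflags & EXEC_FLAG_REWIND) == 0;

	/* In MultiExecPrivateHash(), once the subplan is exhausted: */
	if (TupIsNull(slot))
	{
		if (node->eager_free)
			ExecShutdownNode(outerNode);
		break;
	}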
From bba63770ba061c3056e7e998640740f0a0018ada Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Fri, 15 Aug 2025 15:54:42 -0700
Subject: [PATCH 1/2] release memory in ExecShutdownHashJoin().

---
 src/backend/executor/execProcnode.c | 4 +++-
 src/backend/executor/nodeHash.c     | 3 +++
 src/backend/executor/nodeHashjoin.c | 3 +++
 src/include/nodes/execnodes.h       | 2 ++
 4 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index f5f9cfbeead..e21a037e9e2 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -777,7 +777,7 @@ ExecShutdownNode(PlanState *node)
 static bool
 ExecShutdownNode_walker(PlanState *node, void *context)
 {
-	if (node == NULL)
+	if (node == NULL || node->shutdown)
 		return false;
 
 	check_stack_depth();
@@ -825,6 +825,8 @@ ExecShutdownNode_walker(PlanState *node, void *context)
 	if (node->instrument && node->instrument->running)
 		InstrStopNode(node->instrument, 0);
 
+	node->shutdown = true;
+
 	return false;
 }
 
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 8d2201ab67f..7c22dd035bc 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -164,7 +164,10 @@ MultiExecPrivateHash(HashState *node)
 		slot = ExecProcNode(outerNode);
 
 		if (TupIsNull(slot))
+		{
+			ExecShutdownNode(outerNode);
 			break;
+		}
 
 		/* We have to compute the hash value */
 		econtext->ecxt_outertuple = slot;
diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c
index 5661ad76830..cf3c5e5cc6f 100644
--- a/src/backend/executor/nodeHashjoin.c
+++ b/src/backend/executor/nodeHashjoin.c
@@ -1591,6 +1591,9 @@ ExecShutdownHashJoin(HashJoinState *node)
 		 */
 		ExecHashTableDetachBatch(node->hj_HashTable);
 		ExecHashTableDetach(node->hj_HashTable);
+
+		ExecHashTableDestroy(node->hj_HashTable);
+		node->hj_HashTable = NULL;
 	}
 }
 
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index e107d6e5f81..4e45faa4a66 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1205,6 +1205,8 @@ typedef struct PlanState
 
 	bool		async_capable;	/* true if node is async-capable */
 
+	bool		shutdown;		/* ExecShutdownNode() called */
+
 	/*
 	 * Scanslot's descriptor if known. This is a bit of a hack, but otherwise
 	 * it's hard for expression compilation to optimize based on the
-- 
2.43.0
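A note on why 0001 adds the PlanState.shutdown flag: ExecutePlan()
already calls ExecShutdownNode() on the whole plan tree once execution
finishes, so with the new eager call from MultiExecPrivateHash() a
subtree can be visited twice. The flag makes the walker idempotent;
the check is the one from the patch above, annotated here for clarity:

	/*
	 * First visit:  MultiExecPrivateHash() shuts down its exhausted
	 *               subplan and frees its memory early.
	 * Second visit: ExecutePlan() shuts down the whole tree and
	 *               reaches the same nodes again.
	 */
	if (node == NULL || node->shutdown)
		return false;		/* already shut down; skip the callbacks */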
From 7a08dbac3f40afd50de2f8f7c3a2a5e7e13397fb Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Fri, 15 Aug 2025 16:35:46 -0700
Subject: [PATCH 2/2] measure peak memory usage

---
 src/backend/executor/execMain.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index b8b9d2a85f7..1021e714b66 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -1636,6 +1636,8 @@ ExecutePlan(QueryDesc *queryDesc,
 	bool		use_parallel_mode;
 	TupleTableSlot *slot;
 	uint64		current_tuple_count;
+	size_t		mem_current = 0;
+	size_t		mem_hwm = 0;
 
 	/*
 	 * initialize local variables
@@ -1677,6 +1679,10 @@ ExecutePlan(QueryDesc *queryDesc,
 		 */
 		slot = ExecProcNode(planstate);
 
+		mem_current = MemoryContextMemAllocated(TopMemoryContext, true);
+		if (mem_current > mem_hwm)
+			mem_hwm = mem_current;
+
 		/*
 		 * if the tuple is null, then we assume there is nothing more to
 		 * process so we just end the loop...
@@ -1737,6 +1743,13 @@ ExecutePlan(QueryDesc *queryDesc,
 
 	if (use_parallel_mode)
 		ExitParallelMode();
+
+	mem_current = MemoryContextMemAllocated(TopMemoryContext, true);
+	if (mem_current > mem_hwm)
+		mem_hwm = mem_current;
+
+	MemoryContextStats(TopMemoryContext);
+	elog(NOTICE, "mem_hwm: %zu", mem_hwm);
 }
-- 
2.43.0
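One caveat on 0002: MemoryContextMemAllocated(TopMemoryContext, true)
walks the entire memory context tree, so calling it for every tuple is
expensive. That's fine for a throwaway diagnostic, but if something
like it were ever kept, the check could be sampled instead. A sketch
(the interval constant is invented for illustration, not a real GUC):

	/* Sketch only: sample the high-water mark every Nth tuple rather
	 * than on every ExecProcNode() call.  MEM_SAMPLE_INTERVAL is an
	 * arbitrary illustration. */
	#define MEM_SAMPLE_INTERVAL 1024

	if ((current_tuple_count % MEM_SAMPLE_INTERVAL) == 0)
	{
		mem_current = MemoryContextMemAllocated(TopMemoryContext, true);
		if (mem_current > mem_hwm)
			mem_hwm = mem_current;
	}

Sampling can of course miss a short-lived peak between samples, so the
per-tuple version above is the more faithful measurement.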