Currently, an executor node can hold onto large data structures until
ExecEndNode(). For complex plans, that means we can use a lot more
memory than we need to. For instance, see the SQL at the end of this
email.

(Note: this work is not intended to implement any kind of query-level
work_mem enforcement. It reduces the peak number of work_mem-sized
allocations that certain kinds of complex plans hold at once, which may
help with that problem, but enforcing a query-level limit is not my
goal right now.)

The attached proof-of-concept patch (0001) makes the Hash node call
ExecShutdownNode() on its outer subplan once that subplan is exhausted,
and extends ExecShutdownHashJoin() to release the hash table's memory.
I added some code to measure the peak memory usage (0002), and it went
from about 200MB to about 150MB. The savings roughly correspond to the
memory used by the first Hash node's table (the one built over t1),
which is no longer needed once the outer Hash table is fully built.

I'm not 100% sure that ExecShutdownNode() is the right place to do
this, but it seems better than introducing yet another executor API.
Thoughts?

We'd need to figure out what to do about rescan. One option is to just
say that if EXEC_FLAG_REWIND is specified, we never free the resources
eagerly. But if the node is under a Memoize, it's unlikely to be
rescanned with the same parameters, so whatever state it already has is
probably useless anyway.
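
For instance, a minimal sketch of that option (eager_shutdown is a
hypothetical new HashState field, not part of the attached patches):

   /* In ExecInitHash(): remember whether a rewind is possible. */
   hashstate->eager_shutdown = (eflags & EXEC_FLAG_REWIND) == 0;

   /* In MultiExecPrivateHash(): only shut down the subplan when no
    * rewind can happen. */
   slot = ExecProcNode(outerNode);
   if (TupIsNull(slot))
   {
       if (node->eager_shutdown)
           ExecShutdownNode(outerNode);
       break;
   }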

Also, are there any major challenges making this work with parallel
query?
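
One starting point might be to restrict the eager destroy to
backend-private hash tables, since (as I understand it) a parallel hash
table's memory lives in DSA and is released by the detach calls. A
sketch for ExecShutdownHashJoin(), again an assumption on my part and
not part of the patches:

   /* Decide before the detach calls whether the table is
    * backend-private; a shared table's memory goes away with the DSA
    * segment, so only a private table needs an explicit destroy. */
   bool        private_table = (node->hj_HashTable->parallel_state == NULL);

   ExecHashTableDetachBatch(node->hj_HashTable);
   ExecHashTableDetach(node->hj_HashTable);

   if (private_table)
   {
       ExecHashTableDestroy(node->hj_HashTable);
       node->hj_HashTable = NULL;
   }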

Regards,
        Jeff Davis

Example:

   CREATE TABLE t1(id1 int8 primary key);
   CREATE TABLE t2(id2 int8 primary key, id1 int8);
   CREATE TABLE t3(id3 int8, id2 int8, n NUMERIC);
   INSERT INTO t1 SELECT g FROM generate_series(1, 1000000) g;
   INSERT INTO t2 SELECT g+1000000000, g
     FROM generate_series(1, 1000000) g;
   INSERT INTO t3 SELECT g+2000000000, g+1000000000, 3.14159
     FROM generate_series(1, 1000000) g;
   INSERT INTO t3 SELECT
       random(1,1000000) + 3000000000,
       random(1,1000000) + 3000000000,
       -1
     FROM generate_series(1, 10000000) g;

   VACUUM ANALYZE;

   SET work_mem = '1GB';
   SET from_collapse_limit = 1;
   SET enable_sort = off;
   SET enable_nestloop = off;
   SET max_parallel_workers = 0;
   SET max_parallel_workers_per_gather = 0;
   EXPLAIN (COSTS OFF)
     SELECT id1, COUNT(*) FROM
        t3,
        (SELECT t1.id1, t2.id2 FROM t1, t2 WHERE t2.id1 = t1.id1) s
     WHERE t3.id2 = s.id2
     GROUP BY id1;
                       QUERY PLAN                    
   --------------------------------------------------
    HashAggregate
      Group Key: t1.id1
      ->  Hash Join
            Hash Cond: (t3.id2 = t2.id2)
            ->  Seq Scan on t3
            ->  Hash
                  ->  Hash Join
                        Hash Cond: (t2.id1 = t1.id1)
                        ->  Seq Scan on t2
                        ->  Hash
                              ->  Seq Scan on t1

From bba63770ba061c3056e7e998640740f0a0018ada Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Fri, 15 Aug 2025 15:54:42 -0700
Subject: [PATCH 1/2] release memory in ExecShutdownHashJoin().

---
 src/backend/executor/execProcnode.c | 4 +++-
 src/backend/executor/nodeHash.c     | 3 +++
 src/backend/executor/nodeHashjoin.c | 3 +++
 src/include/nodes/execnodes.h       | 2 ++
 4 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index f5f9cfbeead..e21a037e9e2 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -777,7 +777,7 @@ ExecShutdownNode(PlanState *node)
 static bool
 ExecShutdownNode_walker(PlanState *node, void *context)
 {
-	if (node == NULL)
+	if (node == NULL || node->shutdown)
 		return false;
 
 	check_stack_depth();
@@ -825,6 +825,8 @@ ExecShutdownNode_walker(PlanState *node, void *context)
 	if (node->instrument && node->instrument->running)
 		InstrStopNode(node->instrument, 0);
 
+	node->shutdown = true;
+
 	return false;
 }
 
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 8d2201ab67f..7c22dd035bc 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -164,7 +164,10 @@ MultiExecPrivateHash(HashState *node)
 
 		slot = ExecProcNode(outerNode);
 		if (TupIsNull(slot))
+		{
+			ExecShutdownNode(outerNode);
 			break;
+		}
 		/* We have to compute the hash value */
 		econtext->ecxt_outertuple = slot;
 
diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c
index 5661ad76830..cf3c5e5cc6f 100644
--- a/src/backend/executor/nodeHashjoin.c
+++ b/src/backend/executor/nodeHashjoin.c
@@ -1591,6 +1591,9 @@ ExecShutdownHashJoin(HashJoinState *node)
 		 */
 		ExecHashTableDetachBatch(node->hj_HashTable);
 		ExecHashTableDetach(node->hj_HashTable);
+
+		ExecHashTableDestroy(node->hj_HashTable);
+		node->hj_HashTable = NULL;
 	}
 }
 
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index e107d6e5f81..4e45faa4a66 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1205,6 +1205,8 @@ typedef struct PlanState
 
 	bool		async_capable;	/* true if node is async-capable */
 
+	bool		shutdown;		/* ExecShutdownNode() called */
+
 	/*
 	 * Scanslot's descriptor if known. This is a bit of a hack, but otherwise
 	 * it's hard for expression compilation to optimize based on the
-- 
2.43.0

From 7a08dbac3f40afd50de2f8f7c3a2a5e7e13397fb Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Fri, 15 Aug 2025 16:35:46 -0700
Subject: [PATCH 2/2] measure peak memory usage

---
 src/backend/executor/execMain.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index b8b9d2a85f7..1021e714b66 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -1636,6 +1636,8 @@ ExecutePlan(QueryDesc *queryDesc,
 	bool		use_parallel_mode;
 	TupleTableSlot *slot;
 	uint64		current_tuple_count;
+	size_t		mem_current = 0;
+	size_t		mem_hwm = 0;
 
 	/*
 	 * initialize local variables
@@ -1677,6 +1679,10 @@ ExecutePlan(QueryDesc *queryDesc,
 		 */
 		slot = ExecProcNode(planstate);
 
+		mem_current = MemoryContextMemAllocated(TopMemoryContext, true);
+		if (mem_current > mem_hwm)
+			mem_hwm = mem_current;
+
 		/*
 		 * if the tuple is null, then we assume there is nothing more to
 		 * process so we just end the loop...
@@ -1737,6 +1743,13 @@ ExecutePlan(QueryDesc *queryDesc,
 
 	if (use_parallel_mode)
 		ExitParallelMode();
+
+	mem_current = MemoryContextMemAllocated(TopMemoryContext, true);
+	if (mem_current > mem_hwm)
+		mem_hwm = mem_current;
+
+	MemoryContextStats(TopMemoryContext);
+	elog(NOTICE, "mem_hwm: %zu", mem_hwm);
 }
 
 
-- 
2.43.0
