From bf45f8b4eafe776e5eb9b7806082b548cba2a9ca Mon Sep 17 00:00:00 2001
From: David Rowley <dgrowley@gmail.com>
Date: Wed, 30 Apr 2025 00:03:07 +1200
Subject: [PATCH v1] Teach planner to short-circuit plans with empty setop
 inputs

This adjusts UNION / INTERSECT / EXCEPT planning so that the planner
produces more optimal plans when one or more of the set operator's
subqueries has been proven to be empty (a dummy rel).

For UNION, if any of the inputs are empty, then that input can be removed
from the Append / MergeAppend.  Previously, a const-false "Result" node
would appear to represent this.  Removing empty inputs has a few extra
benefits when only 1 union child remains as it means the Append or
MergeAppend can be removed in setrefs.c and also we can provide better
n_distinct estimates by looking at the remaining input's statistics.

For INTERSECT, if either input is empty, the entire result is empty.
This is true regardless of INTERSECT or INTERSECT ALL.

For EXCEPT, if the left input is empty, then there's certainly no
result rows and there's no point in scanning the right input as there
are no rows to match those ones up to on the left input.  When EXCEPT's
right input is empty, we can simply return all rows on the left input
when it's EXCEPT ALL, otherwise, for EXCEPT without ALL, we must
de-duplicate the left input rows.  Row estimates for the latter should
also be better with this change as we can obtain n_distinct estimates
from the left input directly.

Author: David Rowley <dgrowleyml@gmail.com>
Discussion: https://postgr.es/m/18904-c5fea7892f4d26ed@postgresql.org
---
 src/backend/optimizer/path/joinrels.c  |   2 +
 src/backend/optimizer/prep/prepunion.c | 188 +++++++++++++++++++++++--
 src/test/regress/expected/union.out    | 120 ++++++++++++++++
 src/test/regress/sql/union.sql         |  59 ++++++++
 4 files changed, 360 insertions(+), 9 deletions(-)

diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c
index 535248aa525..43f4051bc6e 100644
--- a/src/backend/optimizer/path/joinrels.c
+++ b/src/backend/optimizer/path/joinrels.c
@@ -1298,6 +1298,8 @@ is_dummy_rel(RelOptInfo *rel)
 			path = ((ProjectionPath *) path)->subpath;
 		else if (IsA(path, ProjectSetPath))
 			path = ((ProjectSetPath *) path)->subpath;
+		else if (IsA(path, SubqueryScanPath))
+			path = ((SubqueryScanPath *) path)->subpath;
 		else
 			break;
 	}
diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index 6bd0f4a5dc3..dc2bcf8efd0 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -763,6 +763,10 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
 		RelOptInfo *rel = lfirst(lc);
 		Path	   *ordered_path;
 
+		/* Skip any UNION children that are proven not to yield any rows */
+		if (is_dummy_rel(rel))
+			continue;
+
 		cheapest_pathlist = lappend(cheapest_pathlist,
 									rel->cheapest_total_path);
 
@@ -812,6 +816,15 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
 	result_rel->consider_parallel = consider_parallel;
 	result_rel->consider_startup = (root->tuple_fraction > 0);
 
+	/* If all UNION children were dummy rels, make the resulting rel dummy */
+	if (cheapest_pathlist == NIL)
+	{
+		result_rel->reltarget = create_pathtarget(root, list_nth(tlist_list, 0));
+		mark_dummy_rel(result_rel);
+
+		return result_rel;
+	}
+
 	/*
 	 * Append the child results together using the cheapest paths from each
 	 * union child.
@@ -876,15 +889,33 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
 		bool		can_sort = grouping_is_sortable(groupList);
 		bool		can_hash = grouping_is_hashable(groupList);
 
-		/*
-		 * XXX for the moment, take the number of distinct groups as equal to
-		 * the total input size, i.e., the worst case.  This is too
-		 * conservative, but it's not clear how to get a decent estimate of
-		 * the true size.  One should note as well the propensity of novices
-		 * to write UNION rather than UNION ALL even when they don't expect
-		 * any duplicates...
-		 */
-		dNumGroups = apath->rows;
+		if (list_length(cheapest_pathlist) == 1)
+		{
+			Path	   *path = linitial(cheapest_pathlist);
+
+			/*
+			 * In the case where only one union child remains due to the
+			 * detection of one or more dummy union children, we'll obtain an
+			 * estimate on the surviving child directly.
+			 */
+			dNumGroups = estimate_num_groups(root,
+											 path->pathtarget->exprs,
+											 path->rows,
+											 NULL,
+											 NULL);
+		}
+		else
+		{
+			/*
+			 * Otherwise, for the moment, take the number of distinct groups
+			 * as equal to the total input size, i.e., the worst case.  This
+			 * is too conservative, but it's not clear how to get a decent
+			 * estimate of the true size.  One should note as well the
+			 * propensity of novices to write UNION rather than UNION ALL even
+			 * when they don't expect any duplicates...
+			 */
+			dNumGroups = apath->rows;
+		}
 
 		if (can_hash)
 		{
@@ -1148,6 +1179,145 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root,
 		result_rel->reltarget = create_setop_pathtarget(root, tlist,
 														list_make2(lpath, rpath));
 
+	/* Check for provably empty setop inputs and add short-circuit paths. */
+	if (op->op == SETOP_EXCEPT)
+	{
+		/*
+		 * For EXCEPTs, if the left side is dummy then there's no need to
+		 * inspect the right-hand side as scanning the right to find tuples to
+		 * remove won't make the left-hand input any more empty.
+		 */
+		if (is_dummy_rel(lrel))
+		{
+			result_rel->reltarget = create_pathtarget(root, lpath_tlist);
+			mark_dummy_rel(result_rel);
+
+			return result_rel;
+		}
+
+		/*
+		 * For EXCEPT, things are easy for the EXCEPT ALL case -- we can
+		 * simply scan the left-hand input.  For EXCEPT without ALL, we must
+		 * perform the work to deduplicate the results.
+		 */
+		if (is_dummy_rel(rrel))
+		{
+			if (op->all)
+			{
+				/*
+				 * EXCEPT ALL: If the right-hand input is dummy then we can
+				 * simply scan the left-hand input without further processing.
+				 */
+				add_path(result_rel, lpath);
+
+				return result_rel;
+			}
+			else
+			{
+				dNumGroups = estimate_num_groups(root,
+												 get_tlist_exprs(lpath_tlist, false),
+												 lrel->rows,
+												 NULL, NULL);
+
+				/*
+				 * EXCEPT (without ALL): Scan the left-hand input only.
+				 * Duplicates need to be removed, so an AggPath is needed.
+				 */
+				if (can_hash)
+				{
+					/*
+					 * Because the set op target list is made up with Vars
+					 * with varno==0 we must do something to allow the Agg to
+					 * find the Vars it expects for grouping.  In the normal
+					 * case, when there are no dummy setop inputs, the
+					 * SetOpPath would do this for us.  Here we make use of an
+					 * AppendPath putting the left input as the only Append
+					 * subpath.  This is a little bit of a hack, but the
+					 * single child Append doesn't cost us much as setrefs.c
+					 * will remove it later in planning.
+					 */
+					path = (Path *) create_append_path(root, result_rel, list_make1(lpath),
+													   NIL, NIL, NULL, 0, false, -1);
+
+					/* Hashed aggregate plan --- no sort needed */
+					path = (Path *) create_agg_path(root,
+													result_rel,
+													path,
+													result_rel->reltarget,
+													AGG_HASHED,
+													AGGSPLIT_SIMPLE,
+													groupList,
+													NIL,
+													NULL,
+													dNumGroups);
+					add_path(result_rel, path);
+				}
+
+				if (can_sort)
+				{
+					path = get_cheapest_path_for_pathkeys(lrel->pathlist,
+														  nonunion_pathkeys,
+														  NULL,
+														  TOTAL_COST,
+														  false);
+
+					if (path)
+					{
+						/*
+						 * As above, create a single-child Append for Var
+						 * translation purposes.
+						 */
+						path = (Path *) create_append_path(root, result_rel, list_make1(path),
+														   NIL, NIL, NULL, 0, false, -1);
+						path = (Path *)
+							create_unique_path(root,
+											   result_rel,
+											   path,
+											   list_length(path->pathkeys),
+											   dNumGroups);
+						add_path(result_rel, path);
+					}
+
+					/*
+					 * Also try sorting the cheapest Path as that might be
+					 * cheaper than the best pre-sorted Path.  Again, add the
+					 * single child Append for Var translation, as above.
+					 */
+					path = (Path *) create_append_path(root, result_rel, list_make1(lpath),
+													   NIL, NIL, NULL, 0, false, -1);
+
+					path = (Path *)
+						create_sort_path(root, result_rel, path,
+										 make_pathkeys_for_sortclauses(root, groupList, tlist),
+										 -1.0);
+					path = (Path *) create_unique_path(root,
+													   result_rel,
+													   path,
+													   list_length(path->pathkeys),
+													   dNumGroups);
+					add_path(result_rel, path);
+
+				}
+				return result_rel;
+			}
+		}
+	}
+	else
+	{
+		/*
+		 * For INTERSECT, if either input is a dummy rel then we can mark the
+		 * result_rel as dummy since intersecting with an empty relation can
+		 * never yield any results.  This is true regardless of INTERSECT or
+		 * INTERSECT ALL.
+		 */
+		if (is_dummy_rel(lrel) || is_dummy_rel(rrel))
+		{
+			result_rel->reltarget = create_pathtarget(root, lpath_tlist);
+			mark_dummy_rel(result_rel);
+			return result_rel;
+		}
+	}
+
 	/*
 	 * Estimate number of distinct groups that we'll need hashtable entries
 	 * for; this is the size of the left-hand input for EXCEPT, or the smaller
diff --git a/src/test/regress/expected/union.out b/src/test/regress/expected/union.out
index d3ea433db15..4637862fa44 100644
--- a/src/test/regress/expected/union.out
+++ b/src/test/regress/expected/union.out
@@ -1216,6 +1216,126 @@ select event_id
 
 drop table events_child, events, other_events;
 reset enable_indexonlyscan;
+--
+-- Test handling of UNION / EXCEPT / INTERSECT with provably empty inputs
+--
+-- Ensure the empty UNION input is pruned and de-duplication is done for the
+-- remaining relation.
+EXPLAIN (COSTS OFF, VERBOSE)
+SELECT two FROM tenk1 WHERE 1=2
+UNION
+SELECT four FROM tenk1
+ORDER BY 1;
+              QUERY PLAN              
+--------------------------------------
+ Sort
+   Output: tenk1.four
+   Sort Key: tenk1.four
+   ->  HashAggregate
+         Output: tenk1.four
+         Group Key: tenk1.four
+         ->  Seq Scan on public.tenk1
+               Output: tenk1.four
+(8 rows)
+
+-- Validate that the results of the above are correct
+SELECT two FROM tenk1 WHERE 1=2
+UNION
+SELECT four FROM tenk1
+ORDER BY 1;
+ two 
+-----
+   0
+   1
+   2
+   3
+(4 rows)
+
+-- All UNION inputs are proven empty.  Ensure the planner provides a
+-- const-false Result node
+EXPLAIN (COSTS OFF, VERBOSE)
+SELECT two FROM tenk1 WHERE 1=2
+UNION
+SELECT four FROM tenk1 WHERE 1=2
+UNION
+SELECT ten FROM tenk1 WHERE 1=2;
+           QUERY PLAN           
+--------------------------------
+ Result
+   Output: unnamed_subquery.two
+   Replaces: Aggregate
+   One-Time Filter: false
+(4 rows)
+
+-- Ensure the planner provides a const-false Result node
+EXPLAIN (COSTS OFF, VERBOSE)
+SELECT two FROM tenk1 WHERE 1=2
+INTERSECT
+SELECT four FROM tenk1;
+                          QUERY PLAN                           
+---------------------------------------------------------------
+ Result
+   Output: unnamed_subquery.two
+   Replaces: Aggregate on unnamed_subquery, unnamed_subquery_1
+   One-Time Filter: false
+(4 rows)
+
+-- As above, with the inputs swapped
+EXPLAIN (COSTS OFF, VERBOSE)
+SELECT four FROM tenk1
+INTERSECT
+SELECT two FROM tenk1 WHERE 1=2;
+                          QUERY PLAN                           
+---------------------------------------------------------------
+ Result
+   Output: unnamed_subquery_1.two
+   Replaces: Aggregate on unnamed_subquery, unnamed_subquery_1
+   One-Time Filter: false
+(4 rows)
+
+-- Ensure the planner provides a const-false Result node when the left input
+-- is empty
+EXPLAIN (COSTS OFF, VERBOSE)
+SELECT two FROM tenk1 WHERE 1=2
+EXCEPT
+SELECT four FROM tenk1;
+                          QUERY PLAN                           
+---------------------------------------------------------------
+ Result
+   Output: unnamed_subquery.two
+   Replaces: Aggregate on unnamed_subquery, unnamed_subquery_1
+   One-Time Filter: false
+(4 rows)
+
+-- Ensure the planner only scans the left input and that it de-duplicates it
+EXPLAIN (COSTS OFF, VERBOSE)
+SELECT two FROM tenk1
+EXCEPT
+SELECT four FROM tenk1 WHERE 1=2
+ORDER BY 1;
+              QUERY PLAN              
+--------------------------------------
+ Sort
+   Output: tenk1.two
+   Sort Key: tenk1.two
+   ->  HashAggregate
+         Output: tenk1.two
+         Group Key: tenk1.two
+         ->  Seq Scan on public.tenk1
+               Output: tenk1.two
+(8 rows)
+
+-- Check the results of the above
+SELECT two FROM tenk1
+EXCEPT
+SELECT four FROM tenk1 WHERE 1=2
+ORDER BY 1;
+ two 
+-----
+   0
+   1
+(2 rows)
+
 -- Test constraint exclusion of UNION ALL subqueries
 explain (costs off)
  SELECT * FROM
diff --git a/src/test/regress/sql/union.sql b/src/test/regress/sql/union.sql
index 13700a6bfc4..b9f432068a4 100644
--- a/src/test/regress/sql/union.sql
+++ b/src/test/regress/sql/union.sql
@@ -459,6 +459,65 @@ drop table events_child, events, other_events;
 
 reset enable_indexonlyscan;
 
+--
+-- Test handling of UNION / EXCEPT / INTERSECT with provably empty inputs
+--
+
+-- Ensure the empty UNION input is pruned and de-duplication is done for the
+-- remaining relation.
+EXPLAIN (COSTS OFF, VERBOSE)
+SELECT two FROM tenk1 WHERE 1=2
+UNION
+SELECT four FROM tenk1
+ORDER BY 1;
+
+-- Validate that the results of the above are correct
+SELECT two FROM tenk1 WHERE 1=2
+UNION
+SELECT four FROM tenk1
+ORDER BY 1;
+
+-- All UNION inputs are proven empty.  Ensure the planner provides a
+-- const-false Result node
+EXPLAIN (COSTS OFF, VERBOSE)
+SELECT two FROM tenk1 WHERE 1=2
+UNION
+SELECT four FROM tenk1 WHERE 1=2
+UNION
+SELECT ten FROM tenk1 WHERE 1=2;
+
+-- Ensure the planner provides a const-false Result node
+EXPLAIN (COSTS OFF, VERBOSE)
+SELECT two FROM tenk1 WHERE 1=2
+INTERSECT
+SELECT four FROM tenk1;
+
+-- As above, with the inputs swapped
+EXPLAIN (COSTS OFF, VERBOSE)
+SELECT four FROM tenk1
+INTERSECT
+SELECT two FROM tenk1 WHERE 1=2;
+
+-- Ensure the planner provides a const-false Result node when the left input
+-- is empty
+EXPLAIN (COSTS OFF, VERBOSE)
+SELECT two FROM tenk1 WHERE 1=2
+EXCEPT
+SELECT four FROM tenk1;
+
+-- Ensure the planner only scans the left input and that it de-duplicates it
+EXPLAIN (COSTS OFF, VERBOSE)
+SELECT two FROM tenk1
+EXCEPT
+SELECT four FROM tenk1 WHERE 1=2
+ORDER BY 1;
+
+-- Check the results of the above
+SELECT two FROM tenk1
+EXCEPT
+SELECT four FROM tenk1 WHERE 1=2
+ORDER BY 1;
+
 -- Test constraint exclusion of UNION ALL subqueries
 explain (costs off)
  SELECT * FROM
-- 
2.43.0