Hi, hackers!

I found one pull-up that works if the inner join condition is written through the where condition,

|create temp table ta (id int primary key, val int); insert into ta values(1,1); insert into ta values(2,2); ||insert into ta values(3,3);|

|create temp table tb (id int primary key, aval int); insert into tb values(4,1); insert into tb values(5,1); insert into tb values(1,2); create temp table tc (id int primary key, aid int); insert into tc values(6,1); insert into tc values(7,2);|

|EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF) SELECT * FROM ta WHERE EXISTS (SELECT * FROM tb, tc WHERE ta.id = tb.id);|
                               QUERY PLAN
-------------------------------------------------------------------------
 Nested Loop Semi Join (actual rows=1 loops=1)
   Buffers: local hit=6
   ->  Seq Scan on ta (actual rows=3 loops=1)
         Buffers: local hit=1
   ->  Nested Loop (actual rows=0 loops=3)
         Buffers: local hit=5
         ->  Index Only Scan using tb_pkey on tb (actual rows=0 loops=3)
               Index Cond: (id = ta.id)
               Heap Fetches: 1
               Buffers: local hit=4
         ->  Seq Scan on tc (actual rows=1 loops=1)
               Buffers: local hit=1
 Planning:
   Buffers: shared hit=67 read=12
(14 rows)

but it doesn't work if it is written through the outside condition.

|alena@postgres=# EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF) SELECT * FROM ta WHERE EXISTS (SELECT * FROM tb JOIN tc ON ta.id = tb.id); QUERY PLAN ------------------------------------------------------ Seq Scan on ta (actual rows=1 loops=1) Filter: EXISTS(SubPlan 1) Rows Removed by Filter: 2 Buffers: local hit=5 SubPlan 1 -> Nested Loop (actual rows=0 loops=3) Buffers: local hit=4 -> Seq Scan on tb (actual rows=0 loops=3) Filter: (ta.id = id) Rows Removed by Filter: 3 Buffers: local hit=3 -> Seq Scan on tc (actual rows=1 loops=1) Buffers: local hit=1 Planning: Buffers: shared hit=16 read=9 (15 rows) |

|I have written a patch to add this functionality and now it gives an query plan: |

|alena@postgres=# EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF)
 SELECT *
   FROM ta
  WHERE EXISTS (SELECT *
                  FROM tb JOIN tc
                  ON ta.id = tb.id);
                     QUERY PLAN
-------------------------------------------------------------------------
 Nested Loop Semi Join (actual rows=1 loops=1)
   Buffers: local hit=6
   ->  Seq Scan on ta (actual rows=3 loops=1)
         Buffers: local hit=1
   ->  Nested Loop (actual rows=0 loops=3)
         Buffers: local hit=5
         ->  Index Only Scan using tb_pkey on tb (actual rows=0 loops=3)
               Index Cond: (id = ta.id)
               Heap Fetches: 1
               Buffers: local hit=4
         ->  Seq Scan on tc (actual rows=1 loops=1)
               Buffers: local hit=1
(12 rows)|

tb and tc form a Cartesian product, but in the case of the intersection condition with tuples from the table ta (ta.id = tb.id). So, according to the join condition, tb intersects only with 1, and only it gets into the result, but at the same time they appear twice - this is because of the Cartesian product of tb with tc

|*How it works:*
|

I rewrote the code a bit so that it considers not only the quals in jointree->quals, but also those in join expression (subselect->jointree->fromlist). If they satisfy the conditions for using pull up, I add them to the list of clauses and form a "Bool" expression from them, joined by an "AND" operation.

--

Regards, Alena Rybakina Postgres Professional
From 3b3d761cd4d67e299cdbd3c2e6cf5256f27da5eb Mon Sep 17 00:00:00 2001
From: Alena Rybakina <a.rybak...@postgrespro.ru>
Date: Tue, 24 Dec 2024 07:29:42 +0300
Subject: [PATCH] Add EXISTS pull up if subquery join expressions  are
 independent and have a reference only to the outer part of the subquery  or
 they are constant. We should transform expression onto subselect query  with
 using on the referenced table. We need only to know that the table is  not
 empty.

This query:
SELECT *
   FROM ta
  WHERE EXISTS (SELECT *
                  FROM tb
                   left outer JOIN tc
                    ON ta.id = tb.id);
can be transformed to:
select * from ta where exists (select * from tb limit 1);
---
 src/backend/optimizer/plan/subselect.c  |  94 ++++++++++++++-
 src/test/regress/expected/subselect.out | 151 ++++++++++++++++++++++++
 src/test/regress/sql/subselect.sql      |  65 ++++++++++
 3 files changed, 305 insertions(+), 5 deletions(-)

diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c
index ed62e3a0fcf..9b2b6addfaf 100644
--- a/src/backend/optimizer/plan/subselect.c
+++ b/src/backend/optimizer/plan/subselect.c
@@ -1376,6 +1376,18 @@ convert_EXISTS_sublink_to_join(PlannerInfo *root, SubLink *sublink,
 	int			varno;
 	Relids		clause_varnos;
 	Relids		upper_varnos;
+	ListCell *lc;
+	List *clauses = NIL;
+	List *all_clauses = NIL;
+	int first_elem = true;
+	Const *const_var = makeConst(BOOLOID,
+									-1,
+									InvalidOid,
+									sizeof(bool),
+									(Datum) 1,
+									false,
+									true);
+
 
 	Assert(sublink->subLinkType == EXISTS_SUBLINK);
 
@@ -1403,14 +1415,86 @@ convert_EXISTS_sublink_to_join(PlannerInfo *root, SubLink *sublink,
 	 * with noplace to evaluate the targetlist.
 	 */
 	if (!simplify_EXISTS_query(root, subselect))
-		return NULL;
+	{
+			return NULL;
+	}
+
+	if (subselect->jointree->quals)
+		all_clauses = lappend(all_clauses, subselect->jointree->quals);
+
+	subselect->jointree->quals = NULL;
+
+	/* Gather all clauses in main list for the further consideration */
+	all_clauses = list_concat(all_clauses, subselect->jointree->fromlist);
 
 	/*
-	 * Separate out the WHERE clause.  (We could theoretically also remove
-	 * top-level plain JOIN/ON clauses, but it's probably not worth the
-	 * trouble.)
+	 * We will able to remove top-level plain JOIN/ON clauses if they are not outer join.
 	 */
-	whereClause = subselect->jointree->quals;
+	foreach (lc, all_clauses)
+	{
+		Node *je = ((Node *) lfirst(lc));
+
+		whereClause = je;
+
+		if (IsA(je, RangeTblRef))
+		{
+			goto end;
+		}
+
+		if ((IsA(je, JoinExpr) && ((JoinExpr *)je)->jointype != JOIN_INNER))
+		{
+			goto end;
+		}
+
+		if (IsA(je, JoinExpr) && ((JoinExpr *)je)->quals != NULL)
+			whereClause = ((JoinExpr *)je)->quals;
+
+		/*
+		* On the other hand, the WHERE clause must contain some Vars of the
+		* parent query, else it's not gonna be a join.
+		*/
+		if (!contain_vars_of_level(whereClause, 1))
+		{
+			goto end;
+		}
+
+		/*
+		* We don't risk optimizing if the WHERE clause is volatile, either.
+		*/
+		if (contain_volatile_functions(whereClause))
+		{
+			goto end;
+		}
+
+		/*
+		 * In case of a successful attempt, replaces it with the correct condition
+		 */
+		if (IsA(je, JoinExpr))
+			((JoinExpr *)je)->quals = (Node *) const_var;
+
+		clauses = lappend(clauses, whereClause);
+
+		first_elem = false;
+		subselect->jointree->fromlist = list_delete_ptr(subselect->jointree->fromlist, lc);
+
+		end:
+			if (first_elem)
+				return NULL;
+			continue;
+	}
+
+	list_free(all_clauses);
+
+	/* We don't have any cluses for pull-up creation */
+	if (clauses == NIL)
+		return NULL;
+	else
+		/* We can easily combine clauses through AND operator because they are independent */
+		whereClause = list_length(clauses) > 1 ?
+							(Node *) makeBoolExpr(AND_EXPR, clauses, -1) :
+							(Node *) linitial(clauses);
+
+
 	subselect->jointree->quals = NULL;
 
 	/*
diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out
index ebc545e2461..6806fa9bb06 100644
--- a/src/test/regress/expected/subselect.out
+++ b/src/test/regress/expected/subselect.out
@@ -812,6 +812,157 @@ where exists (
       from text_tbl ) ss
   where road.name = ss.f1 );
 rollback;
+-- Test case for exist sublink where we can consider some undependent expression
+-- with outer link
+--
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tb.id);
+                               QUERY PLAN                                
+-------------------------------------------------------------------------
+ Nested Loop Semi Join (actual rows=2 loops=1)
+   ->  Seq Scan on ta (actual rows=2 loops=1)
+   ->  Nested Loop (actual rows=1 loops=2)
+         ->  Index Only Scan using tb_pkey on tb (actual rows=1 loops=2)
+               Index Cond: (id = ta.id)
+               Heap Fetches: 2
+         ->  Seq Scan on tc (actual rows=1 loops=2)
+(7 rows)
+
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tc.id);
+                               QUERY PLAN                                
+-------------------------------------------------------------------------
+ Nested Loop Semi Join (actual rows=2 loops=1)
+   ->  Seq Scan on ta (actual rows=2 loops=1)
+   ->  Nested Loop (actual rows=1 loops=2)
+         ->  Index Only Scan using tc_pkey on tc (actual rows=1 loops=2)
+               Index Cond: (id = ta.id)
+               Heap Fetches: 2
+         ->  Seq Scan on tb (actual rows=1 loops=2)
+(7 rows)
+
+-- Join compound expression
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tc.id and
+                       ta.id = tb.id);
+                         QUERY PLAN                          
+-------------------------------------------------------------
+ Hash Right Semi Join (actual rows=2 loops=1)
+   Hash Cond: (tc.id = ta.id)
+   ->  Hash Join (actual rows=2 loops=1)
+         Hash Cond: (tb.id = tc.id)
+         ->  Seq Scan on tb (actual rows=4 loops=1)
+         ->  Hash (actual rows=2 loops=1)
+               Buckets: 4096  Batches: 1  Memory Usage: 33kB
+               ->  Seq Scan on tc (actual rows=2 loops=1)
+   ->  Hash (actual rows=2 loops=1)
+         Buckets: 4096  Batches: 1  Memory Usage: 33kB
+         ->  Seq Scan on ta (actual rows=2 loops=1)
+(11 rows)
+
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tc.id and
+                       ta.id = tb.id);
+                         QUERY PLAN                          
+-------------------------------------------------------------
+ Hash Right Semi Join (actual rows=2 loops=1)
+   Hash Cond: (tc.id = ta.id)
+   ->  Hash Join (actual rows=2 loops=1)
+         Hash Cond: (tb.id = tc.id)
+         ->  Seq Scan on tb (actual rows=4 loops=1)
+         ->  Hash (actual rows=2 loops=1)
+               Buckets: 4096  Batches: 1  Memory Usage: 33kB
+               ->  Seq Scan on tc (actual rows=2 loops=1)
+   ->  Hash (actual rows=2 loops=1)
+         Buckets: 4096  Batches: 1  Memory Usage: 33kB
+         ->  Seq Scan on ta (actual rows=2 loops=1)
+(11 rows)
+
+-- Compound expression with const type
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tc.id and
+                       ta.id = 1);
+                               QUERY PLAN                                
+-------------------------------------------------------------------------
+ Nested Loop Semi Join (actual rows=1 loops=1)
+   ->  Index Only Scan using ta_pkey on ta (actual rows=1 loops=1)
+         Index Cond: (id = 1)
+         Heap Fetches: 1
+   ->  Nested Loop (actual rows=1 loops=1)
+         ->  Index Only Scan using tc_pkey on tc (actual rows=1 loops=1)
+               Index Cond: (id = 1)
+               Heap Fetches: 1
+         ->  Seq Scan on tb (actual rows=1 loops=1)
+(9 rows)
+
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON tb.id = 1 and
+                       ta.id = 1);
+                               QUERY PLAN                                
+-------------------------------------------------------------------------
+ Nested Loop Semi Join (actual rows=1 loops=1)
+   ->  Index Only Scan using ta_pkey on ta (actual rows=1 loops=1)
+         Index Cond: (id = 1)
+         Heap Fetches: 1
+   ->  Nested Loop (actual rows=1 loops=1)
+         ->  Index Only Scan using tb_pkey on tb (actual rows=1 loops=1)
+               Index Cond: (id = 1)
+               Heap Fetches: 1
+         ->  Seq Scan on tc (actual rows=1 loops=1)
+(9 rows)
+
+-- Disabled pull up because it is applcapable for INNER JOIN connection
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  RIGHT JOIN tc
+                    ON ta.id = tc.id);
+                         QUERY PLAN                         
+------------------------------------------------------------
+ Seq Scan on ta (actual rows=2 loops=1)
+   Filter: EXISTS(SubPlan 1)
+   SubPlan 1
+     ->  Nested Loop Left Join (actual rows=1 loops=2)
+           Join Filter: (ta.id = tc.id)
+           Rows Removed by Join Filter: 2
+           ->  Seq Scan on tc (actual rows=1 loops=2)
+           ->  Materialize (actual rows=2 loops=2)
+                 Storage: Memory  Maximum Storage: 17kB
+                 ->  Seq Scan on tb (actual rows=4 loops=1)
+(10 rows)
+
 --
 -- Test case for sublinks pushed down into subselects via join alias expansion
 --
diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql
index 6ed3636a9e4..f73eb65aaa0 100644
--- a/src/test/regress/sql/subselect.sql
+++ b/src/test/regress/sql/subselect.sql
@@ -439,6 +439,71 @@ where exists (
 
 rollback;
 
+-- Test case for exist sublink where we can consider some undependent expression
+-- with outer link
+--
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tb.id);
+
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tc.id);
+
+-- Join compound expression
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tc.id and
+                       ta.id = tb.id);
+
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tc.id and
+                       ta.id = tb.id);
+
+-- Compound expression with const type
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tc.id and
+                       ta.id = 1);
+
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON tb.id = 1 and
+                       ta.id = 1);
+-- Disabled pull up because it is applcapable for INNER JOIN connection
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  RIGHT JOIN tc
+                    ON ta.id = tc.id);
+
 --
 -- Test case for sublinks pushed down into subselects via join alias expansion
 --
-- 
2.34.1

Reply via email to