Re: [PR] Improve TableScan with filters pushdown unparsing (joins) [datafusion]

via GitHub Mon, 28 Oct 2024 18:53:36 -0700


goldmedal commented on code in PR #13132:
URL: https://github.com/apache/datafusion/pull/13132#discussion_r1819970251



##########
datafusion/sql/tests/cases/plan_to_sql.rs:
##########
@@ -1008,6 +1008,93 @@ fn test_sort_with_push_down_fetch() -> Result<()> {
     Ok(())
 }
 
+#[test]
+fn test_join_with_table_scan_filters() -> Result<()> {
+    let schema_left = Schema::new(vec![
+        Field::new("id", DataType::Utf8, false),
+        Field::new("name", DataType::Utf8, false),
+    ]);
+
+    let schema_right = Schema::new(vec![
+        Field::new("id", DataType::Utf8, false),
+        Field::new("age", DataType::Utf8, false),
+    ]);
+
+    let left_plan = table_scan_with_filters(
+        Some("left_table"),
+        &schema_left,
+        None,
+        vec![col("name").like(lit("some_name"))],
+    )?
+    .alias("left")?
+    .build()?;
+
+    let right_plan = table_scan_with_filters(
+        Some("right_table"),
+        &schema_right,
+        None,
+        vec![col("age").gt(lit(10))],
+    )?
+    .build()?;
+
+    let join_plan_with_filter = LogicalPlanBuilder::from(left_plan.clone())
+        .join(
+            right_plan.clone(),
+            datafusion_expr::JoinType::Inner,
+            (vec!["left.id"], vec!["right_table.id"]),
+            Some(col("left.id").gt(lit(5))),
+        )?
+        .build()?;
+
+    let sql = plan_to_sql(&join_plan_with_filter)?;
+
+    let expected_sql = r#"SELECT * FROM left_table AS "left" JOIN right_table 
ON "left".id = right_table.id AND (("left".id > 5) AND ("left"."name" LIKE 
'some_name' AND (age > 10)))"#;
+
+    assert_eq!(sql.to_string(), expected_sql);
+
+    let join_plan_no_filter = LogicalPlanBuilder::from(left_plan.clone())
+        .join(
+            right_plan,
+            datafusion_expr::JoinType::Inner,
+            (vec!["left.id"], vec!["right_table.id"]),
+            None,
+        )?
+        .build()?;
+
+    let sql = plan_to_sql(&join_plan_no_filter)?;
+
+    let expected_sql = r#"SELECT * FROM left_table AS "left" JOIN right_table 
ON "left".id = right_table.id AND ("left"."name" LIKE 'some_name' AND (age > 
10))"#;

Review Comment:
   > @goldmedal - Is my understanding correct that tha main concern is that the 
first option is preferred as it could be executed more efficient by target 
engine?
   
   Yes, if we can push down the predicate to the table scan, it usually means 
it will perform better.
   
   I tried the subquery pattern:
   ```
   -----------------inner join-------------------
   ###### predicate in filter ######
   SQL: select o_orderkey from orders inner join (select c_custkey  from 
customer where c_name = 'Customer#000000001') on o_custkey = c_custkey
   Projection: orders.o_orderkey
     Inner Join: orders.o_custkey = customer.c_custkey
       TableScan: orders projection=[o_orderkey, o_custkey]
       Projection: customer.c_custkey
         Filter: customer.c_name = Utf8("Customer#000000001")
           TableScan: customer projection=[c_custkey, c_name], 
partial_filters=[customer.c_name = Utf8("Customer#000000001")]
   -----------------left join-------------------
   ###### predicate in filter ######
   SQL: select o_orderkey from orders left join (select c_custkey  from 
customer where c_name = 'Customer#000000001') on o_custkey = c_custkey
   Projection: orders.o_orderkey
     Left Join: orders.o_custkey = customer.c_custkey
       TableScan: orders projection=[o_orderkey, o_custkey]
       Projection: customer.c_custkey
         Filter: customer.c_name = Utf8("Customer#000000001")
           TableScan: customer projection=[c_custkey, c_name], 
partial_filters=[customer.c_name = Utf8("Customer#000000001")]
   -----------------right join-------------------
   ###### predicate in filter ######
   SQL: select o_orderkey from orders right join (select c_custkey  from 
customer where c_name = 'Customer#000000001') on o_custkey = c_custkey
   Projection: orders.o_orderkey
     Right Join: orders.o_custkey = customer.c_custkey
       TableScan: orders projection=[o_orderkey, o_custkey]
       Projection: customer.c_custkey
         Filter: customer.c_name = Utf8("Customer#000000001")
           TableScan: customer projection=[c_custkey, c_name], 
partial_filters=[customer.c_name = Utf8("Customer#000000001")]
   -----------------full join-------------------
   ###### predicate in filter ######
   SQL: select o_orderkey from orders full join (select c_custkey  from 
customer where c_name = 'Customer#000000001') on o_custkey = c_custkey
   Projection: orders.o_orderkey
     Full Join: orders.o_custkey = customer.c_custkey
       TableScan: orders projection=[o_orderkey, o_custkey]
       Projection: customer.c_custkey
         Filter: customer.c_name = Utf8("Customer#000000001")
           TableScan: customer projection=[c_custkey, c_name], 
partial_filters=[customer.c_name = Utf8("Customer#000000001")]
   ```
   
   Every predicate is pushed down to the table scan. It's better 👍
   I haven't checked the planner of other databases but I think they're similar.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Improve TableScan with filters pushdown unparsing (joins) [datafusion]

Reply via email to