goldmedal commented on code in PR #13132:
URL: https://github.com/apache/datafusion/pull/13132#discussion_r1819970251
##########
datafusion/sql/tests/cases/plan_to_sql.rs:
##########
@@ -1008,6 +1008,93 @@ fn test_sort_with_push_down_fetch() -> Result<()> {
Ok(())
}
+#[test]
+fn test_join_with_table_scan_filters() -> Result<()> {
+ let schema_left = Schema::new(vec![
+ Field::new("id", DataType::Utf8, false),
+ Field::new("name", DataType::Utf8, false),
+ ]);
+
+ let schema_right = Schema::new(vec![
+ Field::new("id", DataType::Utf8, false),
+ Field::new("age", DataType::Utf8, false),
+ ]);
+
+ let left_plan = table_scan_with_filters(
+ Some("left_table"),
+ &schema_left,
+ None,
+ vec![col("name").like(lit("some_name"))],
+ )?
+ .alias("left")?
+ .build()?;
+
+ let right_plan = table_scan_with_filters(
+ Some("right_table"),
+ &schema_right,
+ None,
+ vec![col("age").gt(lit(10))],
+ )?
+ .build()?;
+
+ let join_plan_with_filter = LogicalPlanBuilder::from(left_plan.clone())
+ .join(
+ right_plan.clone(),
+ datafusion_expr::JoinType::Inner,
+ (vec!["left.id"], vec!["right_table.id"]),
+ Some(col("left.id").gt(lit(5))),
+ )?
+ .build()?;
+
+ let sql = plan_to_sql(&join_plan_with_filter)?;
+
+ let expected_sql = r#"SELECT * FROM left_table AS "left" JOIN right_table
ON "left".id = right_table.id AND (("left".id > 5) AND ("left"."name" LIKE
'some_name' AND (age > 10)))"#;
+
+ assert_eq!(sql.to_string(), expected_sql);
+
+ let join_plan_no_filter = LogicalPlanBuilder::from(left_plan.clone())
+ .join(
+ right_plan,
+ datafusion_expr::JoinType::Inner,
+ (vec!["left.id"], vec!["right_table.id"]),
+ None,
+ )?
+ .build()?;
+
+ let sql = plan_to_sql(&join_plan_no_filter)?;
+
+ let expected_sql = r#"SELECT * FROM left_table AS "left" JOIN right_table
ON "left".id = right_table.id AND ("left"."name" LIKE 'some_name' AND (age >
10))"#;
Review Comment:
> @goldmedal - Is my understanding correct that tha main concern is that the
first option is preferred as it could be executed more efficient by target
engine?
Yes, if we can push down the predicate to the table scan, it usually means
it will perform better.
I tried the subquery pattern:
```
-----------------inner join-------------------
###### predicate in filter ######
SQL: select o_orderkey from orders inner join (select c_custkey from
customer where c_name = 'Customer#000000001') on o_custkey = c_custkey
Projection: orders.o_orderkey
Inner Join: orders.o_custkey = customer.c_custkey
TableScan: orders projection=[o_orderkey, o_custkey]
Projection: customer.c_custkey
Filter: customer.c_name = Utf8("Customer#000000001")
TableScan: customer projection=[c_custkey, c_name],
partial_filters=[customer.c_name = Utf8("Customer#000000001")]
-----------------left join-------------------
###### predicate in filter ######
SQL: select o_orderkey from orders left join (select c_custkey from
customer where c_name = 'Customer#000000001') on o_custkey = c_custkey
Projection: orders.o_orderkey
Left Join: orders.o_custkey = customer.c_custkey
TableScan: orders projection=[o_orderkey, o_custkey]
Projection: customer.c_custkey
Filter: customer.c_name = Utf8("Customer#000000001")
TableScan: customer projection=[c_custkey, c_name],
partial_filters=[customer.c_name = Utf8("Customer#000000001")]
-----------------right join-------------------
###### predicate in filter ######
SQL: select o_orderkey from orders right join (select c_custkey from
customer where c_name = 'Customer#000000001') on o_custkey = c_custkey
Projection: orders.o_orderkey
Right Join: orders.o_custkey = customer.c_custkey
TableScan: orders projection=[o_orderkey, o_custkey]
Projection: customer.c_custkey
Filter: customer.c_name = Utf8("Customer#000000001")
TableScan: customer projection=[c_custkey, c_name],
partial_filters=[customer.c_name = Utf8("Customer#000000001")]
-----------------full join-------------------
###### predicate in filter ######
SQL: select o_orderkey from orders full join (select c_custkey from
customer where c_name = 'Customer#000000001') on o_custkey = c_custkey
Projection: orders.o_orderkey
Full Join: orders.o_custkey = customer.c_custkey
TableScan: orders projection=[o_orderkey, o_custkey]
Projection: customer.c_custkey
Filter: customer.c_name = Utf8("Customer#000000001")
TableScan: customer projection=[c_custkey, c_name],
partial_filters=[customer.c_name = Utf8("Customer#000000001")]
```
Every predicate is pushed down to the table scan. It's better 👍
I haven't checked the planner of other databases but I think they're similar.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]