goldmedal commented on code in PR #13132:
URL: https://github.com/apache/datafusion/pull/13132#discussion_r1819298737
##########
datafusion/sql/tests/cases/plan_to_sql.rs:
##########
@@ -1008,6 +1008,93 @@ fn test_sort_with_push_down_fetch() -> Result<()> {
Ok(())
}
+#[test]
+fn test_join_with_table_scan_filters() -> Result<()> {
+ let schema_left = Schema::new(vec![
+ Field::new("id", DataType::Utf8, false),
+ Field::new("name", DataType::Utf8, false),
+ ]);
+
+ let schema_right = Schema::new(vec![
+ Field::new("id", DataType::Utf8, false),
+ Field::new("age", DataType::Utf8, false),
+ ]);
+
+ let left_plan = table_scan_with_filters(
+ Some("left_table"),
+ &schema_left,
+ None,
+ vec![col("name").like(lit("some_name"))],
+ )?
+ .alias("left")?
+ .build()?;
+
+ let right_plan = table_scan_with_filters(
+ Some("right_table"),
+ &schema_right,
+ None,
+ vec![col("age").gt(lit(10))],
+ )?
+ .build()?;
+
+ let join_plan_with_filter = LogicalPlanBuilder::from(left_plan.clone())
+ .join(
+ right_plan.clone(),
+ datafusion_expr::JoinType::Inner,
+ (vec!["left.id"], vec!["right_table.id"]),
+ Some(col("left.id").gt(lit(5))),
+ )?
+ .build()?;
+
+ let sql = plan_to_sql(&join_plan_with_filter)?;
+
+ let expected_sql = r#"SELECT * FROM left_table AS "left" JOIN right_table
ON "left".id = right_table.id AND (("left".id > 5) AND ("left"."name" LIKE
'some_name' AND (age > 10)))"#;
+
+ assert_eq!(sql.to_string(), expected_sql);
+
+ let join_plan_no_filter = LogicalPlanBuilder::from(left_plan.clone())
+ .join(
+ right_plan,
+ datafusion_expr::JoinType::Inner,
+ (vec!["left.id"], vec!["right_table.id"]),
+ None,
+ )?
+ .build()?;
+
+ let sql = plan_to_sql(&join_plan_no_filter)?;
+
+ let expected_sql = r#"SELECT * FROM left_table AS "left" JOIN right_table
ON "left".id = right_table.id AND ("left"."name" LIKE 'some_name' AND (age >
10))"#;
Review Comment:
I noticed you put the pushdown condition in the join condition instead of
the `WHERE`. In my opinion, the SQL plan will be different if we sometimes put
the condition in a different place.
I did some tests for different join type and different place (join condition
or filter) in DataFusion
```rust
let join_type = vec![
"inner join", "left join", "right join", "full join"
];
for join in join_type {
println!("-----------------{join}-------------------");
println!("###### predicate in filter ######");
let sql = format!("select o_orderkey from orders {join} customer on
o_custkey = c_custkey where c_name = 'Customer#000000001'");
println!("SQL: {}", sql);
match ctx.sql(&sql).await?.into_optimized_plan() {
Ok(plan) => {println!("{plan}")},
Err(e) => eprintln!("Error: {}", e),
}
println!("###### predicate in join condition ######");
let sql = format!("select o_orderkey from orders {join} customer on
o_custkey = c_custkey and c_name = 'Customer#000000001'");
println!("SQL: {}", sql);
match ctx.sql(&sql).await?.into_optimized_plan() {
Ok(plan) => {println!("{plan}")},
Err(e) => eprintln!("Error: {}", e),
}
}
```
The result is
```
-----------------inner join-------------------
###### predicate in filter ######
SQL: select o_orderkey from orders inner join customer on o_custkey =
c_custkey where c_name = 'Customer#000000001'
Projection: orders.o_orderkey
Inner Join: orders.o_custkey = customer.c_custkey
TableScan: orders projection=[o_orderkey, o_custkey]
Projection: customer.c_custkey
Filter: customer.c_name = Utf8("Customer#000000001")
TableScan: customer projection=[c_custkey, c_name],
partial_filters=[customer.c_name = Utf8("Customer#000000001")]
###### predicate in join condition ######
SQL: select o_orderkey from orders inner join customer on o_custkey =
c_custkey and c_name = 'Customer#000000001'
Projection: orders.o_orderkey
Inner Join: orders.o_custkey = customer.c_custkey
TableScan: orders projection=[o_orderkey, o_custkey]
Projection: customer.c_custkey
Filter: customer.c_name = Utf8("Customer#000000001")
TableScan: customer projection=[c_custkey, c_name],
partial_filters=[customer.c_name = Utf8("Customer#000000001")]
-----------------left join-------------------
###### predicate in filter ######
SQL: select o_orderkey from orders left join customer on o_custkey =
c_custkey where c_name = 'Customer#000000001'
Projection: orders.o_orderkey
Inner Join: orders.o_custkey = customer.c_custkey
TableScan: orders projection=[o_orderkey, o_custkey]
Projection: customer.c_custkey
Filter: customer.c_name = Utf8("Customer#000000001")
TableScan: customer projection=[c_custkey, c_name],
partial_filters=[customer.c_name = Utf8("Customer#000000001")]
###### predicate in join condition ######
SQL: select o_orderkey from orders left join customer on o_custkey =
c_custkey and c_name = 'Customer#000000001'
Projection: orders.o_orderkey
Left Join: orders.o_custkey = customer.c_custkey
TableScan: orders projection=[o_orderkey, o_custkey]
Projection: customer.c_custkey
Filter: customer.c_name = Utf8("Customer#000000001")
TableScan: customer projection=[c_custkey, c_name],
partial_filters=[customer.c_name = Utf8("Customer#000000001")]
-----------------right join-------------------
###### predicate in filter ######
SQL: select o_orderkey from orders right join customer on o_custkey =
c_custkey where c_name = 'Customer#000000001'
Projection: orders.o_orderkey
Right Join: orders.o_custkey = customer.c_custkey
TableScan: orders projection=[o_orderkey, o_custkey]
Projection: customer.c_custkey
Filter: customer.c_name = Utf8("Customer#000000001")
TableScan: customer projection=[c_custkey, c_name],
partial_filters=[customer.c_name = Utf8("Customer#000000001")]
###### predicate in join condition ######
SQL: select o_orderkey from orders right join customer on o_custkey =
c_custkey and c_name = 'Customer#000000001'
Projection: orders.o_orderkey
Right Join: orders.o_custkey = customer.c_custkey Filter: customer.c_name
= Utf8("Customer#000000001")
TableScan: orders projection=[o_orderkey, o_custkey]
TableScan: customer projection=[c_custkey, c_name]
-----------------full join-------------------
###### predicate in filter ######
SQL: select o_orderkey from orders full join customer on o_custkey =
c_custkey where c_name = 'Customer#000000001'
Projection: orders.o_orderkey
Right Join: orders.o_custkey = customer.c_custkey
TableScan: orders projection=[o_orderkey, o_custkey]
Projection: customer.c_custkey
Filter: customer.c_name = Utf8("Customer#000000001")
TableScan: customer projection=[c_custkey, c_name],
partial_filters=[customer.c_name = Utf8("Customer#000000001")]
###### predicate in join condition ######
SQL: select o_orderkey from orders full join customer on o_custkey =
c_custkey and c_name = 'Customer#000000001'
Projection: orders.o_orderkey
Full Join: orders.o_custkey = customer.c_custkey Filter: customer.c_name =
Utf8("Customer#000000001")
TableScan: orders projection=[o_orderkey, o_custkey]
TableScan: customer projection=[c_custkey, c_name]
```
We can find the plan is the same in `inner join` and `left join`. The filter
pushdown works fine. However, in `right join` and `full join` cases, if we put
the predicate in the join condition, the filter pushdown doesn't work.
In the DataFusion case, the filter pushdown always works when putting the
filter in `WHERE`.
I'm not pretty sure if it's a common rule (putting the predicate in `WHERE`
is better) for the other database. However, in the DataFusino case, we're
better to put them in `WHERE`.
By the way, this PR is ok for me now. I think it can be improved by a
follow-up PR if we care about the performance of the generated SQL.
cc @alamb @phillipleblanc
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]