sgrebnov commented on code in PR #13132:
URL: https://github.com/apache/datafusion/pull/13132#discussion_r1819442675


##########
datafusion/sql/tests/cases/plan_to_sql.rs:
##########
@@ -1008,6 +1008,93 @@ fn test_sort_with_push_down_fetch() -> Result<()> {
     Ok(())
 }
 
+#[test]
+fn test_join_with_table_scan_filters() -> Result<()> {
+    let schema_left = Schema::new(vec![
+        Field::new("id", DataType::Utf8, false),
+        Field::new("name", DataType::Utf8, false),
+    ]);
+
+    let schema_right = Schema::new(vec![
+        Field::new("id", DataType::Utf8, false),
+        Field::new("age", DataType::Utf8, false),
+    ]);
+
+    let left_plan = table_scan_with_filters(
+        Some("left_table"),
+        &schema_left,
+        None,
+        vec![col("name").like(lit("some_name"))],
+    )?
+    .alias("left")?
+    .build()?;
+
+    let right_plan = table_scan_with_filters(
+        Some("right_table"),
+        &schema_right,
+        None,
+        vec![col("age").gt(lit(10))],
+    )?
+    .build()?;
+
+    let join_plan_with_filter = LogicalPlanBuilder::from(left_plan.clone())
+        .join(
+            right_plan.clone(),
+            datafusion_expr::JoinType::Inner,
+            (vec!["left.id"], vec!["right_table.id"]),
+            Some(col("left.id").gt(lit(5))),
+        )?
+        .build()?;
+
+    let sql = plan_to_sql(&join_plan_with_filter)?;
+
+    let expected_sql = r#"SELECT * FROM left_table AS "left" JOIN right_table 
ON "left".id = right_table.id AND (("left".id > 5) AND ("left"."name" LIKE 
'some_name' AND (age > 10)))"#;
+
+    assert_eq!(sql.to_string(), expected_sql);
+
+    let join_plan_no_filter = LogicalPlanBuilder::from(left_plan.clone())
+        .join(
+            right_plan,
+            datafusion_expr::JoinType::Inner,
+            (vec!["left.id"], vec!["right_table.id"]),
+            None,
+        )?
+        .build()?;
+
+    let sql = plan_to_sql(&join_plan_no_filter)?;
+
+    let expected_sql = r#"SELECT * FROM left_table AS "left" JOIN right_table 
ON "left".id = right_table.id AND ("left"."name" LIKE 'some_name' AND (age > 
10))"#;

Review Comment:
   @goldmedal - thank you for deep review. I suspect that filters were not 
fully pushed down for the right join and full join cases by DF during 
optimization for samples above as two test queries are not exactly the same as 
how records are filtered:
   ```
   `on o_custkey = c_custkey where c_name = 'Customer#000000001'  <-- filter is 
applied after join, fully filter out non matching records
   `on o_custkey = c_custkey and c_name = 'Customer#000000001'` <-- filtering 
is done during join, will join/include NULL
   ```
   
   It seems in all examples above the original `WHERE` was moved inside Join by 
optimizer (all cases), so optimized plan should be unparsed as below for right 
join, for example
   
   `select o_orderkey from orders right join (select c_custkey  from customer 
where c_name = 'Customer#000000001') on o_custkey = c_custkey`
   
   and we will translate it to
   
   ```
   select o_orderkey from orders right join customers on o_custkey = c_custkey 
and c_name = 'Customer#000000001'
   ```
   
   @goldmedal  - Is my understanding correct that tha main concern is that the 
first option is preferred as it could be executed more efficient by target 
engine?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to