hareshkh commented on code in PR #21121:
URL: https://github.com/apache/datafusion/pull/21121#discussion_r2980855299


##########
datafusion/substrait/src/logical_plan/consumer/rel/join_rel.rs:
##########
@@ -89,49 +84,61 @@ pub async fn from_join_rel(
 }
 
 fn split_eq_and_noneq_join_predicate_with_nulls_equality(
-    filter: &Expr,
-) -> (Vec<(Column, Column)>, bool, Option<Expr>) {
-    let exprs = split_conjunction(filter);
+    filter: Expr,
+) -> (Vec<(Column, Column)>, NullEquality, Option<Expr>) {
+    let exprs = split_conjunction_owned(filter);
 
-    let mut accum_join_keys: Vec<(Column, Column)> = vec![];
+    let mut eq_keys: Vec<(Column, Column)> = vec![];
+    let mut indistinct_keys: Vec<(Column, Column)> = vec![];
     let mut accum_filters: Vec<Expr> = vec![];
-    let mut nulls_equal_nulls = false;
 
     for expr in exprs {
-        #[expect(clippy::collapsible_match)]
         match expr {
-            Expr::BinaryExpr(binary_expr) => match binary_expr {
-                x @ (BinaryExpr {
-                    left,
-                    op: Operator::Eq,
-                    right,
+            Expr::BinaryExpr(BinaryExpr {
+                left,
+                op: op @ (Operator::Eq | Operator::IsNotDistinctFrom),
+                right,
+            }) => match (*left, *right) {
+                (Expr::Column(l), Expr::Column(r)) => match op {
+                    Operator::Eq => eq_keys.push((l, r)),
+                    Operator::IsNotDistinctFrom => indistinct_keys.push((l, 
r)),
+                    _ => unreachable!(),
+                },
+                (left, right) => {
+                    accum_filters.push(Expr::BinaryExpr(BinaryExpr {
+                        left: Box::new(left),
+                        op,
+                        right: Box::new(right),
+                    }));
                 }
-                | BinaryExpr {
-                    left,
-                    op: Operator::IsNotDistinctFrom,
-                    right,
-                }) => {
-                    nulls_equal_nulls = match x.op {
-                        Operator::Eq => false,
-                        Operator::IsNotDistinctFrom => true,
-                        _ => unreachable!(),
-                    };
-
-                    match (left.as_ref(), right.as_ref()) {
-                        (Expr::Column(l), Expr::Column(r)) => {
-                            accum_join_keys.push((l.clone(), r.clone()));
-                        }
-                        _ => accum_filters.push(expr.clone()),
-                    }
-                }
-                _ => accum_filters.push(expr.clone()),
             },
-            _ => accum_filters.push(expr.clone()),
+            _ => accum_filters.push(expr),
         }
     }
 
+    let (join_keys, null_equality) =
+        match (eq_keys.is_empty(), indistinct_keys.is_empty()) {
+            // Mixed: use eq_keys as equijoin keys, demote indistinct keys to 
filter

Review Comment:
   The optimiser currently already has this behaviour of favouring Eq 
predicates of Indistinct predicates. Added a SLT to confirm that behaviour - 
https://github.com/apache/datafusion/pull/21121/changes#diff-63fc43cf735eb03abd4d114cfbbf24982939425938a74b354fb7db6da7d499d7R305,
 and replicating that behaviour here.
   
   I also think that Eq keys will have a lower hash table bloat due to nulls 
being filtered while the selectivity is a function of data - having a hash join 
on 3 `indistinct` keys could produce more data than 1 `eq` key.
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to