NGA-TRAN commented on code in PR #17319:
URL: https://github.com/apache/datafusion/pull/17319#discussion_r2355867758


##########
datafusion/sqllogictest/test_files/join_is_not_distinct_from.slt:
##########
@@ -0,0 +1,280 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Test IS NOT DISTINCT FROM join functionality
+# This tests the optimizer's ability to convert IS NOT DISTINCT FROM joins
+# to equijoins with proper null equality handling
+
+statement ok
+CREATE TABLE t0 (
+    id INT,
+    val INT
+)
+
+statement ok
+CREATE TABLE t1 (
+    id INT,
+    val INT
+)
+
+statement ok
+CREATE TABLE t2 (
+    id INT,
+    val INT
+)
+
+statement ok
+INSERT INTO t0 VALUES
+(1, 10),
+(2, NULL),
+(5, 50)
+
+statement ok
+INSERT INTO t1 VALUES
+(1, 10),
+(2, NULL),
+(3, 30),
+(6, NULL)
+
+statement ok
+INSERT INTO t2 VALUES
+(1, 10),
+(2, NULL),
+(4, 40),
+(6, 6)
+
+# Test basic IS NOT DISTINCT FROM join functionality
+query IIII rowsort
+SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON t1.val IS NOT DISTINCT FROM t2.val
+----
+1 1 10 10
+2 2 NULL NULL
+6 2 NULL NULL
+
+# Test that IS NOT DISTINCT FROM join produces HashJoin when used alone
+query TT
+EXPLAIN SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON t1.val IS NOT DISTINCT FROM t2.val
+----
+logical_plan
+01)Projection: t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+02)--Inner Join: t1.val = t2.val
+03)----TableScan: t1 projection=[id, val]
+04)----TableScan: t2 projection=[id, val]
+physical_plan
+01)ProjectionExec: expr=[id@0 as t1_id, id@2 as t2_id, val@1 as val, val@3 as 
val]
+02)--CoalesceBatchesExec: target_batch_size=8192
+03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(val@1, val@1)]
+04)------DataSourceExec: partitions=1, partition_sizes=[1]
+05)------DataSourceExec: partitions=1, partition_sizes=[1]
+
+# For nested expression comparison, it should still be able to be converted to 
Hash Join
+query IIII rowsort
+SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON ((t1.val+1) IS NOT DISTINCT FROM (t2.val+1)) AND ((t1.val + 1) IS 
NOT DISTINCT FROM 11);
+----
+1 1 10 10
+
+# The plan should include HashJoin
+query TT
+EXPLAIN SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON ((t1.val+1) IS NOT DISTINCT FROM (t2.val+1)) AND ((t1.val + 1) IS 
NOT DISTINCT FROM 11);
+----
+logical_plan
+01)Projection: t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+02)--Inner Join: CAST(t1.val AS Int64) + Int64(1) = CAST(t2.val AS Int64) + 
Int64(1)
+03)----Filter: CAST(t1.val AS Int64) + Int64(1) IS NOT DISTINCT FROM Int64(11)
+04)------TableScan: t1 projection=[id, val]
+05)----TableScan: t2 projection=[id, val]
+physical_plan
+01)ProjectionExec: expr=[id@0 as t1_id, id@2 as t2_id, val@1 as val, val@3 as 
val]
+02)--CoalesceBatchesExec: target_batch_size=8192
+03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t1.val + 
Int64(1)@2, t2.val + Int64(1)@2)], projection=[id@0, val@1, id@3, val@4]
+04)------CoalescePartitionsExec
+05)--------ProjectionExec: expr=[id@0 as id, val@1 as val, CAST(val@1 AS 
Int64) + 1 as t1.val + Int64(1)]
+06)----------RepartitionExec: partitioning=RoundRobinBatch(4), 
input_partitions=1
+07)------------CoalesceBatchesExec: target_batch_size=8192
+08)--------------FilterExec: CAST(val@1 AS Int64) + 1 IS NOT DISTINCT FROM 11
+09)----------------DataSourceExec: partitions=1, partition_sizes=[1]
+10)------ProjectionExec: expr=[id@0 as id, val@1 as val, CAST(val@1 AS Int64) 
+ 1 as t2.val + Int64(1)]
+11)--------DataSourceExec: partitions=1, partition_sizes=[1]
+
+# Mixed join predicate with `IS DISTINCT FROM` and `IS NOT DISTINCT FROM`
+query IIII rowsort
+SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON ((t1.val+1) IS NOT DISTINCT FROM (t2.val+1)) AND ((t1.val % 3) IS 
DISTINCT FROM (t2.val % 3));
+----
+
+# The plan should include HashJoin
+query TT
+EXPLAIN SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON ((t1.val+1) IS NOT DISTINCT FROM (t2.val+1)) AND ((t1.val % 3) IS 
DISTINCT FROM (t2.val % 3));
+----
+logical_plan
+01)Projection: t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+02)--Inner Join: CAST(t1.val AS Int64) + Int64(1) = CAST(t2.val AS Int64) + 
Int64(1) Filter: CAST(t1.val AS Int64) % Int64(3) IS DISTINCT FROM CAST(t2.val 
AS Int64) % Int64(3)
+03)----TableScan: t1 projection=[id, val]
+04)----TableScan: t2 projection=[id, val]
+physical_plan
+01)ProjectionExec: expr=[id@0 as t1_id, id@2 as t2_id, val@1 as val, val@3 as 
val]
+02)--CoalesceBatchesExec: target_batch_size=8192
+03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t1.val + 
Int64(1)@2, t2.val + Int64(1)@2)], filter=CAST(val@0 AS Int64) % 3 IS DISTINCT 
FROM CAST(val@1 AS Int64) % 3, projection=[id@0, val@1, id@3, val@4]

Review Comment:
   Have you verified that the `HashJoin` here is expected? Does it apply the
`IS DISTINCT FROM` filter correctly?



##########
datafusion/sqllogictest/test_files/join_is_not_distinct_from.slt:
##########
@@ -0,0 +1,280 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Test IS NOT DISTINCT FROM join functionality
+# This tests the optimizer's ability to convert IS NOT DISTINCT FROM joins
+# to equijoins with proper null equality handling
+
+statement ok
+CREATE TABLE t0 (
+    id INT,
+    val INT
+)
+
+statement ok
+CREATE TABLE t1 (
+    id INT,
+    val INT
+)
+
+statement ok
+CREATE TABLE t2 (
+    id INT,
+    val INT
+)
+
+statement ok
+INSERT INTO t0 VALUES
+(1, 10),
+(2, NULL),
+(5, 50)
+
+statement ok
+INSERT INTO t1 VALUES
+(1, 10),
+(2, NULL),
+(3, 30),
+(6, NULL)
+
+statement ok
+INSERT INTO t2 VALUES
+(1, 10),
+(2, NULL),
+(4, 40),
+(6, 6)
+
+# Test basic IS NOT DISTINCT FROM join functionality
+query IIII rowsort
+SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON t1.val IS NOT DISTINCT FROM t2.val
+----
+1 1 10 10
+2 2 NULL NULL
+6 2 NULL NULL
+
+# Test that IS NOT DISTINCT FROM join produces HashJoin when used alone
+query TT
+EXPLAIN SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON t1.val IS NOT DISTINCT FROM t2.val
+----
+logical_plan
+01)Projection: t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+02)--Inner Join: t1.val = t2.val
+03)----TableScan: t1 projection=[id, val]
+04)----TableScan: t2 projection=[id, val]
+physical_plan
+01)ProjectionExec: expr=[id@0 as t1_id, id@2 as t2_id, val@1 as val, val@3 as 
val]
+02)--CoalesceBatchesExec: target_batch_size=8192
+03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(val@1, val@1)]
+04)------DataSourceExec: partitions=1, partition_sizes=[1]
+05)------DataSourceExec: partitions=1, partition_sizes=[1]
+
+# For nested expression comparison, it should still be able to be converted to 
Hash Join
+query IIII rowsort
+SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON ((t1.val+1) IS NOT DISTINCT FROM (t2.val+1)) AND ((t1.val + 1) IS 
NOT DISTINCT FROM 11);
+----
+1 1 10 10
+
+# The plan should include HashJoin
+query TT
+EXPLAIN SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON ((t1.val+1) IS NOT DISTINCT FROM (t2.val+1)) AND ((t1.val + 1) IS 
NOT DISTINCT FROM 11);
+----
+logical_plan
+01)Projection: t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+02)--Inner Join: CAST(t1.val AS Int64) + Int64(1) = CAST(t2.val AS Int64) + 
Int64(1)
+03)----Filter: CAST(t1.val AS Int64) + Int64(1) IS NOT DISTINCT FROM Int64(11)
+04)------TableScan: t1 projection=[id, val]
+05)----TableScan: t2 projection=[id, val]
+physical_plan
+01)ProjectionExec: expr=[id@0 as t1_id, id@2 as t2_id, val@1 as val, val@3 as 
val]
+02)--CoalesceBatchesExec: target_batch_size=8192
+03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t1.val + 
Int64(1)@2, t2.val + Int64(1)@2)], projection=[id@0, val@1, id@3, val@4]
+04)------CoalescePartitionsExec
+05)--------ProjectionExec: expr=[id@0 as id, val@1 as val, CAST(val@1 AS 
Int64) + 1 as t1.val + Int64(1)]
+06)----------RepartitionExec: partitioning=RoundRobinBatch(4), 
input_partitions=1
+07)------------CoalesceBatchesExec: target_batch_size=8192
+08)--------------FilterExec: CAST(val@1 AS Int64) + 1 IS NOT DISTINCT FROM 11
+09)----------------DataSourceExec: partitions=1, partition_sizes=[1]
+10)------ProjectionExec: expr=[id@0 as id, val@1 as val, CAST(val@1 AS Int64) 
+ 1 as t2.val + Int64(1)]
+11)--------DataSourceExec: partitions=1, partition_sizes=[1]
+
+# Mixed join predicate with `IS DISTINCT FROM` and `IS NOT DISTINCT FROM`
+query IIII rowsort
+SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON ((t1.val+1) IS NOT DISTINCT FROM (t2.val+1)) AND ((t1.val % 3) IS 
DISTINCT FROM (t2.val % 3));
+----
+
+# The plan should include HashJoin
+query TT
+EXPLAIN SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON ((t1.val+1) IS NOT DISTINCT FROM (t2.val+1)) AND ((t1.val % 3) IS 
DISTINCT FROM (t2.val % 3));
+----
+logical_plan
+01)Projection: t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+02)--Inner Join: CAST(t1.val AS Int64) + Int64(1) = CAST(t2.val AS Int64) + 
Int64(1) Filter: CAST(t1.val AS Int64) % Int64(3) IS DISTINCT FROM CAST(t2.val 
AS Int64) % Int64(3)

Review Comment:
   👍 to see that `IS NOT DISTINCT FROM` has been rewritten into the equality
`CAST(t1.val AS Int64) + Int64(1) = CAST(t2.val AS Int64) + Int64(1)`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to