This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit d8e18fac619a129270634513359b3386212ff919
Author: Riza Suminto <[email protected]>
AuthorDate: Thu Apr 24 15:50:42 2025 -0700

    IMPALA-13991: Skip CROSS_JOIN rewrite if subquery is in disjunctive
    
    Inside StmtRewriter.mergeExpr() there is an optimization that set
    JoinOperator.CROSS_JOIN under certain scenario. This patch adds criteria
    to SKIP that rewrite if subquery is coming from inside disjunctive
    expression, regardless of joinConjunct value. If joinConjunct is NOT
    NULL, the inlineView maybe correlated through that joinConjunct. If
    joinConjunct is NULL, then expr is a (NOT) EXISTS predicate. EXISTS
    within a disjunct is not supported yet (see IMPALA-9931).
    
    Testing:
    - Add planner and query test for the corner case.
      Before this patch, the query return wrong result.
    - Fixed wrong testcase in subquery-rewrite.test.
    
    Change-Id: Iac0deb0b2fb1536684cce2e004156a20b769b9ab
    Reviewed-on: http://gerrit.cloudera.org:8080/22815
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 .../org/apache/impala/analysis/StmtRewriter.java   |  9 ++++--
 .../queries/PlannerTest/nested-loop-join.test      | 36 ++++++++++++++++++++++
 .../queries/PlannerTest/subquery-rewrite.test      | 17 +++++-----
 .../queries/QueryTest/single-node-nlj.test         | 17 ++++++++++
 4 files changed, 70 insertions(+), 9 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/analysis/StmtRewriter.java 
b/fe/src/main/java/org/apache/impala/analysis/StmtRewriter.java
index 5b0868d71..2ec24786e 100644
--- a/fe/src/main/java/org/apache/impala/analysis/StmtRewriter.java
+++ b/fe/src/main/java/org/apache/impala/analysis/StmtRewriter.java
@@ -22,7 +22,6 @@ import java.util.Collections;
 import java.util.List;
 import java.util.Set;
 
-import org.apache.impala.analysis.AnalysisContext.AnalysisResult;
 import org.apache.impala.analysis.SetOperationStmt.SetOperand;
 import org.apache.impala.analysis.SetOperationStmt.SetOperator;
 import org.apache.impala.common.AnalysisException;
@@ -773,7 +772,13 @@ public class StmtRewriter {
         break;
       }
 
-      if (!hasEqJoinPred && !inlineView.isCorrelated()) {
+      if (!hasEqJoinPred && !inlineView.isCorrelated() && !isDisjunctive) {
+        // IMPALA-13991: It is not safe to rewrite into CROSS_JOIN if
+        // isDisjunctive is True, regardless of joinConjunct value.
+        // If joinConjunct is NOT NULL, the inlineView maybe correlated through
+        // that joinConjunct.
+        // If joinConjunct is NULL, then expr is a (NOT) EXISTS predicate.
+        // EXISTS within a disjunct is not supported yet (see IMPALA-9931).
         // TODO: IMPALA-9948: we could support non-equi joins here
         // TODO: Remove this when independent subquery evaluation is 
implemented.
         // TODO: IMPALA-5100 to cover all cases, we do let through runtime 
scalars with
diff --git 
a/testdata/workloads/functional-planner/queries/PlannerTest/nested-loop-join.test
 
b/testdata/workloads/functional-planner/queries/PlannerTest/nested-loop-join.test
index 9ceac1560..18bb5bbbc 100644
--- 
a/testdata/workloads/functional-planner/queries/PlannerTest/nested-loop-join.test
+++ 
b/testdata/workloads/functional-planner/queries/PlannerTest/nested-loop-join.test
@@ -328,3 +328,39 @@ PLAN-ROOT SINK
    HDFS partitions=24/24 files=24 size=478.45KB
    row-size=4B cardinality=7.30K
 ====
+# IMPALA-13991: Disable CROSS_JOIN rewrite if there is a scalar subquery inside
+# disjunctive expression. right_tc is '2009-04-01 01:01:00'.
+select id, timestamp_col from functional.alltypes
+where (id in (1000, 1001))
+  or ((id in (1, 2))
+      and timestamp_col < (select hours_add(timestamp_col, 1) right_tc
+                           from functional.alltypestiny
+                           order by timestamp_col DESC limit 1));
+---- DISTRIBUTEDPLAN
+PLAN-ROOT SINK
+|
+06:EXCHANGE [UNPARTITIONED]
+|
+03:NESTED LOOP JOIN [LEFT OUTER JOIN, BROADCAST]
+|  join predicates: timestamp_col < hours_add(timestamp_col, 1)
+|  predicates: (id IN (1000, 1001)) OR ((id IN (1, 2)) AND 
hours_add(timestamp_col, 1) IS NOT NULL)
+|  row-size=36B cardinality=7.30K
+|
+|--05:EXCHANGE [BROADCAST]
+|  |
+|  04:MERGING-EXCHANGE [UNPARTITIONED]
+|  |  order by: timestamp_col DESC
+|  |  limit: 1
+|  |
+|  02:TOP-N [LIMIT=1]
+|  |  order by: timestamp_col DESC
+|  |  row-size=16B cardinality=1
+|  |
+|  01:SCAN HDFS [functional.alltypestiny]
+|     HDFS partitions=4/4 files=4 size=460B
+|     row-size=16B cardinality=8
+|
+00:SCAN HDFS [functional.alltypes]
+   HDFS partitions=24/24 files=24 size=478.45KB
+   row-size=20B cardinality=7.30K
+====
diff --git 
a/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test
 
b/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test
index 8e83a774e..6946a5fee 100644
--- 
a/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test
+++ 
b/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test
@@ -4106,17 +4106,19 @@ PLAN-ROOT SINK
    HDFS partitions=24/24 files=24 size=478.45KB
    row-size=8B cardinality=7.30K
 ====
-# Uncorrelated subquery in BETWEEN in OR rewritten to INNER JOIN.
+# Uncorrelated subquery in BETWEEN in OR rewritten to LEFT OUTER JOIN.
+# Subquery will return 1.
 select int_col from functional.alltypes t
-where (t.int_col is null or
+where (t.int_col = 0 or
       (t.int_col between
           (select min(int_col) from functional.alltypesagg t2)
           and 2))
 ---- PLAN
 PLAN-ROOT SINK
 |
-03:NESTED LOOP JOIN [INNER JOIN]
-|  predicates: t.int_col >= min(int_col), (t.int_col IS NULL OR min(int_col) 
IS NOT NULL AND t.int_col <= 2)
+03:NESTED LOOP JOIN [LEFT OUTER JOIN]
+|  join predicates: t.int_col >= min(int_col)
+|  predicates: (t.int_col = 0 OR min(int_col) IS NOT NULL AND t.int_col <= 2)
 |  row-size=8B cardinality=7.30K
 |
 |--02:AGGREGATE [FINALIZE]
@@ -4135,8 +4137,9 @@ PLAN-ROOT SINK
 |
 07:EXCHANGE [UNPARTITIONED]
 |
-03:NESTED LOOP JOIN [INNER JOIN, BROADCAST]
-|  predicates: t.int_col >= min(int_col), (t.int_col IS NULL OR min(int_col) 
IS NOT NULL AND t.int_col <= 2)
+03:NESTED LOOP JOIN [LEFT OUTER JOIN, BROADCAST]
+|  join predicates: t.int_col >= min(int_col)
+|  predicates: (t.int_col = 0 OR min(int_col) IS NOT NULL AND t.int_col <= 2)
 |  row-size=8B cardinality=7.30K
 |
 |--06:EXCHANGE [BROADCAST]
@@ -4149,7 +4152,7 @@ PLAN-ROOT SINK
 |  |
 |  02:AGGREGATE
 |  |  output: min(int_col)
-|  |  row-size=4B cardinality=1
+|  |  row-size=4B cardinality=3
 |  |
 |  01:SCAN HDFS [functional.alltypesagg t2]
 |     HDFS partitions=11/11 files=11 size=814.73KB
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/single-node-nlj.test 
b/testdata/workloads/functional-query/queries/QueryTest/single-node-nlj.test
index 82370af53..143776538 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/single-node-nlj.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/single-node-nlj.test
@@ -222,4 +222,21 @@ limit 5
 7295
 ---- TYPES
 INT
+====
+---- QUERY
+# IMPALA-13991: Disable CROSS_JOIN rewrite if there is a scalar subquery inside
+# disjunctive expression. right_tc is '2009-04-01 01:01:00'.
+select id, timestamp_col from functional.alltypes
+where (id in (1000, 1001))
+  or ((id in (1, 2))
+      and timestamp_col < (select hours_add(timestamp_col, 1) right_tc
+                           from functional.alltypestiny
+                           order by timestamp_col DESC limit 1));
+---- RESULTS: VERIFY_IS_EQUAL_SORTED
+1,2009-01-01 00:01:00
+2,2009-01-01 00:02:00.100000000
+1000,2009-04-11 01:40:04.500000000
+1001,2009-04-11 01:41:04.500000000
+---- TYPES
+INT,TIMESTAMP
 ====
\ No newline at end of file

Reply via email to