This is an automated email from the ASF dual-hosted git repository.
stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new 07d44b7af IMPALA-13262: Do not always migrate inferred predicates into
inline view
07d44b7af is described below
commit 07d44b7affa0652f14d8044cd7ffa604f250b77b
Author: Fang-Yu Rao <[email protected]>
AuthorDate: Thu Aug 15 17:23:02 2024 -0700
IMPALA-13262: Do not always migrate inferred predicates into inline view
This patch removes a predicate inferred from a set of analytic
predicates if both sides of the inferred predicate reference the same
TupleId when migrating those analytic predicates into an inline view.
This is to prevent Impala from pushing the inferred conjunct to the
scan node before the analytic functions are applied, which could produce
an incorrect result.
Testing:
- Added additional query and planner test cases to verify Impala's
behavior after this patch.
- Verified the patch passed the core tests.
Change-Id: I6e2632b3b1a140ae0104ceba4e2f474ac1bbcda1
Reviewed-on: http://gerrit.cloudera.org:8080/21688
Reviewed-by: Michael Smith <[email protected]>
Reviewed-by: Riza Suminto <[email protected]>
Tested-by: Impala Public Jenkins <[email protected]>
---
.../apache/impala/planner/SingleNodePlanner.java | 40 ++++++--
.../queries/PlannerTest/inline-view.test | 69 +++++++++++++
.../queries/QueryTest/inline-view.test | 112 +++++++++++++++++++++
3 files changed, 211 insertions(+), 10 deletions(-)
diff --git a/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java
b/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java
index fc935b382..7eb0b5419 100644
--- a/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java
+++ b/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java
@@ -1520,8 +1520,8 @@ public class SingleNodePlanner {
List<Expr> viewPredicates =
Expr.substituteList(preds, inlineViewRef.getSmap(), analyzer, false);
- // perform any post-processing of the predicates before registering
- removeDisqualifyingInferredPreds(inlineViewRef.getAnalyzer(),
viewPredicates);
+ // Perform any post-processing of the predicates before registering.
+ removeDisqualifyingInferredPreds(inlineViewRef, viewPredicates);
// Unset the On-clause flag of the migrated conjuncts because the migrated
conjuncts
// apply to the post-join/agg/analytic result of the inline view.
@@ -1538,7 +1538,15 @@ public class SingleNodePlanner {
* (in the Analyzer), we do this check here anyways as a safety in case any
such
* predicate 'fell through' to this stage.
*/
- private void removeDisqualifyingInferredPreds(Analyzer analyzer, List<Expr>
preds) {
+ private void removeDisqualifyingInferredPreds(InlineViewRef inlineViewRef,
+ List<Expr> preds) {
+ Analyzer analyzer = inlineViewRef.getAnalyzer();
+ String WARN_MESSAGE_HEADER = "Removed inferred predicate %s from the list
of " +
+ "predicates considered for inline view because %s.";
+ // If !canMigrateConjuncts(inlineViewRef) evaluates to true, then that
means we are
+ // calling addConjunctsIntoInlineView() to migrate some analytic conjuncts
into
+ // 'inlineViewRef'. Refer to migrateConjunctsToInlineView() for more
details.
+ boolean isAnalytic = !canMigrateConjuncts(inlineViewRef);
ListIterator<Expr> iter = preds.listIterator();
while (iter.hasNext()) {
Expr e = iter.next();
@@ -1548,16 +1556,28 @@ public class SingleNodePlanner {
if (slots == null) continue;
TupleId leftParent = analyzer.getTupleId(slots.first);
TupleId rightParent = analyzer.getTupleId(slots.second);
- // check if either the left parent or right parent is an outer joined
tuple
- // Note: strictly, we may be ok to check only for the null producing
- // side but we are being conservative here to check both sides. With
- // additional testing we could potentially relax this.
if (analyzer.isOuterJoined(leftParent) ||
analyzer.isOuterJoined(rightParent)) {
+ // check if either the left parent or right parent is an outer
joined tuple
+ // Note: strictly, we may be ok to check only for the null producing
+ // side but we are being conservative here to check both sides. With
+ // additional testing we could potentially relax this.
iter.remove();
- LOG.warn("Removed inferred predicate " + p.toSql() + " from the list
of " +
- "predicates considered for inline view because either the
left " +
- "or right side is derived from an outer join output.");
+ LOG.warn(String.format(WARN_MESSAGE_HEADER, p.toSql(), "either the
left " +
+ "or right side is derived from an outer join output"));
+ } else if (isAnalytic && leftParent.equals(rightParent)) {
+ // We do not require that both 'leftParent' and 'rightParent'
resolve to the
+ // same base table because in the case where we migrate an inferred
binary
+ // predicate from an outer inline view into an inner inline view,
none of
+ // 'leftParent' and 'rightParent' could resolve to a base table,
although they
+ // correspond to the same TupleId (of the inner inline view). In
such a case,
+ // we still have to remove this inferred binary predicate to prevent
it from
+ // being pushed to the scan node of the base table.
+ iter.remove();
+ LOG.warn(String.format(WARN_MESSAGE_HEADER, p.toSql(), "both sides "
+
+ "of the predicate reference the same TupleId " + leftParent + "
which " +
+ "could result in pushing the conjunct to the scan node before
the " +
+ "analytic function is applied"));
}
}
}
diff --git
a/testdata/workloads/functional-planner/queries/PlannerTest/inline-view.test
b/testdata/workloads/functional-planner/queries/PlannerTest/inline-view.test
index 7cfbeb03e..bd4c1566d 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/inline-view.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/inline-view.test
@@ -2664,3 +2664,72 @@ PLAN-ROOT SINK
runtime filters: RF000 -> other_table.id
row-size=4B cardinality=7.30K
====
+# IMPALA-13262: Do not migrate an inferred predicate into an inline view if
both sides of
+# the inferred predicate reference the same TupleId.
+SELECT * FROM
+(
+SELECT year, id, int_col, ROW_NUMBER() over(PARTITION BY year ORDER BY id
DESC) rn
+FROM functional.alltypes
+) iv
+WHERE rn = 3650 AND iv.id = iv.int_col;
+---- PLAN
+PLAN-ROOT SINK
+|
+03:SELECT
+| predicates: id = int_col, row_number() = 3650
+| row-size=20B cardinality=730
+|
+02:ANALYTIC
+| functions: row_number()
+| partition by: `year`
+| order by: id DESC
+| window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+| row-size=20B cardinality=7.30K
+|
+01:SORT
+| order by: year ASC NULLS LAST, id DESC
+| row-size=12B cardinality=7.30K
+|
+00:SCAN HDFS [functional.alltypes]
+ HDFS partitions=24/24 files=24 size=478.45KB
+ row-size=12B cardinality=7.30K
+====
+# IMPALA-13262: This query shows that the inferred predicate 'iv_1.id =
iv_1.int_col'
+# won't be migrated into the inline view 'iv_2' so that 'id = int_col' won't
be pushed
+# to the scan node of the table 'functional.alltypes'.
+SELECT * FROM
+(
+SELECT year, id, int_col, ROW_NUMBER() over(PARTITION BY year ORDER BY id
DESC) rn
+FROM (SELECT * FROM functional.alltypes UNION ALL SELECT * FROM
functional.alltypes) iv_2
+) iv_1
+WHERE rn > 7200 and iv_1.id = iv_1.int_col;
+---- PLAN
+PLAN-ROOT SINK
+|
+05:SELECT
+| predicates: id = int_col, row_number() > 7200
+| row-size=20B cardinality=1.46K
+|
+04:ANALYTIC
+| functions: row_number()
+| partition by: `year`
+| order by: id DESC
+| window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+| row-size=20B cardinality=14.60K
+|
+03:SORT
+| order by: year ASC NULLS LAST, id DESC
+| row-size=12B cardinality=14.60K
+|
+00:UNION
+| pass-through-operands: all
+| row-size=12B cardinality=14.60K
+|
+|--02:SCAN HDFS [functional.alltypes]
+| HDFS partitions=24/24 files=24 size=478.45KB
+| row-size=12B cardinality=7.30K
+|
+01:SCAN HDFS [functional.alltypes]
+ HDFS partitions=24/24 files=24 size=478.45KB
+ row-size=12B cardinality=7.30K
+====
diff --git
a/testdata/workloads/functional-query/queries/QueryTest/inline-view.test
b/testdata/workloads/functional-query/queries/QueryTest/inline-view.test
index 5c0cc185d..1ea226740 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/inline-view.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/inline-view.test
@@ -593,3 +593,115 @@ select distinct a,b,c from v;
---- TYPES
TINYINT, SMALLINT, SMALLINT
====
+---- QUERY
+# IMPALA-13262: Do not migrate an inferred predicate into an inline view if
both sides of
+# the inferred predicate reference the same TupleId.
+# Note that the following are the rows in the inline view 'iv' that satisfy
+# 'iv.id = iv.int_col'.
+# If the predicate 'id = int_col' was pushed down to the scan node of the table
+# 'functional.alltypes', Impala would have produced 0 row, which is the wrong
result.
+# +------+------+---------+------+
+# | year | id | int_col | rn |
+# +------+------+---------+------+
+# | 2009 | 9 | 9 | 3641 |
+# | 2009 | 8 | 8 | 3642 |
+# | 2009 | 7 | 7 | 3643 |
+# | 2009 | 6 | 6 | 3644 |
+# | 2009 | 5 | 5 | 3645 |
+# | 2009 | 4 | 4 | 3646 |
+# | 2009 | 3 | 3 | 3647 |
+# | 2009 | 2 | 2 | 3648 |
+# | 2009 | 1 | 1 | 3649 |
+# | 2009 | 0 | 0 | 3650 |
+SELECT * FROM
+(
+SELECT year, id, int_col, ROW_NUMBER() over(PARTITION BY year ORDER BY id
DESC) rn
+FROM functional.alltypes
+) iv
+WHERE rn = 3650 AND iv.id = iv.int_col;
+---- RESULTS
+2009,0,0,3650
+---- TYPES
+INT, INT, INT, BIGINT
+====
+---- QUERY
+# IMPALA-13262: This is a variant of the query above to show the rows
satisfying
+# 'rn = 3650' but not necessarily 'iv.id = iv.int_col'.
+SELECT * FROM
+(
+SELECT year, id, int_col, ROW_NUMBER() over(PARTITION BY year ORDER BY id
DESC) rn
+FROM functional.alltypes
+) iv
+WHERE rn = 3650;
+---- RESULTS: VERIFY_IS_EQUAL_SORTED
+2009,0,0,3650
+2010,3650,0,3650
+---- TYPES
+INT, INT, INT, BIGINT
+====
+---- QUERY
+# IMPALA-13262: This is a variant of the query above to verify we won't get
any row if
+# we apply 'id = int_col' to the base table 'functional.alltypes' before the
application
+# of the analytic predicate 'rn = 3650'.
+SELECT * FROM
+(
+SELECT year, id, int_col, ROW_NUMBER() over(PARTITION BY year ORDER BY id
DESC) rn
+FROM functional.alltypes WHERE id = int_col
+) iv
+WHERE rn = 3650;
+---- RESULTS
+---- TYPES
+INT, INT, INT, BIGINT
+====
+---- QUERY
+# IMPALA-13262: This is the query that shows the results of the inline view
'iv' above.
+SELECT year, id, int_col, ROW_NUMBER() over(PARTITION BY year ORDER BY id
DESC) rn
+FROM functional.alltypes WHERE id = int_col;
+---- RESULTS
+2009,9,9,1
+2009,8,8,2
+2009,7,7,3
+2009,6,6,4
+2009,5,5,5
+2009,4,4,6
+2009,3,3,7
+2009,2,2,8
+2009,1,1,9
+2009,0,0,10
+---- TYPES
+INT, INT, INT, BIGINT
+====
+---- QUERY
+# IMPALA-13262: This query shows that the inferred predicate 'iv_1.id =
iv_1.int_col'
+# won't be migrated into the inline view 'iv_2' so that 'id = int_col' won't
be pushed
+# to the scan node of the table 'functional.alltypes'.
+SELECT * FROM
+(
+SELECT year, id, int_col, ROW_NUMBER() over(PARTITION BY year ORDER BY id
DESC) rn
+FROM (SELECT * FROM functional.alltypes UNION ALL SELECT * FROM
functional.alltypes) iv_2
+) iv_1
+WHERE rn > 7200 and iv_1.id = iv_1.int_col;
+---- RESULTS
+2009,9,9,7281
+2009,9,9,7282
+2009,8,8,7283
+2009,8,8,7284
+2009,7,7,7285
+2009,7,7,7286
+2009,6,6,7287
+2009,6,6,7288
+2009,5,5,7289
+2009,5,5,7290
+2009,4,4,7291
+2009,4,4,7292
+2009,3,3,7293
+2009,3,3,7294
+2009,2,2,7295
+2009,2,2,7296
+2009,1,1,7297
+2009,1,1,7298
+2009,0,0,7299
+2009,0,0,7300
+---- TYPES
+INT, INT, INT, BIGINT
+====