This is an automated email from the ASF dual-hosted git repository. stigahuang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit c1aac4b3a4fb616763cedc59648cfde6e8f5ec70 Author: Daniel Becker <[email protected]> AuthorDate: Tue Apr 1 10:18:51 2025 +0200 IMPALA-13873: Missing equivalence conjunct in aggregation node with inline views Some queries involving plain (distinct) UNIONs miss conjuncts, leading to incorrect results: Example: WITH u1 AS (select 10 a, 10 b), t AS (select a, b, min(b) over (partition by a) min_b from u1 UNION select 10, 10, 20) select t.* from t where t.b = t.min_b; Expected result: +----+----+-------+ | a | b | min_b | +----+----+-------+ | 10 | 10 | 10 | +----+----+-------+ Actual result: +----+----+-------+ | a | b | min_b | +----+----+-------+ | 10 | 10 | 10 | | 10 | 20 | 10 | +----+----+-------+ This is caused by MultiAggregateInfo assuming that conjuncts bound by grouping slots that are produced by SlotRef grouping expressions are already evaluated below the AggregationNode. However, this is not true in all cases: with UNIONs, there may be conjuncts that are unassigned below the AggregationNode. This may happen if a conjunct cannot be pushed into all operands of a UNION, because the source tuples in the operands do not contain all of the slots referenced by the predicate. In the example above, it happens in the first operand: select a, b, min(b) over (partition by a) min_b from u1 The source tuple, 'u1', contains only two slots ('a' and 'b'), but does not contain a slot corresponding to 'min(b)' - therefore the predicate 't.b = t.min_b' is not bound by the tuple of 'u1'. In theory, the predicate could still be evaluated directly after materialising the tuple with 'min(b)', still inside the UNION operand, but Impala currently does not work that way. In these cases, the conjuncts need to be evaluated in the AggregationNode (possibly in addition to some of the UNION operands). This change fixes this problem by introducing a method in MultiAggregateInfo: 'setConjunctsToKeep()', where the caller can pass a list of conjuncts that will not be eliminated. This is called during the planning of the UNION if there are unassigned conjuncts remaining. Testing: - Added a PlannerTest and an EE test for the case where a conjunct was previously incorrectly removed from the AggregationNode. - Existing tests cover the case when conjuncts can be safely removed from an AggregationNode above a UnionNode because the conjuncts are pushed into all union operands, see for example https://github.com/apache/impala/blob/6f2d9a2/testdata/workloads/functional-planner/queries/PlannerTest/union.test#L3914 Change-Id: I67a59cd96d83181ce249fd6ca141906f549a09b3 Reviewed-on: http://gerrit.cloudera.org:8080/22746 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- .../apache/impala/analysis/MultiAggregateInfo.java | 24 ++++ .../apache/impala/planner/SingleNodePlanner.java | 9 +- .../queries/PlannerTest/union.test | 125 +++++++++++++++++++++ .../queries/QueryTest/aggregation.test | 45 ++++++++ 4 files changed, 202 insertions(+), 1 deletion(-) diff --git a/fe/src/main/java/org/apache/impala/analysis/MultiAggregateInfo.java b/fe/src/main/java/org/apache/impala/analysis/MultiAggregateInfo.java index 24c12fda9..9530b7aae 100644 --- a/fe/src/main/java/org/apache/impala/analysis/MultiAggregateInfo.java +++ b/fe/src/main/java/org/apache/impala/analysis/MultiAggregateInfo.java @@ -133,6 +133,11 @@ public class MultiAggregateInfo { // Result of substituting 'groupingExprs_' with the output smap of the AggregationNode. private List<Expr> substGroupingExprs_; + // These conjuncts are kept even if they would otherwise be removed as redundant. This + // can be used for example when a conjunct cannot be pushed into all operands of a UNION + // node below the AggregationNode, so it needs to be evaluated by the AggregationNode. + private List<Expr> conjunctsToKeep_; + // Results of analyze(): // Aggregation classes and their AggregateInfos. If there is a class with non-distinct @@ -220,6 +225,10 @@ public class MultiAggregateInfo { return new MultiAggregateInfo(distinctAggInfo); } + public void setConjunctsToKeep(List<Expr> conjuncts) { + conjunctsToKeep_ = conjuncts; + } + public void analyze(Analyzer analyzer) throws AnalysisException { if (isAnalyzed_) return; @@ -1024,7 +1033,22 @@ public class MultiAggregateInfo { if (markAssigned) analyzer.markConjunctsAssigned(unassignedConjuncts); result.addAll(bindingPredicates); result.addAll(unassignedConjuncts); + + // Select those conjuncts in 'result' that we need to keep. + List<Expr> origConjunctsToKeep = new ArrayList<>(); + if (conjunctsToKeep_ != null) { + for (Expr conj : result) { + if (conjunctsToKeep_.contains(conj)) { + origConjunctsToKeep.add(conj); + } + } + } + analyzer.createEquivConjuncts(tid, result, groupBySids); + + // Add back conjuncts that we need to keep but may have been removed. + if (origConjunctsToKeep != null) result.addAll(origConjunctsToKeep); + Expr.removeDuplicates(result); return result; } diff --git a/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java b/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java index 921cdd8cf..07333895e 100644 --- a/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java +++ b/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java @@ -2346,8 +2346,15 @@ public class SingleNodePlanner implements SingleNodePlannerIntf { if (unionStmt.hasUnionDistinctOps()) { result = createUnionPlan( analyzer, unionStmt, unionStmt.getUnionDistinctOperands(), null); + + MultiAggregateInfo unionDistinctAggInfo = unionStmt.getDistinctAggInfo(); + + // Make sure MultiAggregateInfo does not remove conjuncts that are unassigned at + // this point as redundant. + unionDistinctAggInfo.setConjunctsToKeep(analyzer.getUnassignedConjuncts(result)); + result = new AggregationNode( - ctx_.getNextNodeId(), result, unionStmt.getDistinctAggInfo(), AggPhase.FIRST); + ctx_.getNextNodeId(), result, unionDistinctAggInfo, AggPhase.FIRST); result.init(analyzer); } // create ALL tree diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/union.test b/testdata/workloads/functional-planner/queries/PlannerTest/union.test index e004dfc25..fc5ed39a0 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/union.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/union.test @@ -4634,3 +4634,128 @@ Per-Host Resources: mem-estimate=80.00MB mem-reservation=128.00KB thread-reserva tuple-ids=0 row-size=4B cardinality=11.00K in pipelines: 00(GETNEXT) ==== +# Regression test for IMPALA-13873: the conjunct 'b = min_b' in the aggregation node +# should not be discarded. +WITH u1 AS (select 10 a, 10 b), +t AS (select a, b, min(b) over (partition by a) min_b from u1 UNION select 10, 10, 20) +select t.* from t where t.b = t.min_b; +---- QUERYOPTIONS +explain_level=2 +---- PLAN +F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1 +| Per-Host Resources: mem-estimate=20.00MB mem-reservation=11.94MB thread-reservation=1 +PLAN-ROOT SINK +| output exprs: a, b, min_b +| mem-estimate=4.00MB mem-reservation=4.00MB spill-buffer=2.00MB thread-reservation=0 +| +04:AGGREGATE [FINALIZE] +| group by: a, b, min_b +| having: b = min_b +| mem-estimate=10.00MB mem-reservation=1.94MB spill-buffer=64.00KB thread-reservation=0 +| tuple-ids=2 row-size=3B cardinality=1 +| in pipelines: 04(GETNEXT), 02(OPEN) +| +00:UNION +| constant-operands=1 +| mem-estimate=0B mem-reservation=0B thread-reservation=0 +| tuple-ids=2 row-size=3B cardinality=2 +| in pipelines: 02(GETNEXT) +| +03:ANALYTIC +| functions: min(b) +| partition by: a +| mem-estimate=4.00MB mem-reservation=4.00MB spill-buffer=2.00MB thread-reservation=0 +| tuple-ids=5,4 row-size=3B cardinality=1 +| in pipelines: 02(GETNEXT) +| +02:SORT +| order by: a ASC NULLS LAST +| mem-estimate=6.00MB mem-reservation=6.00MB spill-buffer=2.00MB thread-reservation=0 +| tuple-ids=5 row-size=2B cardinality=1 +| in pipelines: 02(GETNEXT) +| +01:UNION + constant-operands=1 + mem-estimate=0B mem-reservation=0B thread-reservation=0 + tuple-ids=0 row-size=2B cardinality=1 + in pipelines: <none> +==== +# Regression test for IMPALA-13873: the conjunct 'b = min_b' in the aggregation node +# should not be discarded. +# Querying from tables, not inline expressions. +WITH t AS ( + select d2 a, d3 b, min(d3) over (partition by d2) min_b from functional.decimal_tbl + UNION + select smallint_col, int_col, bigint_col from functional.alltypestiny +) +select t.* from t where t.b = t.min_b; +---- QUERYOPTIONS +explain_level=1 +---- PLAN +PLAN-ROOT SINK +| +05:AGGREGATE [FINALIZE] +| group by: a, b, min_b +| having: b = min_b +| row-size=40B cardinality=1 +| +00:UNION +| row-size=40B cardinality=4 +| +|--04:SCAN HDFS [functional.alltypestiny] +| HDFS partitions=4/4 files=4 size=460B +| predicates: functional.alltypestiny.int_col = functional.alltypestiny.bigint_col +| row-size=14B cardinality=1 +| +03:ANALYTIC +| functions: min(d3) +| partition by: d2 +| row-size=40B cardinality=3 +| +02:SORT +| order by: d2 ASC NULLS LAST +| row-size=24B cardinality=3 +| +01:SCAN HDFS [functional.decimal_tbl] + HDFS partitions=1/1 files=1 size=195B + row-size=24B cardinality=3 +==== +# Regression test for IMPALA-13873: the conjunct 'b = min_b' in the aggregation node +# should not be discarded. +# The predicate cannot be pushed down to any UNION operand. +WITH u1 AS (select tinyint_col, id from functional.alltypestiny), +t AS ( + select tinyint_col, id, min(id) over (partition by tinyint_col) min_id from u1 + UNION + select tinyint_col, id, id+100 as min_id from u1) +select t.* from t where t.id = t.min_id; +---- QUERYOPTIONS +explain_level=1 +---- PLAN +PLAN-ROOT SINK +| +05:AGGREGATE [FINALIZE] +| group by: tinyint_col, id, min_id +| having: id = min_id +| row-size=13B cardinality=2 +| +00:UNION +| row-size=13B cardinality=16 +| +|--04:SCAN HDFS [functional.alltypestiny] +| HDFS partitions=4/4 files=4 size=460B +| row-size=5B cardinality=8 +| +03:ANALYTIC +| functions: min(id) +| partition by: tinyint_col +| row-size=9B cardinality=8 +| +02:SORT +| order by: tinyint_col ASC NULLS LAST +| row-size=5B cardinality=8 +| +01:SCAN HDFS [functional.alltypestiny] + HDFS partitions=4/4 files=4 size=460B + row-size=5B cardinality=8 +==== diff --git a/testdata/workloads/functional-query/queries/QueryTest/aggregation.test b/testdata/workloads/functional-query/queries/QueryTest/aggregation.test index cb20e8c6d..8839edba0 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/aggregation.test +++ b/testdata/workloads/functional-query/queries/QueryTest/aggregation.test @@ -2911,3 +2911,48 @@ select s_store_sk, regr_count(s_number_employees, s_floor_space) over (partition ---- TYPES int,bigint ==== +---- QUERY +# Regression test for IMPALA-13873: the conjunct 'b = min_b' in the aggregation node +# should not be discarded. +WITH u1 AS (select 10 a, 10 b), +t AS (select a, b, min(b) over (partition by a) min_b from u1 UNION select 10, 10, 20) +select t.* from t where t.b = t.min_b; +---- RESULTS +10,10,10 +---- TYPES +TINYINT,TINYINT,TINYINT +==== +---- QUERY +# Regression test for IMPALA-13873: the conjunct 'b = min_b' in the aggregation node +# should not be discarded. +# Querying from tables, not inline expressions. +WITH t AS ( + select d2 a, d3 b, min(d3) over (partition by d2) min_b from functional.decimal_tbl + UNION + select smallint_col, int_col, bigint_col from functional.alltypestiny +) +select t.* from t where t.b = t.min_b; +---- RESULTS +2222,1.2345678900,1.2345678900 +0,0.0000000000,0.0000000000 +333,123.4567890000,123.4567890000 +111,12.3456789000,12.3456789000 +---- TYPES +decimal,decimal,decimal +==== +---- QUERY +# Regression test for IMPALA-13873: the conjunct 'b = min_b' in the aggregation node +# should not be discarded. +# The predicate cannot be pushed down to any UNION operand. +WITH u1 AS (select tinyint_col, id from functional.alltypestiny), +t AS ( + select tinyint_col, id, min(id) over (partition by tinyint_col) min_id from u1 + UNION + select tinyint_col, id, id+100 as min_id from u1) +select t.* from t where t.id = t.min_id; +---- RESULTS +1,1,1 +0,0,0 +---- TYPES +tinyint,int,bigint +====
