This is an automated email from the ASF dual-hosted git repository. stigahuang pushed a commit to branch branch-4.4.1 in repository https://gitbox.apache.org/repos/asf/impala.git
commit 221d4f1e28233c0fbbfb66731171bbf2b8ccb32b Author: Riza Suminto <[email protected]> AuthorDate: Wed Jun 12 22:38:53 2024 -0700 IMPALA-13077: Fix selectivity estimation for SEMI JOIN JoinNode.getSemiJoinCardinality() will skip an equality expression if either the NDV or the cardinality of the equality expression is unknown (-1). This patch fixes the unknown NDV issue by making JoinNode.getNdv() a wrapper around ColumnStats.fromExpr(). Testing: - Add a test case where a LEFT SEMI JOIN from a subquery can reduce the cardinality estimate of the leftmost ScanNode in the query plan. - Add a new pattern to CARDINALITY_FILTER to ignore the reduction by runtime filters. - Pass core tests. Change-Id: I9c799df535d764c3f87ededef1c48eaa103293a0 Reviewed-on: http://gerrit.cloudera.org:8080/21516 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- .../java/org/apache/impala/planner/JoinNode.java | 10 ++-- .../org/apache/impala/planner/PlannerTest.java | 2 + .../java/org/apache/impala/testutil/TestUtils.java | 9 ++-- .../queries/PlannerTest/implicit-joins.test | 58 ++++++++++++++++++++++ .../PlannerTest/tpcds_cpu_cost/tpcds-q59.test | 6 +-- 5 files changed, 71 insertions(+), 14 deletions(-) diff --git a/fe/src/main/java/org/apache/impala/planner/JoinNode.java b/fe/src/main/java/org/apache/impala/planner/JoinNode.java index ab9f05227..10f24a5a7 100644 --- a/fe/src/main/java/org/apache/impala/planner/JoinNode.java +++ b/fe/src/main/java/org/apache/impala/planner/JoinNode.java @@ -755,15 +755,11 @@ public abstract class JoinNode extends PlanNode { } /** - * Unwraps the SlotRef in expr and returns the NDVs of it. - * Returns -1 if the NDVs are unknown or if expr is not a SlotRef. + * Returns the NDVs of a given expression. + * Returns -1 if the NDVs are unknown. 
*/ public static long getNdv(Expr expr) { - SlotRef slotRef = expr.unwrapSlotRef(false); - if (slotRef == null) return -1; - SlotDescriptor slotDesc = slotRef.getDesc(); - if (slotDesc == null) return -1; - ColumnStats stats = slotDesc.getStats(); + ColumnStats stats = ColumnStats.fromExpr(expr); if (!stats.hasNumDistinctValues()) return -1; return stats.getNumDistinctValues(); } diff --git a/fe/src/test/java/org/apache/impala/planner/PlannerTest.java b/fe/src/test/java/org/apache/impala/planner/PlannerTest.java index 16dbe3029..30105dd4c 100644 --- a/fe/src/test/java/org/apache/impala/planner/PlannerTest.java +++ b/fe/src/test/java/org/apache/impala/planner/PlannerTest.java @@ -1187,6 +1187,8 @@ public class PlannerTest extends PlannerTestBase { filter.transform(" foo=bar cardinality=10.3K")); assertEquals(" foo=bar cardinality=", filter.transform(" foo=bar cardinality=unavailable")); + assertEquals(" foo=bar cardinality=", + filter.transform(" foo=bar cardinality=1.58K(filtered from 2.88M)")); filter = TestUtils.ROW_SIZE_FILTER; assertEquals(" row-size= cardinality=10.3K", filter.transform(" row-size=10B cardinality=10.3K")); diff --git a/fe/src/test/java/org/apache/impala/testutil/TestUtils.java b/fe/src/test/java/org/apache/impala/testutil/TestUtils.java index 761a2ffc5..eba1e68d3 100644 --- a/fe/src/test/java/org/apache/impala/testutil/TestUtils.java +++ b/fe/src/test/java/org/apache/impala/testutil/TestUtils.java @@ -206,10 +206,11 @@ public class TestUtils { public static final IgnoreValueFilter ROW_SIZE_FILTER = new IgnoreValueFilter("row-size", "\\S+"); - // Ignore cardinality=27.30K or cardinality=unavailable - // entries - public static final IgnoreValueFilter CARDINALITY_FILTER = - new IgnoreValueFilter("cardinality", "\\S+"); + // Ignore 'cardinality=27.30K' or 'cardinality=unavailable' or + // 'cardinality=1.58K(filtered from 2.88M)' entries. + // See PlanNode.getExplainString(). 
+ public static final IgnoreValueFilter CARDINALITY_FILTER = new IgnoreValueFilter( + "cardinality", "(unavailable|[0-9\\.KMGT]+(\\(filtered from [0-9\\.KMGT]+\\))?)"); public static final IgnoreValueFilter ICEBERG_SNAPSHOT_ID_FILTER = new IgnoreValueFilter("Iceberg snapshot id", " \\d+", ':'); diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/implicit-joins.test b/testdata/workloads/functional-planner/queries/PlannerTest/implicit-joins.test index d210576d7..982852408 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/implicit-joins.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/implicit-joins.test @@ -481,3 +481,61 @@ PLAN-ROOT SINK runtime filters: RF000 -> t1.id row-size=89B cardinality=7.30K ==== +# IMPALA-13077: leftmost scan cardinality should be reduced to avg rows per partition. +select count(*) from tpcds.store_sales +where ss_sold_date_sk=( + select min(d_date_sk) + 1000 from tpcds.date_dim) +---- PLAN +PLAN-ROOT SINK +| +04:AGGREGATE [FINALIZE] +| output: count(*) +| row-size=8B cardinality=1 +| +03:HASH JOIN [LEFT SEMI JOIN] +| hash predicates: ss_sold_date_sk = min(d_date_sk) + 1000 +| runtime filters: RF000 <- min(d_date_sk) + 1000 +| row-size=4B cardinality=1.58K +| +|--02:AGGREGATE [FINALIZE] +| | output: min(d_date_sk) +| | row-size=4B cardinality=1 +| | +| 01:SCAN HDFS [tpcds.date_dim] +| HDFS partitions=1/1 files=1 size=9.84MB +| row-size=4B cardinality=73.05K +| +00:SCAN HDFS [tpcds.store_sales] + HDFS partitions=1824/1824 files=1824 size=346.60MB + runtime filters: RF000 -> ss_sold_date_sk + row-size=4B cardinality=1.58K(filtered from 2.88M) +==== +# IMPALA-13077: leftmost scan cardinality should be reduced following cardinality +# reduction from the build side, even when the join predicate is not +# SlotRef vs SlotRef equality. 
+select count(*) from tpcds.store_sales ss +where exists ( + select 1 from tpcds.date_dim + where d_year = 2100 and ss.ss_sold_date_sk = d_date_sk + 1000) +---- PLAN +PLAN-ROOT SINK +| +03:AGGREGATE [FINALIZE] +| output: count(*) +| row-size=8B cardinality=1 +| +02:HASH JOIN [LEFT SEMI JOIN] +| hash predicates: ss.ss_sold_date_sk = d_date_sk + 1000 +| runtime filters: RF000 <- d_date_sk + 1000 +| row-size=4B cardinality=589.03K +| +|--01:SCAN HDFS [tpcds.date_dim] +| HDFS partitions=1/1 files=1 size=9.84MB +| predicates: d_year = 2100 +| row-size=8B cardinality=373 +| +00:SCAN HDFS [tpcds.store_sales ss] + HDFS partitions=1824/1824 files=1824 size=346.60MB + runtime filters: RF000 -> ss.ss_sold_date_sk + row-size=4B cardinality=589.03K(filtered from 2.88M) +==== diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/tpcds_cpu_cost/tpcds-q59.test b/testdata/workloads/functional-planner/queries/PlannerTest/tpcds_cpu_cost/tpcds-q59.test index 6a681075c..eb9879fff 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/tpcds_cpu_cost/tpcds-q59.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/tpcds_cpu_cost/tpcds-q59.test @@ -45,9 +45,9 @@ limit 100; # end query 59 in stream 0 using template query59.tpl ---- PLAN Max Per-Host Resource Reservation: Memory=131.02MB Threads=1 -Per-Host Resource Estimates: Memory=2.30GB +Per-Host Resource Estimates: Memory=2.23GB F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1 -| Per-Instance Resources: mem-estimate=2.30GB mem-reservation=131.02MB thread-reservation=1 runtime-filters-memory=8.00MB +| Per-Instance Resources: mem-estimate=2.23GB mem-reservation=131.02MB thread-reservation=1 runtime-filters-memory=8.00MB | max-parallelism=1 segment-costs=[20807146710, 20807146710, 667717443159, 1000] PLAN-ROOT SINK | output exprs: s_store_name1, s_store_id1, d_week_seq1, sun_sales1 / sun_sales2, mon_sales1 / mon_sales2, tue_sales1 / tue_sales2, wed_sales1 / wed_sales2, thu_sales1 / 
thu_sales2, fri_sales1 / fri_sales2, sat_sales1 / sat_sales2 @@ -63,7 +63,7 @@ PLAN-ROOT SINK | hash predicates: d_week_seq = d_week_seq - 52, s_store_id = s_store_id | fk/pk conjuncts: none | runtime filters: RF000[bloom] <- d_week_seq - 52, RF001[bloom] <- s_store_id, RF002[min_max] <- d_week_seq - 52, RF003[min_max] <- s_store_id -| mem-estimate=896.65MB mem-reservation=34.00MB spill-buffer=2.00MB thread-reservation=0 +| mem-estimate=823.87MB mem-reservation=34.00MB spill-buffer=2.00MB thread-reservation=0 | tuple-ids=2,5,4,9,12,11 row-size=336B cardinality=33.56G cost=6087277948 | in pipelines: 03(GETNEXT), 11(OPEN) |
