This is an automated email from the ASF dual-hosted git repository. englefly pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 7490df85b60 [feat](nereids)disable join reorder if column stats is invalid (#41790) (branch-3.0) (#42917)
7490df85b60 is described below

commit 7490df85b60f6c3eb584f1864e41abf5ddedcc10
Author: minghong <engle...@gmail.com>
AuthorDate: Wed Oct 30 23:49:29 2024 +0800

    [feat](nereids)disable join reorder if column stats is invalid (#41790) (branch-3.0) (#42917)

    ## Proposed changes
    pick #41790

    Disable join reorder if any of the following conditions is met:
    1. any table row count is -1
    2. any column whose ndv is 0 but whose MinExpr or MaxExpr is not null
    3. any column whose ndv > 10 * rowCount

    Issue Number: close #xxx

    <!--Describe your changes.-->

    (cherry picked from commit e238a8705f25e2b5bc5a9e95b2025712f98dd682)

    ## Proposed changes

    Issue Number: close #xxx

    <!--Describe your changes.-->
---
 .../org/apache/doris/nereids/NereidsPlanner.java | 4 +-
 .../org/apache/doris/nereids/StatementContext.java | 10 ++
 .../doris/nereids/stats/StatsCalculator.java | 46 +++++++--
 .../infer_predicate/infer_intersect_except.out | 11 --
 .../suites/nereids_hint_tpcds_p0/load.groovy | 2 +-
 .../suites/nereids_p0/stats/invalid_stats.groovy | 111 +++++++++++++++++++++
 .../nereids_rules_p0/eager_aggregate/basic.groovy | 4 +-
 .../eager_aggregate/basic_one_side.groovy | 3 +-
 .../infer_predicate/infer_intersect_except.groovy | 26 ++++-
 .../nereids_tpcds_shape_sf1000_p0/load.groovy | 2 +-
 .../constraints/load.groovy | 2 +-
 .../nereids_tpcds_shape_sf100_p0/load.groovy | 6 +-
 .../tpcds_sf100/constraints/load.groovy | 2 +-
 .../suites/new_shapes_p0/tpcds_sf100/load.groovy | 6 +-
 .../suites/new_shapes_p0/tpcds_sf1000/load.groovy | 2 +-
 15 files changed, 196 insertions(+), 41 deletions(-)

diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java
index 4acae7164f8..7ea92ee73b3 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java @@ -274,7 +274,9 @@ public class NereidsPlanner extends Planner { && !cascadesContext.isLeadingDisableJoinReorder()) { List<LogicalOlapScan> scans = cascadesContext.getRewritePlan() .collectToList(LogicalOlapScan.class::isInstance); - StatsCalculator.disableJoinReorderIfTableRowCountNotAvailable(scans, cascadesContext); + Optional<String> disableJoinReorderReason = StatsCalculator + .disableJoinReorderIfStatsInvalid(scans, cascadesContext); + disableJoinReorderReason.ifPresent(statementContext::setDisableJoinReorderReason); } optimize(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/StatementContext.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/StatementContext.java index 08e1e3fa815..ed64864da50 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/StatementContext.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/StatementContext.java @@ -169,6 +169,8 @@ public class StatementContext implements Closeable { private List<PlannerHook> plannerHooks = new ArrayList<>(); + private String disableJoinReorderReason; + public StatementContext() { this(ConnectContext.get(), null, 0); } @@ -558,4 +560,12 @@ public class StatementContext implements Closeable { this.tableIdMapping.put(tableIdentifier, tableId); return tableId; } + + public Optional<String> getDisableJoinReorderReason() { + return Optional.ofNullable(disableJoinReorderReason); + } + + public void setDisableJoinReorderReason(String disableJoinReorderReason) { + this.disableJoinReorderReason = disableJoinReorderReason; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java index 3c9baac01be..ad7d91fa33b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java @@ 
-212,24 +212,40 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> { } /** - * disable join reorder if any table row count is not available. + * disable join reorder if + * 1. any table rowCount is not available, or + * 2. col stats ndv=0 but minExpr or maxExpr is not null + * 3. ndv > 10 * rowCount */ - public static void disableJoinReorderIfTableRowCountNotAvailable( - List<LogicalOlapScan> scans, CascadesContext context) { + public static Optional<String> disableJoinReorderIfStatsInvalid(List<LogicalOlapScan> scans, + CascadesContext context) { StatsCalculator calculator = new StatsCalculator(context); + if (ConnectContext.get() == null) { + // ut case + return Optional.empty(); + } for (LogicalOlapScan scan : scans) { double rowCount = calculator.getOlapTableRowCount(scan); - if (rowCount == -1 && ConnectContext.get() != null) { + // row count not available + if (rowCount == -1) { + LOG.info("disable join reorder since row count not available: " + + scan.getTable().getNameWithFullQualifiers()); + return Optional.of("table[" + scan.getTable().getName() + "] row count is invalid"); + } + // ndv abnormal + Optional<String> reason = calculator.checkNdvValidation(scan, rowCount); + if (reason.isPresent()) { try { ConnectContext.get().getSessionVariable().disableNereidsJoinReorderOnce(); - LOG.info("disable join reorder since row count not available: " - + scan.getTable().getNameWithFullQualifiers()); + LOG.info("disable join reorder since col stats invalid: " + + reason.get()); } catch (Exception e) { LOG.info("disableNereidsJoinReorderOnce failed"); } - return; + return reason; } } + return Optional.empty(); } /** @@ -403,6 +419,22 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> { return rowCount; } + // check validation of ndv. 
+ private Optional<String> checkNdvValidation(OlapScan olapScan, double rowCount) { + for (Slot slot : ((Plan) olapScan).getOutput()) { + if (isVisibleSlotReference(slot)) { + ColumnStatistic cache = getColumnStatsFromTableCache((CatalogRelation) olapScan, (SlotReference) slot); + if (!cache.isUnKnown) { + if ((cache.ndv == 0 && (cache.minExpr != null || cache.maxExpr != null)) + || cache.ndv > rowCount * 10) { + return Optional.of("slot " + slot.getName() + " has invalid column stats: " + cache); + } + } + } + } + return Optional.empty(); + } + private Statistics computeOlapScan(OlapScan olapScan) { OlapTable olapTable = olapScan.getTable(); double tableRowCount = getOlapTableRowCount(olapScan); diff --git a/regression-test/data/nereids_rules_p0/infer_predicate/infer_intersect_except.out b/regression-test/data/nereids_rules_p0/infer_predicate/infer_intersect_except.out index 2609ca5f4c9..d897514ee49 100644 --- a/regression-test/data/nereids_rules_p0/infer_predicate/infer_intersect_except.out +++ b/regression-test/data/nereids_rules_p0/infer_predicate/infer_intersect_except.out @@ -52,17 +52,6 @@ PhysicalResultSink ----filter((cast(a as BIGINT) < -1)) ------PhysicalOlapScan[infer_intersect_except2] --- !except_and_intersect -- -PhysicalResultSink ---PhysicalExcept -----filter((infer_intersect_except1.a > 0)) -------PhysicalOlapScan[infer_intersect_except1] -----PhysicalIntersect -------filter((infer_intersect_except2.b > 'ab')) ---------PhysicalOlapScan[infer_intersect_except2] -------filter((infer_intersect_except3.a = 1) and (infer_intersect_except3.b = 'abc')) ---------PhysicalOlapScan[infer_intersect_except3] - -- !except_and_intersect_except_predicate_to_right -- PhysicalResultSink --PhysicalExcept diff --git a/regression-test/suites/nereids_hint_tpcds_p0/load.groovy b/regression-test/suites/nereids_hint_tpcds_p0/load.groovy index 77b37a9c1b4..f4cf57113a6 100644 --- a/regression-test/suites/nereids_hint_tpcds_p0/load.groovy +++ 
b/regression-test/suites/nereids_hint_tpcds_p0/load.groovy @@ -2336,7 +2336,7 @@ suite("load") { """ sql """ - alter table call_center modify column cc_closed_date_sk set stats ('row_count'='42', 'ndv'='0', 'num_nulls'='42', 'min_value'='0', 'max_value'='179769313', 'data_size'='168') + alter table call_center modify column cc_closed_date_sk set stats ('row_count'='42', 'ndv'='0', 'num_nulls'='42', 'data_size'='168') """ sql """ diff --git a/regression-test/suites/nereids_p0/stats/invalid_stats.groovy b/regression-test/suites/nereids_p0/stats/invalid_stats.groovy new file mode 100644 index 00000000000..5304cd8c2c1 --- /dev/null +++ b/regression-test/suites/nereids_p0/stats/invalid_stats.groovy @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("invalid_stats") { + // multi_sql """ + // set global enable_auto_analyze=false; + // SET enable_nereids_planner=true; + // SET enable_fallback_to_original_planner=false; + // set disable_nereids_rules=PRUNE_EMPTY_PARTITION; + + + // drop table if exists region; + // CREATE TABLE region ( + // r_regionkey int NOT NULL, + // r_name VARCHAR(25) NOT NULL, + // r_comment VARCHAR(152) + // )ENGINE=OLAP + // DUPLICATE KEY(`r_regionkey`) + // COMMENT "OLAP" + // DISTRIBUTED BY HASH(`r_regionkey`) BUCKETS 1 + // PROPERTIES ( + // "replication_num" = "1" + // ); + + // drop table if exists nation; + // CREATE TABLE `nation` ( + // `n_nationkey` int(11) NOT NULL, + // `n_name` varchar(25) NOT NULL, + // `n_regionkey` int(11) NOT NULL, + // `n_comment` varchar(152) NULL + // ) ENGINE=OLAP + // DUPLICATE KEY(`N_NATIONKEY`) + // COMMENT "OLAP" + // DISTRIBUTED BY HASH(`N_NATIONKEY`) BUCKETS 1 + // PROPERTIES ( + // "replication_num" = "1" + // ); + // alter table nation modify column n_nationkey set stats ('ndv'='25', 'num_nulls'='0', 'min_value'='0', 'max_value'='24', 'row_count'='25'); + + // alter table nation modify column n_regionkey set stats ('ndv'='5', 'num_nulls'='0', 'min_value'='0', 'max_value'='4', 'row_count'='25'); + + // """ + + // explain { + // sql "select * from region" + // notContains("join reorder with unknown column statistics") + // } + + // explain { + // sql "select * from region where r_regionkey=1" + // contains("join reorder with unknown column statistics") + // } + + // explain { + // sql "select r_regionkey from region group by r_regionkey" + // contains("join reorder with unknown column statistics") + // } + + // explain { + // sql "select r_regionkey from region join nation on r_regionkey=n_regionkey" + // contains("join reorder with unknown column statistics") + // } + + // sql "alter table region modify column r_regionkey set stats ('ndv'='5', 'num_nulls'='0', 'min_value'='0', 'max_value'='4', 'row_count'='5');" + + // explain { + // 
sql "select * from region where r_regionkey=1" + // notContains("join reorder with unknown column statistics") + // } + + // explain { + // sql "select r_regionkey from region group by r_regionkey" + // notContains("join reorder with unknown column statistics") + // } + + // explain { + // sql "select r_regionkey from region join nation on r_regionkey=n_regionkey" + // notContains("join reorder with unknown column statistics") + // } + + // explain { + // sql "select r_name from region join nation on r_regionkey=n_regionkey" + // notContains("join reorder with unknown column statistics") + // } + + // explain { + // sql """ + // select r_name + // from (select r_name, r_regionkey + 1 x from region) T join nation on T.x=n_regionkey + // """ + // notContains("join reorder with unknown column statistics") + // } +} +// disable jo: alter table region modify column r_regionkey set stats ('ndv'='0', 'num_nulls'='0', 'min_value'='0', 'max_value'='4', 'row_count'='0'); +// disable jo: alter table region modify column r_regionkey set stats ('ndv'='11', 'num_nulls'='0', 'min_value'='0', 'max_value'='4', 'row_count'='1'); + +// alter table region modify column r_regionkey set stats ('ndv'='10', 'num_nulls'='0', 'min_value'='0', 'max_value'='4', 'row_count'='1'); diff --git a/regression-test/suites/nereids_rules_p0/eager_aggregate/basic.groovy b/regression-test/suites/nereids_rules_p0/eager_aggregate/basic.groovy index 117d0c01f24..e37e9a65da1 100644 --- a/regression-test/suites/nereids_rules_p0/eager_aggregate/basic.groovy +++ b/regression-test/suites/nereids_rules_p0/eager_aggregate/basic.groovy @@ -15,14 +15,14 @@ // specific language governing permissions and limitations // under the License. 
-suite("eager_aggregate_basic") { +suite("basic") { sql "SET enable_nereids_planner=true" sql "set runtime_filter_mode=OFF" sql "SET enable_fallback_to_original_planner=false" sql "SET ignore_shape_nodes='PhysicalDistribute,PhysicalProject'" sql "set disable_nereids_rules=PRUNE_EMPTY_PARTITION" - + sql "set disable_join_reorder=true;" sql """ DROP TABLE IF EXISTS shunt_log_com_dd_library; """ diff --git a/regression-test/suites/nereids_rules_p0/eager_aggregate/basic_one_side.groovy b/regression-test/suites/nereids_rules_p0/eager_aggregate/basic_one_side.groovy index f3e6f593aa9..78503039d22 100644 --- a/regression-test/suites/nereids_rules_p0/eager_aggregate/basic_one_side.groovy +++ b/regression-test/suites/nereids_rules_p0/eager_aggregate/basic_one_side.groovy @@ -15,13 +15,14 @@ // specific language governing permissions and limitations // under the License. -suite("eager_aggregate_basic_one_side") { +suite("basic_one_side") { sql "SET enable_nereids_planner=true" sql "set runtime_filter_mode=OFF" sql "SET enable_fallback_to_original_planner=false" sql "SET ignore_shape_nodes='PhysicalDistribute,PhysicalProject'" sql "set disable_nereids_rules=PRUNE_EMPTY_PARTITION" + sql "set disable_join_reorder=true" sql """ DROP TABLE IF EXISTS shunt_log_com_dd_library_one_side; diff --git a/regression-test/suites/nereids_rules_p0/infer_predicate/infer_intersect_except.groovy b/regression-test/suites/nereids_rules_p0/infer_predicate/infer_intersect_except.groovy index fb8ef0a75f9..aec3f3384e4 100644 --- a/regression-test/suites/nereids_rules_p0/infer_predicate/infer_intersect_except.groovy +++ b/regression-test/suites/nereids_rules_p0/infer_predicate/infer_intersect_except.groovy @@ -115,10 +115,28 @@ suite("infer_intersect_except") { select a+1,b from infer_intersect_except1 where a>0 intersect select a+1,b from infer_intersect_except2 where a+1<0; """ - qt_except_and_intersect """ - explain shape plan - select a,b from infer_intersect_except1 where a>0 except select 
1,'abc' from infer_intersect_except2 where b>'ab' intersect select a,b from infer_intersect_except3 where a<10; - """ + explain { + sql """ + shape plan + select a,b from infer_intersect_except1 where a > 0 + except + select 1, 'abc' from infer_intersect_except2 where b > 'ab' + intersect + select a, b from infer_intersect_except3 where a < 10; + """ + notContains("a < 10") + contains("(infer_intersect_except3.a = 1) and (infer_intersect_except3.b = 'abc')") +// PhysicalResultSink +// --PhysicalExcept +// ----filter((infer_intersect_except1.a > 0)) +// ------PhysicalOlapScan[infer_intersect_except1] +// ----PhysicalIntersect +// ------filter((infer_intersect_except3.a = 1) and (infer_intersect_except3.b = 'abc')) +// --------PhysicalOlapScan[infer_intersect_except3] +// ------filter((infer_intersect_except2.b > 'ab')) +// --------PhysicalOlapScan[infer_intersect_except2] + } + qt_except_and_intersect_except_predicate_to_right """ explain shape plan diff --git a/regression-test/suites/nereids_tpcds_shape_sf1000_p0/load.groovy b/regression-test/suites/nereids_tpcds_shape_sf1000_p0/load.groovy index b868aab6130..1a5c3980e84 100644 --- a/regression-test/suites/nereids_tpcds_shape_sf1000_p0/load.groovy +++ b/regression-test/suites/nereids_tpcds_shape_sf1000_p0/load.groovy @@ -2336,7 +2336,7 @@ suite("load") { """ sql """ - alter table call_center modify column cc_closed_date_sk set stats ('row_count'='42', 'ndv'='0', 'num_nulls'='42', 'min_value'='0', 'max_value'='179769313', 'data_size'='168') + alter table call_center modify column cc_closed_date_sk set stats ('row_count'='42', 'ndv'='0', 'num_nulls'='42', 'data_size'='168') """ sql """ diff --git a/regression-test/suites/nereids_tpcds_shape_sf100_p0/constraints/load.groovy b/regression-test/suites/nereids_tpcds_shape_sf100_p0/constraints/load.groovy index 3020a285e63..b2dca961e4d 100644 --- a/regression-test/suites/nereids_tpcds_shape_sf100_p0/constraints/load.groovy +++ 
b/regression-test/suites/nereids_tpcds_shape_sf100_p0/constraints/load.groovy @@ -2340,7 +2340,7 @@ suite("load") { """ sql """ - alter table call_center modify column cc_closed_date_sk set stats ('row_count'='42', 'ndv'='0', 'num_nulls'='42', 'min_value'='0', 'max_value'='179769313', 'data_size'='168') + alter table call_center modify column cc_closed_date_sk set stats ('row_count'='42', 'ndv'='0', 'num_nulls'='42', 'data_size'='168') """ sql """ diff --git a/regression-test/suites/nereids_tpcds_shape_sf100_p0/load.groovy b/regression-test/suites/nereids_tpcds_shape_sf100_p0/load.groovy index 120ebf97298..48ad6bda53d 100644 --- a/regression-test/suites/nereids_tpcds_shape_sf100_p0/load.groovy +++ b/regression-test/suites/nereids_tpcds_shape_sf100_p0/load.groovy @@ -1299,7 +1299,7 @@ alter table web_page modify column wp_max_ad_count set stats ('row_count'='2040' """ sql """ -alter table call_center modify column cc_closed_date_sk set stats ('row_count'='30', 'ndv'='0', 'min_value'='2415022', 'max_value'='2488070', 'avg_size'='120', 'max_size'='120' ) +alter table call_center modify column cc_closed_date_sk set stats ('row_count'='30', 'ndv'='0', 'num_nulls'='42', 'avg_size'='120', 'max_size'='120' ) """ sql """ @@ -2018,10 +2018,6 @@ sql """ alter table ship_mode modify column sm_contract set stats ('row_count'='20', 'ndv'='20', 'min_value'='2mM8l', 'max_value'='yVfotg7Tio3MVhBg6Bkn', 'avg_size'='252', 'max_size'='252' ) """ -sql """ -alter table call_center modify column cc_closed_date_sk set stats ('row_count'='30', 'ndv'='0', 'min_value'='0', 'max_value'='0', 'avg_size'='120', 'max_size'='120' ) -""" - sql """ alter table customer_address modify column ca_zip set stats ('row_count'='1000000', 'ndv'='7733', 'min_value'='', 'max_value'='99981', 'avg_size'='4848150', 'max_size'='4848150' ) """ diff --git a/regression-test/suites/new_shapes_p0/tpcds_sf100/constraints/load.groovy b/regression-test/suites/new_shapes_p0/tpcds_sf100/constraints/load.groovy index 
8b4fdeee69a..1ed3ebba10e 100644 --- a/regression-test/suites/new_shapes_p0/tpcds_sf100/constraints/load.groovy +++ b/regression-test/suites/new_shapes_p0/tpcds_sf100/constraints/load.groovy @@ -2343,7 +2343,7 @@ suite("load") { """ sql """ - alter table call_center modify column cc_closed_date_sk set stats ('row_count'='42', 'ndv'='0', 'num_nulls'='42', 'min_value'='0', 'max_value'='179769313', 'data_size'='168') + alter table call_center modify column cc_closed_date_sk set stats ('row_count'='42', 'ndv'='0', 'num_nulls'='42', 'data_size'='168') """ sql """ diff --git a/regression-test/suites/new_shapes_p0/tpcds_sf100/load.groovy b/regression-test/suites/new_shapes_p0/tpcds_sf100/load.groovy index 23ce70d931e..4b99eafdea9 100644 --- a/regression-test/suites/new_shapes_p0/tpcds_sf100/load.groovy +++ b/regression-test/suites/new_shapes_p0/tpcds_sf100/load.groovy @@ -1302,7 +1302,7 @@ alter table web_page modify column wp_max_ad_count set stats ('row_count'='2040' """ sql """ -alter table call_center modify column cc_closed_date_sk set stats ('row_count'='30', 'ndv'='0', 'min_value'='2415022', 'max_value'='2488070', 'avg_size'='120', 'max_size'='120' ) +alter table call_center modify column cc_closed_date_sk set stats ('row_count'='30', 'ndv'='0', 'num_nulls'='42', 'avg_size'='120', 'max_size'='120' ) """ sql """ @@ -2021,10 +2021,6 @@ sql """ alter table ship_mode modify column sm_contract set stats ('row_count'='20', 'ndv'='20', 'min_value'='2mM8l', 'max_value'='yVfotg7Tio3MVhBg6Bkn', 'avg_size'='252', 'max_size'='252' ) """ -sql """ -alter table call_center modify column cc_closed_date_sk set stats ('row_count'='30', 'ndv'='0', 'min_value'='0', 'max_value'='0', 'avg_size'='120', 'max_size'='120' ) -""" - sql """ alter table customer_address modify column ca_zip set stats ('row_count'='1000000', 'ndv'='7733', 'min_value'='', 'max_value'='99981', 'avg_size'='4848150', 'max_size'='4848150' ) """ diff --git 
a/regression-test/suites/new_shapes_p0/tpcds_sf1000/load.groovy b/regression-test/suites/new_shapes_p0/tpcds_sf1000/load.groovy index bc28fdde8c0..14c11d3ea84 100644 --- a/regression-test/suites/new_shapes_p0/tpcds_sf1000/load.groovy +++ b/regression-test/suites/new_shapes_p0/tpcds_sf1000/load.groovy @@ -2339,7 +2339,7 @@ suite("load") { """ sql """ - alter table call_center modify column cc_closed_date_sk set stats ('row_count'='42', 'ndv'='0', 'num_nulls'='42', 'min_value'='0', 'max_value'='179769313', 'data_size'='168') + alter table call_center modify column cc_closed_date_sk set stats ('row_count'='42', 'ndv'='0', 'num_nulls'='42', 'data_size'='168') """ sql """ --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org