This is an automated email from the ASF dual-hosted git repository. lijibing pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push: new 177e1af868f Return UNKNOWN column stats if ndv is 0. (#31439) (#31588) 177e1af868f is described below commit 177e1af868f118aeef1977b0b41252797125a3e5 Author: Jibing-Li <64681310+jibing...@users.noreply.github.com> AuthorDate: Thu Feb 29 16:56:26 2024 +0800 Return UNKNOWN column stats if ndv is 0. (#31439) (#31588) --- .../org/apache/doris/statistics/ColStatsData.java | 5 ++++ .../doris/statistics/ColumnStatisticBuilder.java | 4 +-- .../statistics/ColumnStatisticsCacheLoader.java | 30 ++++++++++++++-------- .../apache/doris/statistics/StatisticsCache.java | 5 +++- .../doris/nereids/stats/FilterEstimationTest.java | 2 ++ .../suites/statistics/analyze_stats.groovy | 17 ++++++++++++ 6 files changed, 50 insertions(+), 13 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java index c90b3dd8e1d..6bbafdbe5b5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java @@ -131,6 +131,11 @@ public class ColStatsData { } public ColumnStatistic toColumnStatistic() { + // For non-empty table, return UNKNOWN if we can't collect ndv value. + // Because inaccurate ndv is very misleading. + if (count > 0 && ndv == 0 && count != nullCount) { + return ColumnStatistic.UNKNOWN; + } try { ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(); columnStatisticBuilder.setCount(count); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java index f8ed6a1b6ab..a512fbadbda 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java @@ -25,8 +25,8 @@ public class ColumnStatisticBuilder { private double avgSizeByte; private double numNulls; private double dataSize; - private double minValue; - private double maxValue; + private double minValue = Double.NEGATIVE_INFINITY; + private double maxValue = Double.POSITIVE_INFINITY; private LiteralExpr minExpr; private LiteralExpr maxExpr; diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticsCacheLoader.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticsCacheLoader.java index 0b66fa5e7b1..bc5fc4c10c0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticsCacheLoader.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticsCacheLoader.java @@ -37,22 +37,32 @@ public class ColumnStatisticsCacheLoader extends StatisticsCacheLoader<Optional< try { // Load from statistics table. columnStatistic = loadFromStatsTable(key); - if (columnStatistic.isPresent()) { - return columnStatistic; - } - // Load from data source metadata - try { - TableIf table = StatisticsUtil.findTable(key.catalogId, key.dbId, key.tableId); - columnStatistic = table.getColumnStatistic(key.colName); - } catch (Exception e) { - LOG.debug(String.format("Exception to get column statistics by metadata. [Catalog:{}, DB:{}, Table:{}]", - key.catalogId, key.dbId, key.tableId), e); + if (!columnStatistic.isPresent()) { + // Load from data source metadata + try { + TableIf table = StatisticsUtil.findTable(key.catalogId, key.dbId, key.tableId); + columnStatistic = table.getColumnStatistic(key.colName); + } catch (Exception e) { + if (LOG.isDebugEnabled()) { + LOG.debug(String.format("Exception to get column statistics by metadata." + + "[Catalog:{}, DB:{}, Table:{}]", + key.catalogId, key.dbId, key.tableId), e); + } + } } } catch (Throwable t) { LOG.warn("Failed to load stats for column [Catalog:{}, DB:{}, Table:{}, Column:{}], Reason: {}", key.catalogId, key.dbId, key.tableId, key.colName, t.getMessage()); LOG.debug(t); } + if (columnStatistic.isPresent()) { + // For non-empty table, return UNKNOWN if we can't collect ndv value. + // Because inaccurate ndv is very misleading. + ColumnStatistic stats = columnStatistic.get(); + if (stats.count > 0 && stats.ndv == 0 && stats.count != stats.numNulls) { + columnStatistic = Optional.of(ColumnStatistic.UNKNOWN); + } + } return columnStatistic; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java index 62e11f5c9d8..1826f10a38a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java @@ -173,7 +173,10 @@ public class StatisticsCache { String colId = statsId.colId; final StatisticsCacheKey k = new StatisticsCacheKey(tblId, idxId, colId); - final ColumnStatistic c = ColumnStatistic.fromResultRow(r); + ColumnStatistic c = ColumnStatistic.fromResultRow(r); + if (c.count > 0 && c.ndv == 0 && c.count != c.numNulls) { + c = ColumnStatistic.UNKNOWN; + } putCache(k, c); } catch (Throwable t) { LOG.warn("Error when preheating stats cache", t); diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java index 66e64145901..9b37f1119a4 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java @@ -134,6 +134,8 @@ class FilterEstimationTest { Map<Expression, ColumnStatistic> slotToColumnStat = new HashMap<>(); ColumnStatisticBuilder builder = new ColumnStatisticBuilder() .setNdv(500) + .setMaxValue(0) + .setMinValue(0) .setIsUnknown(false); slotToColumnStat.put(a, builder.build()); Statistics stat = new Statistics(1000, slotToColumnStat); diff --git a/regression-test/suites/statistics/analyze_stats.groovy b/regression-test/suites/statistics/analyze_stats.groovy index 35ba207b882..1fa86e52b02 100644 --- a/regression-test/suites/statistics/analyze_stats.groovy +++ b/regression-test/suites/statistics/analyze_stats.groovy @@ -2670,6 +2670,23 @@ PARTITION `p599` VALUES IN (599) sql """drop stats alter_test""" alter_result = sql """show table stats alter_test""" assertEquals("false", alter_result[0][7]) + sql """alter table alter_test modify column id set stats ('row_count'='100', 'ndv'='0', 'num_nulls'='0.0', 'data_size'='2.69975443E8', 'min_value'='1', 'max_value'='2');""" + alter_result = sql """show column stats alter_test(id)""" + assertEquals(1, alter_result.size()) + alter_result = sql """show column cached stats alter_test(id)""" + assertEquals(0, alter_result.size()) + alter_result = sql """show column cached stats alter_test(id)""" + assertEquals(0, alter_result.size()) + sql """alter table alter_test modify column id set stats ('row_count'='100', 'ndv'='0', 'num_nulls'='100', 'data_size'='2.69975443E8', 'min_value'='1', 'max_value'='2');""" + alter_result = sql """show column stats alter_test(id)""" + assertEquals(1, alter_result.size()) + alter_result = sql """show column cached stats alter_test(id)""" + assertEquals(1, alter_result.size()) + sql """alter table alter_test modify column id set stats ('row_count'='100', 'ndv'='1', 'num_nulls'='0', 'data_size'='2.69975443E8', 'min_value'='1', 'max_value'='2');""" + alter_result = sql """show column stats alter_test(id)""" + assertEquals(1, alter_result.size()) + alter_result = sql """show column cached stats alter_test(id)""" + assertEquals(1, alter_result.size()) // Test trigger type, manual default full, manual high health value, sample empty, kill job, show analyze sql """DROP DATABASE IF EXISTS trigger""" --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org