This is an automated email from the ASF dual-hosted git repository.

lijibing pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new 177e1af868f Return UNKNOWN column stats if ndv is 0. (#31439) (#31588)
177e1af868f is described below

commit 177e1af868f118aeef1977b0b41252797125a3e5
Author: Jibing-Li <64681310+jibing...@users.noreply.github.com>
AuthorDate: Thu Feb 29 16:56:26 2024 +0800

    Return UNKNOWN column stats if ndv is 0. (#31439) (#31588)
---
 .../org/apache/doris/statistics/ColStatsData.java  |  5 ++++
 .../doris/statistics/ColumnStatisticBuilder.java   |  4 +--
 .../statistics/ColumnStatisticsCacheLoader.java    | 30 ++++++++++++++--------
 .../apache/doris/statistics/StatisticsCache.java   |  5 +++-
 .../doris/nereids/stats/FilterEstimationTest.java  |  2 ++
 .../suites/statistics/analyze_stats.groovy         | 17 ++++++++++++
 6 files changed, 50 insertions(+), 13 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
index c90b3dd8e1d..6bbafdbe5b5 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
@@ -131,6 +131,11 @@ public class ColStatsData {
     }
 
     public ColumnStatistic toColumnStatistic() {
+        // ndv == 0 on a non-empty column that is not all NULL means ndv could not be
+        // collected; return UNKNOWN, because an inaccurate ndv is very misleading.
+        if (count > 0 && ndv == 0 && count != nullCount) {
+            return ColumnStatistic.UNKNOWN;
+        }
         try {
             ColumnStatisticBuilder columnStatisticBuilder = new 
ColumnStatisticBuilder();
             columnStatisticBuilder.setCount(count);
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
index f8ed6a1b6ab..a512fbadbda 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
@@ -25,8 +25,8 @@ public class ColumnStatisticBuilder {
     private double avgSizeByte;
     private double numNulls;
     private double dataSize;
-    private double minValue;
-    private double maxValue;
+    private double minValue = Double.NEGATIVE_INFINITY;
+    private double maxValue = Double.POSITIVE_INFINITY;
     private LiteralExpr minExpr;
     private LiteralExpr maxExpr;
 
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticsCacheLoader.java
 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticsCacheLoader.java
index 0b66fa5e7b1..bc5fc4c10c0 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticsCacheLoader.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticsCacheLoader.java
@@ -37,22 +37,32 @@ public class ColumnStatisticsCacheLoader extends 
StatisticsCacheLoader<Optional<
         try {
             // Load from statistics table.
             columnStatistic = loadFromStatsTable(key);
-            if (columnStatistic.isPresent()) {
-                return columnStatistic;
-            }
-            // Load from data source metadata
-            try {
-                TableIf table = StatisticsUtil.findTable(key.catalogId, 
key.dbId, key.tableId);
-                columnStatistic = table.getColumnStatistic(key.colName);
-            } catch (Exception e) {
-                LOG.debug(String.format("Exception to get column statistics by 
metadata. [Catalog:{}, DB:{}, Table:{}]",
-                        key.catalogId, key.dbId, key.tableId), e);
+            if (!columnStatistic.isPresent()) {
+                // Load from data source metadata
+                try {
+                    TableIf table = StatisticsUtil.findTable(key.catalogId, 
key.dbId, key.tableId);
+                    columnStatistic = table.getColumnStatistic(key.colName);
+                } catch (Exception e) {
+                    if (LOG.isDebugEnabled()) {
+                        LOG.debug(String.format("Exception to get column 
statistics by metadata."
+                                + "[Catalog:{}, DB:{}, Table:{}]",
+                                key.catalogId, key.dbId, key.tableId), e);
+                    }
+                }
             }
         } catch (Throwable t) {
             LOG.warn("Failed to load stats for column [Catalog:{}, DB:{}, 
Table:{}, Column:{}], Reason: {}",
                     key.catalogId, key.dbId, key.tableId, key.colName, 
t.getMessage());
             LOG.debug(t);
         }
+        if (columnStatistic.isPresent()) {
+            // ndv == 0 on a non-empty column that is not all NULL means ndv could not be
+            // collected; return UNKNOWN, because an inaccurate ndv is very misleading.
+            ColumnStatistic stats = columnStatistic.get();
+            if (stats.count > 0 && stats.ndv == 0 && stats.count != 
stats.numNulls) {
+                columnStatistic = Optional.of(ColumnStatistic.UNKNOWN);
+            }
+        }
         return columnStatistic;
     }
 
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java
index 62e11f5c9d8..1826f10a38a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java
@@ -173,7 +173,10 @@ public class StatisticsCache {
                 String colId = statsId.colId;
                 final StatisticsCacheKey k =
                         new StatisticsCacheKey(tblId, idxId, colId);
-                final ColumnStatistic c = ColumnStatistic.fromResultRow(r);
+                ColumnStatistic c = ColumnStatistic.fromResultRow(r);
+                if (c.count > 0 && c.ndv == 0 && c.count != c.numNulls) {
+                    c = ColumnStatistic.UNKNOWN;
+                }
                 putCache(k, c);
             } catch (Throwable t) {
                 LOG.warn("Error when preheating stats cache", t);
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
 
b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
index 66e64145901..9b37f1119a4 100644
--- 
a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
+++ 
b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
@@ -134,6 +134,8 @@ class FilterEstimationTest {
         Map<Expression, ColumnStatistic> slotToColumnStat = new HashMap<>();
         ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
                 .setNdv(500)
+                .setMaxValue(0)
+                .setMinValue(0)
                 .setIsUnknown(false);
         slotToColumnStat.put(a, builder.build());
         Statistics stat = new Statistics(1000, slotToColumnStat);
diff --git a/regression-test/suites/statistics/analyze_stats.groovy 
b/regression-test/suites/statistics/analyze_stats.groovy
index 35ba207b882..1fa86e52b02 100644
--- a/regression-test/suites/statistics/analyze_stats.groovy
+++ b/regression-test/suites/statistics/analyze_stats.groovy
@@ -2670,6 +2670,23 @@ PARTITION `p599` VALUES IN (599)
     sql """drop stats alter_test"""
     alter_result = sql """show table stats alter_test"""
     assertEquals("false", alter_result[0][7])
+    sql """alter table alter_test modify column id set stats 
('row_count'='100', 'ndv'='0', 'num_nulls'='0.0', 'data_size'='2.69975443E8', 
'min_value'='1', 'max_value'='2');"""
+    alter_result = sql """show column stats alter_test(id)"""
+    assertEquals(1, alter_result.size())
+    alter_result = sql """show column cached stats alter_test(id)"""
+    assertEquals(0, alter_result.size())
+    alter_result = sql """show column cached stats alter_test(id)"""
+    assertEquals(0, alter_result.size())
+    sql """alter table alter_test modify column id set stats 
('row_count'='100', 'ndv'='0', 'num_nulls'='100', 'data_size'='2.69975443E8', 
'min_value'='1', 'max_value'='2');"""
+    alter_result = sql """show column stats alter_test(id)"""
+    assertEquals(1, alter_result.size())
+    alter_result = sql """show column cached stats alter_test(id)"""
+    assertEquals(1, alter_result.size())
+    sql """alter table alter_test modify column id set stats 
('row_count'='100', 'ndv'='1', 'num_nulls'='0', 'data_size'='2.69975443E8', 
'min_value'='1', 'max_value'='2');"""
+    alter_result = sql """show column stats alter_test(id)"""
+    assertEquals(1, alter_result.size())
+    alter_result = sql """show column cached stats alter_test(id)"""
+    assertEquals(1, alter_result.size())
 
     // Test trigger type, manual default full, manual high health value, 
sample empty, kill job, show analyze
     sql """DROP DATABASE IF EXISTS trigger"""


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to