This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
new 1a1649376f6 branch-2.1:[improvement](statistics)Eliminate null values
while sample analyzing ndv. (#50574) (#51648)
1a1649376f6 is described below
commit 1a1649376f650c23a3c95d483c95507fe8aa3263
Author: James <[email protected]>
AuthorDate: Fri Jun 13 15:38:11 2025 +0800
branch-2.1:[improvement](statistics)Eliminate null values while sample
analyzing ndv. (#50574) (#51648)
backport: https://github.com/apache/doris/pull/50574
---
.../doris/nereids/stats/StatsCalculator.java | 11 ++-
.../apache/doris/statistics/BaseAnalysisTask.java | 10 +-
.../org/apache/doris/statistics/ColStatsData.java | 14 ++-
.../doris/statistics/BaseAnalysisTaskTest.java | 46 ++++++++-
.../apache/doris/statistics/ColStatsDataTest.java | 2 +-
.../doris/statistics/OlapAnalysisTaskTest.java | 4 +-
.../hive/test_hive_statistics_all_type_p0.groovy | 4 +-
.../suites/statistics/test_analyze_all_null.groovy | 108 +++++++++++++++++++++
8 files changed, 177 insertions(+), 22 deletions(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
index d2dc508e5b4..0a1ae85adfe 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
@@ -885,8 +885,15 @@ public class StatsCalculator extends
DefaultPlanVisitor<Statistics, Void> {
return ColumnStatistic.UNKNOWN;
}
} else {
- return
Env.getCurrentEnv().getStatisticsCache().getColumnStatistics(
- catalogId, dbId, table.getId(), idxId, colName);
+ ColumnStatistic columnStatistics =
Env.getCurrentEnv().getStatisticsCache().getColumnStatistics(
+ catalogId, dbId, table.getId(), idxId, colName);
+ if (!columnStatistics.isUnKnown
+ && columnStatistics.ndv == 0
+ && (columnStatistics.minExpr != null ||
columnStatistics.maxExpr != null)
+ && columnStatistics.numNulls == columnStatistics.count) {
+ return ColumnStatistic.UNKNOWN;
+ }
+ return columnStatistics;
}
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
index 1513dd2d782..926b7499f05 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
@@ -237,16 +237,14 @@ public abstract class BaseAnalysisTask {
}
protected String getNdvFunction(String totalRows) {
- String sampleRows = "SUM(`t1`.`count`)";
- String onceCount = "SUM(IF(`t1`.`count` = 1, 1, 0))";
- String countDistinct = "COUNT(1)";
+ String n = "SUM(`t1`.`count`)"; // sample rows
+ String f1 = "SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not
null, 1, 0))";
+ String d = "COUNT(`t1`.`column_key`)"; // sample ndv
// DUJ1 estimator: n*d / (n - f1 + f1*n/N)
// f1 is the count of element that appears only once in the sample.
//
(https://github.com/postgres/postgres/blob/master/src/backend/commands/analyze.c)
//
(http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.93.8637&rep=rep1&type=pdf)
- // sample_row * count_distinct / ( sample_row - once_count +
once_count * sample_row / total_row)
- return MessageFormat.format("{0} * {1} / ({0} - {2} + {2} * {0} /
{3})", sampleRows,
- countDistinct, onceCount, totalRows);
+ return MessageFormat.format("{0} * {1} / ({0} - {2} + {2} * {0} /
{3})", n, d, f1, totalRows);
}
// Max value is not accurate while sample, so set it to NULL to avoid
optimizer generate bad plan.
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
index cc79f779f6e..5d4158db3fc 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
@@ -189,15 +189,19 @@ public class ColStatsData {
public boolean isValid() {
if (ndv > 10 * count) {
- LOG.debug("Ndv {} is much larger than count {}", ndv, count);
+ String message = String.format("ColStatsData ndv too large. %s",
toSQL(true));
+ LOG.warn(message);
return false;
}
- if (ndv == 0 && (!isNull(minLit) || !isNull(maxLit))) {
- LOG.debug("Ndv is 0 but min or max exists");
+ if (ndv == 0 && (!isNull(minLit) || !isNull(maxLit)) && nullCount !=
count) {
+ String message = String.format("ColStatsData ndv 0 but min/max is
not null and nullCount != count. %s",
+ toSQL(true));
+ LOG.warn(message);
return false;
}
- if (count > 0 && ndv == 0 && isNull(minLit) && isNull(maxLit) &&
(nullCount == 0 || count > nullCount * 10)) {
- LOG.debug("count {} not 0, ndv is 0, min and max are all null,
null count {} is too small", count, count);
+ if (count > 0 && ndv == 0 && isNull(minLit) && isNull(maxLit) &&
(count > nullCount * 10)) {
+ LOG.warn("count {} not 0, ndv is 0, min and max are all null, null
count {} is too small",
+ count, nullCount);
return false;
}
return true;
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java
index dfa0d9fca9e..8405216a271 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java
@@ -62,13 +62,12 @@ public class BaseAnalysisTaskTest {
Assertions.assertEquals("NULL", maxFunction);
String ndvFunction =
olapAnalysisTask.getNdvFunction(String.valueOf(100));
- Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(1) /
(SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) "
- + "+ SUM(IF(`t1`.`count` = 1, 1, 0)) * SUM(`t1`.`count`) /
100)", ndvFunction);
+ Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(`t1`.`column_key`)
/ (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not
null, 1, 0)) + SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not null, 1,
0)) * SUM(`t1`.`count`) / 100)", ndvFunction);
System.out.println(ndvFunction);
}
@Test
- public void testInvalidColStats() {
+ public void testNdvTooLarge() {
List<String> values = Lists.newArrayList();
values.add("id");
values.add("10000");
@@ -101,10 +100,49 @@ public class BaseAnalysisTaskTest {
} catch (Exception e) {
Assertions.assertEquals(e.getMessage(),
"ColStatsData is invalid, skip analyzing. "
- +
"('id',10000,20000,30000,0,'col',null,100,1100,300,'min','max',400,'500')");
+ +
"('id',10000,20000,30000,0,'col',null,100,1100,300,'min','max',400,'500')");
return;
}
Assertions.fail();
}
+ @Test
+ public void testNdv0MinMaxExistsNullNotEqualCount() {
+ List<String> values = Lists.newArrayList();
+ values.add("id");
+ values.add("10000");
+ values.add("20000");
+ values.add("30000");
+ values.add("0");
+ values.add("col");
+ values.add(null);
+ values.add("500"); // count
+ values.add("0"); // ndv
+ values.add("300"); // null
+ values.add("min");
+ values.add("max");
+ values.add("400");
+ values.add("500");
+ ResultRow row = new ResultRow(values);
+ List<ResultRow> result = Lists.newArrayList();
+ result.add(row);
+
+ new MockUp<StmtExecutor>() {
+ @Mock
+ public List<ResultRow> executeInternalQuery() {
+ return result;
+ }
+ };
+ BaseAnalysisTask task = new OlapAnalysisTask();
+ task.info = new
AnalysisInfoBuilder().setJobType(JobType.MANUAL).build();
+ try {
+ task.runQuery("test");
+ } catch (Exception e) {
+ Assertions.assertEquals(e.getMessage(),
+ "ColStatsData is invalid, skip analyzing. "
+ +
"('id',10000,20000,30000,0,'col',null,500,0,300,'min','max',400,'500')");
+ return;
+ }
+ Assertions.fail();
+ }
}
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java
index a7c84370153..b193edb2049 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java
@@ -215,7 +215,7 @@ public class ColStatsDataTest {
data = new ColStatsData(row);
Assertions.assertFalse(data.isValid());
- // Set max to null, min/max is not null
+ // Set max to null, min/max are all null
values.set(11, null);
row = new ResultRow(values);
data = new ColStatsData(row);
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
index 96d76a61908..3e0e309a488 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
@@ -370,7 +370,7 @@ public class OlapAnalysisTaskTest {
Assertions.assertTrue(task.scanFullTable());
Assertions.assertEquals("1.0", params.get("scaleFactor"));
Assertions.assertEquals("", params.get("sampleHints"));
- Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(1) /
(SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1,
1, 0)) * SUM(`t1`.`count`) / 10)", params.get("ndvFunction"));
+ Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(`t1`.`column_key`)
/ (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not
null, 1, 0)) + SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not null, 1,
0)) * SUM(`t1`.`count`) / 10)", params.get("ndvFunction"));
params.clear();
task = new OlapAnalysisTask();
@@ -378,7 +378,7 @@ public class OlapAnalysisTaskTest {
task.getSampleParams(params, 1000);
Assertions.assertEquals("10.0", params.get("scaleFactor"));
Assertions.assertEquals("TABLET(1, 2)", params.get("sampleHints"));
- Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(1) /
(SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1,
1, 0)) * SUM(`t1`.`count`) / 1000)", params.get("ndvFunction"));
+ Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(`t1`.`column_key`)
/ (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not
null, 1, 0)) + SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not null, 1,
0)) * SUM(`t1`.`count`) / 1000)", params.get("ndvFunction"));
Assertions.assertEquals("SUM(t1.count) * 4",
params.get("dataSizeFunction"));
Assertions.assertEquals("`${colName}`",
params.get("subStringColName"));
params.clear();
diff --git
a/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy
b/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy
index 7d85f288bd7..b111247d9e7 100644
---
a/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy
+++
b/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy
@@ -41,14 +41,14 @@ suite("test_hive_statistics_all_type_p0",
"all_types,p0,external,hive,external_d
result = sql """show column stats orc_all_types (int_col);"""
assertEquals("int_col", result[0][0])
assertEquals("3600.0", result[0][2])
- assertEquals("3240.0", result[0][3])
+ assertEquals("3239.0", result[0][3])
assertEquals("361.0", result[0][4])
assertEquals("14400.0", result[0][5])
result = sql """show column stats orc_all_types (string_col);"""
assertEquals("string_col", result[0][0])
assertEquals("3600.0", result[0][2])
- assertEquals("3254.0", result[0][3])
+ assertEquals("3253.0", result[0][3])
assertEquals("347.0", result[0][4])
assertEquals("453634.0", result[0][5])
diff --git a/regression-test/suites/statistics/test_analyze_all_null.groovy
b/regression-test/suites/statistics/test_analyze_all_null.groovy
new file mode 100644
index 00000000000..5defabb67eb
--- /dev/null
+++ b/regression-test/suites/statistics/test_analyze_all_null.groovy
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_analyze_all_null") {
+
+ def wait_row_count_reported = { db, table, row, column, expected ->
+ def result = sql """show frontends;"""
+ logger.info("show frontends result origin: " + result)
+ def host
+ def port
+ for (int i = 0; i < result.size(); i++) {
+ if (result[i][8] == "true") {
+ host = result[i][1]
+ port = result[i][4]
+ }
+ }
+ def tokens = context.config.jdbcUrl.split('/')
+ def url=tokens[0] + "//" + host + ":" + port
+ logger.info("Master url is " + url)
+ connect(context.config.jdbcUser, context.config.jdbcPassword, url) {
+ sql """use ${db}"""
+ result = sql """show frontends;"""
+ logger.info("show frontends result master: " + result)
+ for (int i = 0; i < 120; i++) {
+ Thread.sleep(5000)
+ result = sql """SHOW DATA FROM ${table};"""
+ logger.info("result " + result)
+ if (result[row][column] == expected) {
+ return;
+ }
+ }
+ throw new Exception("Row count report timeout.")
+ }
+
+ }
+
+ sql """drop database if exists test_analyze_all_null"""
+ sql """create database test_analyze_all_null"""
+ sql """use test_analyze_all_null"""
+ sql """set global enable_auto_analyze=false"""
+
+ sql """CREATE TABLE allnull (
+ key1 int NULL,
+ value1 varchar(25) NULL
+ )ENGINE=OLAP
+ DUPLICATE KEY(`key1`)
+ COMMENT "OLAP"
+ DISTRIBUTED BY HASH(`key1`) BUCKETS 2
+ PROPERTIES (
+ "replication_num" = "1"
+ )
+ """
+ sql """insert into allnull select null, null from
numbers("number"="10000000")"""
+ wait_row_count_reported("test_analyze_all_null", "allnull", 0, 4,
"10000000")
+ sql """analyze table allnull with sample rows 4000000 with sync"""
+
+ def result = sql """show column stats allnull(key1)"""
+ assertEquals(1, result.size())
+ assertEquals("1.0E7", result[0][2])
+ assertEquals("0.0", result[0][3])
+ result = sql """show column stats allnull(value1)"""
+ assertEquals(1, result.size())
+ assertEquals("1.0E7", result[0][2])
+ assertEquals("0.0", result[0][3])
+
+ sql """CREATE TABLE invalidTest (
+ col1 int NULL,
+ col2 string NULL,
+ col3 string NULL
+ )ENGINE=OLAP
+ DUPLICATE KEY(`col1`)
+ COMMENT "OLAP"
+ DISTRIBUTED BY HASH(`col1`) BUCKETS 2
+ PROPERTIES (
+ "replication_num" = "1"
+ )
+ """
+ sql """insert into invalidTest values(1, "1", "1")"""
+
+ sql """alter table invalidTest modify column col1 set stats
('row_count'='100', 'ndv'='100', 'num_nulls'='0.0', 'data_size'='3.2E8',
'min_value'='1', 'max_value'='20000000');"""
+ sql """alter table invalidTest modify column col2 set stats
('row_count'='100', 'ndv'='0', 'num_nulls'='0.0', 'data_size'='3.2E8',
'min_value'='min', 'max_value'='max');"""
+ sql """alter table invalidTest modify column col3 set stats
('row_count'='100', 'ndv'='0', 'num_nulls'='100', 'data_size'='3.2E8',
'min_value'='min', 'max_value'='max');"""
+ result = sql """show column cached stats invalidTest"""
+ assertEquals(3, result.size())
+
+ explain {
+ sql("memo plan select * from invalidTest")
+ contains "col1#0 -> ndv=100.0000"
+ contains "col2#1 -> ndv=0.0000"
+ contains "col3#2 -> unknown(100.0)"
+ }
+
+ sql """drop database if exists test_analyze_all_null"""
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]