This is an automated email from the ASF dual-hosted git repository. morrysnow pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 8a8e6edba98 [enhancement](stats) Unify sample algorithm between olap table and external table (#25472) 8a8e6edba98 is described below commit 8a8e6edba98c0515b93a6050c407cc0cea94b2cd Author: AKIRA <33112463+kikyou1...@users.noreply.github.com> AuthorDate: Wed Oct 18 14:18:40 2023 +0800 [enhancement](stats) Unify sample algorithm between olap table and external table (#25472) To reduce error of ndv estimation of olap table's column --- .../apache/doris/statistics/BaseAnalysisTask.java | 9 ++++++ .../apache/doris/statistics/HMSAnalysisTask.java | 7 +---- .../apache/doris/statistics/OlapAnalysisTask.java | 34 +++++++++++----------- 3 files changed, 27 insertions(+), 23 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java index 1377fe05ff8..73ad70eb0df 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java @@ -44,6 +44,15 @@ public abstract class BaseAnalysisTask { public static final Logger LOG = LogManager.getLogger(BaseAnalysisTask.class); + protected static final String NDV_MULTIPLY_THRESHOLD = "0.3"; + + protected static final String NDV_SAMPLE_TEMPLATE = "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, " + + "case when NDV(`${colName}`)/count('${colName}') < " + + NDV_MULTIPLY_THRESHOLD + + " then NDV(`${colName}`) " + + "else NDV(`${colName}`) * ${scaleFactor} end AS ndv, " + ; + /** * Stats stored in the column_statistics table basically has two types, `part_id` is null which means it is * aggregate from partition level stats, `part_id` is not null which means it is partition level stats. diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java index df2396de034..217eb89383c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java @@ -50,7 +50,6 @@ public class HMSAnalysisTask extends BaseAnalysisTask { // While doing sample analysis, the sampled ndv result will multiply a factor (total size/sample size) // if ndv(col)/count(col) is greater than this threshold. - private static final String NDV_MULTIPLY_THRESHOLD = "0.3"; private static final String ANALYZE_TABLE_TEMPLATE = "INSERT INTO " + "${internalDB}.${columnStatTbl}" @@ -62,11 +61,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask { + "${idxId} AS idx_id, " + "'${colId}' AS col_id, " + "NULL AS part_id, " - + "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, " - + "case when NDV(`${colName}`)/count('${colName}') < " - + NDV_MULTIPLY_THRESHOLD - + " then NDV(`${colName}`) " - + "else NDV(`${colName}`) * ${scaleFactor} end AS ndv, " + + NDV_SAMPLE_TEMPLATE + "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor}) AS null_count, " + "MIN(`${colName}`) AS min, " + "MAX(`${colName}`) AS max, " diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java index d14cbc0f0a1..7be6d1cc6ee 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java @@ -54,7 +54,7 @@ public class OlapAnalysisTask extends BaseAnalysisTask { // NDV should only be computed for the relevant partition. private static final String ANALYZE_COLUMN_SQL_TEMPLATE = INSERT_COL_STATISTICS + " (SELECT NDV(`${colName}`) AS ndv " - + " FROM `${dbName}`.`${tblName}`) t2\n"; + + " FROM `${dbName}`.`${tblName}`) t2"; private static final String COLLECT_PARTITION_STATS_SQL_TEMPLATE = " SELECT " @@ -73,22 +73,22 @@ public class OlapAnalysisTask extends BaseAnalysisTask { + "${dataSizeFunction} AS data_size, " + "NOW() FROM `${dbName}`.`${tblName}` PARTITION ${partitionName}"; - private static final String SAMPLE_COLUMN_SQL_TEMPLATE = "SELECT \n" - + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, \n" - + "${catalogId} AS catalog_id, \n" - + "${dbId} AS db_id, \n" - + "${tblId} AS tbl_id, \n" - + "${idxId} AS idx_id, \n" - + "'${colId}' AS col_id, \n" - + "NULL AS part_id, \n" - + "COUNT(1) * ${scaleFactor} AS row_count, \n" - + "NDV(`${colName}`) * ${scaleFactor} AS ndv, \n" - + "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor} AS null_count, \n" - + "MIN(`${colName}`) AS min, \n" - + "MAX(`${colName}`) AS max, \n" - + "${dataSizeFunction} * ${scaleFactor} AS data_size, \n" - + "NOW()\n" - + "FROM `${dbName}`.`${tblName}`\n" + private static final String SAMPLE_COLUMN_SQL_TEMPLATE = "SELECT " + + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, " + + "${catalogId} AS catalog_id, " + + "${dbId} AS db_id, " + + "${tblId} AS tbl_id, " + + "${idxId} AS idx_id, " + + "'${colId}' AS col_id, " + + "NULL AS part_id, " + + "COUNT(1) * ${scaleFactor} AS row_count, " + + NDV_SAMPLE_TEMPLATE + + "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor} AS null_count, " + + "MIN(`${colName}`) AS min, " + + "MAX(`${colName}`) AS max, " + + "${dataSizeFunction} * ${scaleFactor} AS data_size, " + + "NOW() " + + "FROM `${dbName}`.`${tblName}`" + "${tablets}"; // cache stats for each partition, it would be inserted into column_statistics in a batch. --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org