This is an automated email from the ASF dual-hosted git repository.

morrysnow pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 8a8e6edba98 [enhancement](stats) Unify sample algorithm between olap table and external table (#25472)
8a8e6edba98 is described below

commit 8a8e6edba98c0515b93a6050c407cc0cea94b2cd
Author: AKIRA <33112463+kikyou1...@users.noreply.github.com>
AuthorDate: Wed Oct 18 14:18:40 2023 +0800

    [enhancement](stats) Unify sample algorithm between olap table and external table (#25472)
    
    To reduce the error of NDV estimation for olap table columns.
---
 .../apache/doris/statistics/BaseAnalysisTask.java  |  9 ++++++
 .../apache/doris/statistics/HMSAnalysisTask.java   |  7 +----
 .../apache/doris/statistics/OlapAnalysisTask.java  | 34 +++++++++++-----------
 3 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
index 1377fe05ff8..73ad70eb0df 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
@@ -44,6 +44,15 @@ public abstract class BaseAnalysisTask {
 
     public static final Logger LOG = LogManager.getLogger(BaseAnalysisTask.class);
 
+    protected static final String NDV_MULTIPLY_THRESHOLD = "0.3";
+
+    protected static final String NDV_SAMPLE_TEMPLATE = "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, "
+            + "case when NDV(`${colName}`)/count('${colName}') < "
+            + NDV_MULTIPLY_THRESHOLD
+            + " then NDV(`${colName}`) "
+            + "else NDV(`${colName}`) * ${scaleFactor} end AS ndv, "
+            ;
+
     /**
      * Stats stored in the column_statistics table basically has two types, `part_id` is null which means it is
      * aggregate from partition level stats, `part_id` is not null which means it is partition level stats.
diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
index df2396de034..217eb89383c 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
@@ -50,7 +50,6 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
 
     // While doing sample analysis, the sampled ndv result will multiply a factor (total size/sample size)
     // if ndv(col)/count(col) is greater than this threshold.
-    private static final String NDV_MULTIPLY_THRESHOLD = "0.3";
 
     private static final String ANALYZE_TABLE_TEMPLATE = "INSERT INTO "
             + "${internalDB}.${columnStatTbl}"
@@ -62,11 +61,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
             + "${idxId} AS idx_id, "
             + "'${colId}' AS col_id, "
             + "NULL AS part_id, "
-            + "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, "
-            + "case when NDV(`${colName}`)/count('${colName}') < "
-            + NDV_MULTIPLY_THRESHOLD
-            + " then NDV(`${colName}`) "
-            + "else NDV(`${colName}`) * ${scaleFactor} end AS ndv, "
+            + NDV_SAMPLE_TEMPLATE
             + "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 
${scaleFactor}) AS null_count, "
             + "MIN(`${colName}`) AS min, "
             + "MAX(`${colName}`) AS max, "
diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
index d14cbc0f0a1..7be6d1cc6ee 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
@@ -54,7 +54,7 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
     //  NDV should only be computed for the relevant partition.
     private static final String ANALYZE_COLUMN_SQL_TEMPLATE = INSERT_COL_STATISTICS
             + "     (SELECT NDV(`${colName}`) AS ndv "
-            + "     FROM `${dbName}`.`${tblName}`) t2\n";
+            + "     FROM `${dbName}`.`${tblName}`) t2";
 
     private static final String COLLECT_PARTITION_STATS_SQL_TEMPLATE =
             " SELECT "
@@ -73,22 +73,22 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
                     + "${dataSizeFunction} AS data_size, "
                     + "NOW() FROM `${dbName}`.`${tblName}` PARTITION 
${partitionName}";
 
-    private static final String SAMPLE_COLUMN_SQL_TEMPLATE = "SELECT \n"
-            + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, \n"
-            + "${catalogId} AS catalog_id, \n"
-            + "${dbId} AS db_id, \n"
-            + "${tblId} AS tbl_id, \n"
-            + "${idxId} AS idx_id, \n"
-            + "'${colId}' AS col_id, \n"
-            + "NULL AS part_id, \n"
-            + "COUNT(1) * ${scaleFactor} AS row_count, \n"
-            + "NDV(`${colName}`) * ${scaleFactor}  AS ndv, \n"
-            + "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 
${scaleFactor} AS null_count, \n"
-            + "MIN(`${colName}`) AS min, \n"
-            + "MAX(`${colName}`) AS max, \n"
-            + "${dataSizeFunction} * ${scaleFactor} AS data_size, \n"
-            + "NOW()\n"
-            + "FROM `${dbName}`.`${tblName}`\n"
+    private static final String SAMPLE_COLUMN_SQL_TEMPLATE = "SELECT "
+            + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, "
+            + "${catalogId} AS catalog_id, "
+            + "${dbId} AS db_id, "
+            + "${tblId} AS tbl_id, "
+            + "${idxId} AS idx_id, "
+            + "'${colId}' AS col_id, "
+            + "NULL AS part_id, "
+            + "COUNT(1) * ${scaleFactor} AS row_count, "
+            + NDV_SAMPLE_TEMPLATE
+            + "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 
${scaleFactor} AS null_count, "
+            + "MIN(`${colName}`) AS min, "
+            + "MAX(`${colName}`) AS max, "
+            + "${dataSizeFunction} * ${scaleFactor} AS data_size, "
+            + "NOW() "
+            + "FROM `${dbName}`.`${tblName}`"
             + "${tablets}";
 
     // cache stats for each partition, it would be inserted into column_statistics in a batch.
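
For context on the rule the shared NDV_SAMPLE_TEMPLATE encodes: the sampled NDV is scaled by ${scaleFactor} (total size / sample size) only when NDV(`${colName}`)/count('${colName}') is at or above the 0.3 NDV_MULTIPLY_THRESHOLD; below that ratio the raw sampled NDV is kept. A rough sketch of what the fragment renders to, using purely hypothetical substitutions (${colName} = c_custkey, ${scaleFactor} = 20.0, and a made-up table name):

    -- Illustration only; column, table, and the 20.0 scale factor are invented for this sketch.
    SELECT
        ROUND(COUNT(1) * 20.0) AS row_count,
        CASE WHEN NDV(`c_custkey`) / COUNT('c_custkey') < 0.3
             THEN NDV(`c_custkey`)           -- low ratio: keep the sampled NDV as-is
             ELSE NDV(`c_custkey`) * 20.0    -- high ratio: scale by total size / sample size
        END AS ndv
    FROM `tpch`.`customer`;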

