This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
     new 1f9ae3427f1  [feature](statistics)support statistics for iceberg/paimon/hudi table (#29868)
1f9ae3427f1 is described below

commit 1f9ae3427f155c9ae487bb2f353492ad9fc0252c
Author: wuwenchi <wuwenchi...@hotmail.com>
AuthorDate: Wed Jan 17 17:43:05 2024 +0800

    [feature](statistics)support statistics for iceberg/paimon/hudi table (#29868)
---
 .../docker-compose/iceberg/iceberg.env             |   2 +-
 .../docker-compose/iceberg/iceberg.yaml.tpl        |   2 +
 .../catalog/external/IcebergExternalTable.java     |   9 +
 .../catalog/external/PaimonExternalTable.java      |   9 +
 .../apache/doris/common/proc/CatalogsProcDir.java  |  10 +-
 ...AnalysisTask.java => ExternalAnalysisTask.java} | 173 ++------------
 .../apache/doris/statistics/HMSAnalysisTask.java   | 252 ++-------------------
 .../doris/statistics/util/StatisticsUtil.java      |  10 +-
 .../doris/statistics/HMSAnalysisTaskTest.java      |   4 +-
 regression-test/conf/regression-conf.groovy        |   1 +
 .../iceberg/test_iceberg_statistics.out            |  39 ++++
 .../paimon/test_paimon_statistics.out              |  21 ++
 .../hive/test_hive_hudi_statistics.out             |  16 ++
 .../pipeline/p0/conf/regression-conf.groovy        |   1 +
 .../tpch/tpch-sf100/conf/regression-conf.groovy    |   1 +
 .../plugins/plugins_get_ids_from_proc.groovy       |  62 +++++
 .../iceberg/test_iceberg_statistics.groovy         |  57 +++++
 .../paimon/test_paimon_catalog.groovy              |   3 +
 .../paimon/test_paimon_statistics.groovy           |  47 ++++
 .../external_table_p0/test_catalog_ddl.groovy      |   2 +
 .../hive/test_hive_hudi_statistics.groovy          |  47 ++++
 21 files changed, 372 insertions(+), 396 deletions(-)

diff --git a/docker/thirdparties/docker-compose/iceberg/iceberg.env b/docker/thirdparties/docker-compose/iceberg/iceberg.env
index 4cc8b42eaf9..6bebd49f437 100644
--- a/docker/thirdparties/docker-compose/iceberg/iceberg.env
+++ b/docker/thirdparties/docker-compose/iceberg/iceberg.env
@@ -21,4 +21,4 @@ SPARK_DRIVER_UI_PORT=8080
 SPARK_HISTORY_UI_PORT=10000
 REST_CATALOG_PORT=18181
 MINIO_UI_PORT=9000
-MINIO_API_PORT=9001
+MINIO_API_PORT=19001
diff --git a/docker/thirdparties/docker-compose/iceberg/iceberg.yaml.tpl b/docker/thirdparties/docker-compose/iceberg/iceberg.yaml.tpl
index d7220f24376..bc217c1dd6e 100644
--- a/docker/thirdparties/docker-compose/iceberg/iceberg.yaml.tpl
+++ b/docker/thirdparties/docker-compose/iceberg/iceberg.yaml.tpl
@@ -58,6 +58,8 @@ services:
   minio:
     image: minio/minio
     container_name: doris--minio
+    ports:
+      - ${MINIO_API_PORT}:9000
     environment:
       - MINIO_ROOT_USER=admin
       - MINIO_ROOT_PASSWORD=password
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/IcebergExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/IcebergExternalTable.java
index 7398ff19c9e..be99e26de62 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/IcebergExternalTable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/IcebergExternalTable.java
@@ -23,7 +23,10 @@ import org.apache.doris.catalog.HiveMetaStoreClientHelper;
 import org.apache.doris.catalog.ScalarType;
 import org.apache.doris.catalog.Type;
 import org.apache.doris.datasource.iceberg.IcebergExternalCatalog;
+import org.apache.doris.statistics.AnalysisInfo;
+import org.apache.doris.statistics.BaseAnalysisTask;
 import org.apache.doris.statistics.ColumnStatistic;
+import org.apache.doris.statistics.ExternalAnalysisTask;
 import org.apache.doris.statistics.util.StatisticsUtil;
 import org.apache.doris.thrift.THiveTable;
 import org.apache.doris.thrift.TIcebergTable;
@@ -149,4 +152,10 @@ public class IcebergExternalTable extends ExternalTable {
                 () -> StatisticsUtil.getIcebergColumnStats(colName,
                         ((IcebergExternalCatalog) catalog).getIcebergTable(dbName, name)));
     }
+
+    @Override
+    public BaseAnalysisTask createAnalysisTask(AnalysisInfo info) {
+        makeSureInitialized();
+        return new ExternalAnalysisTask(info);
+    }
 }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/PaimonExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/PaimonExternalTable.java
index c8ea253671d..b517265df6a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/PaimonExternalTable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/PaimonExternalTable.java
@@ -21,6 +21,9 @@ import org.apache.doris.catalog.Column;
 import org.apache.doris.catalog.ScalarType;
 import org.apache.doris.catalog.Type;
 import org.apache.doris.datasource.paimon.PaimonExternalCatalog;
+import org.apache.doris.statistics.AnalysisInfo;
+import org.apache.doris.statistics.BaseAnalysisTask;
+import org.apache.doris.statistics.ExternalAnalysisTask;
 import org.apache.doris.thrift.THiveTable;
 import org.apache.doris.thrift.TTableDescriptor;
 import org.apache.doris.thrift.TTableType;
@@ -154,4 +157,10 @@ public class PaimonExternalTable extends ExternalTable {
                     + getPaimonCatalogType());
         }
     }
+
+    @Override
+    public BaseAnalysisTask createAnalysisTask(AnalysisInfo info) {
+        makeSureInitialized();
+        return new ExternalAnalysisTask(info);
+    }
 }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/proc/CatalogsProcDir.java b/fe/fe-core/src/main/java/org/apache/doris/common/proc/CatalogsProcDir.java
index 854b4dddc79..e6163645c28 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/common/proc/CatalogsProcDir.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/common/proc/CatalogsProcDir.java
@@ -27,6 +27,7 @@ import com.google.common.base.Preconditions;
 import com.google.common.base.Strings;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Lists;
+import org.apache.log4j.Logger;
 
 import java.util.ArrayList;
 import java.util.Collections;
@@ -37,6 +38,7 @@ import java.util.List;
  * show all catalogs' info
  */
 public class CatalogsProcDir implements ProcDirInterface {
+    private static final Logger LOG = Logger.getLogger(CatalogsProcDir.class);
     public static final ImmutableList<String> TITLE_NAMES = new ImmutableList.Builder<String>()
             .add("CatalogIds").add("CatalogName").add("DatabaseNum").add("LastUpdateTime")
             .build();
@@ -90,7 +92,13 @@ public class CatalogsProcDir implements ProcDirInterface {
             List<Comparable> catalogInfo = Lists.newArrayList();
             catalogInfo.add(catalog.getId());
             catalogInfo.add(catalog.getName());
-            catalogInfo.add(catalog.getDbNames().size());
+            int size = -1;
+            try {
+                size = catalog.getDbNames().size();
+            } catch (Exception e) {
+                LOG.warn("failed to get database: ", e);
+            }
+            catalogInfo.add(size);
             catalogInfo.add(TimeUtils.longToTimeString(catalog.getLastUpdateTime()));
             catalogInfos.add(catalogInfo);
         }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java
similarity index 59%
copy from fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
copy to fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java
index fd0a4c82538..15848c013d6
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java
@@ -19,15 +19,11 @@ package org.apache.doris.statistics;
 
 import org.apache.doris.catalog.Column;
 import org.apache.doris.catalog.Env;
-import org.apache.doris.catalog.external.HMSExternalTable;
-import org.apache.doris.common.AnalysisException;
+import org.apache.doris.catalog.external.ExternalTable;
 import org.apache.doris.common.FeConstants;
 import org.apache.doris.common.Pair;
-import org.apache.doris.datasource.hive.HiveMetaStoreCache;
-import org.apache.doris.external.hive.util.HiveUtil;
 import org.apache.doris.statistics.util.StatisticsUtil;
 
-import com.google.common.collect.Sets;
 import org.apache.commons.text.StringSubstitutor;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
@@ -39,42 +35,43 @@ import java.util.Map;
 import java.util.Random;
 import java.util.Set;
 
-public class HMSAnalysisTask extends BaseAnalysisTask {
-    private static final Logger LOG = LogManager.getLogger(HMSAnalysisTask.class);
+public class ExternalAnalysisTask extends BaseAnalysisTask {
+    private static final Logger LOG = LogManager.getLogger(ExternalAnalysisTask.class);
     private static final String ANALYZE_TABLE_COUNT_TEMPLATE = "SELECT ROUND(COUNT(1) * ${scaleFactor}) as rowCount "
             + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleHints}";
 
     private boolean isTableLevelTask;
     private boolean isPartitionOnly;
-    private HMSExternalTable table;
+    private ExternalTable table;
 
-    public HMSAnalysisTask() {
+    // For test
+    public ExternalAnalysisTask() {
     }
 
-    public HMSAnalysisTask(AnalysisInfo info) {
+    public ExternalAnalysisTask(AnalysisInfo info) {
         super(info);
         isTableLevelTask = info.externalTableLevelTask;
         isPartitionOnly = info.partitionOnly;
-        table = (HMSExternalTable) tbl;
+        table = (ExternalTable) tbl;
     }
 
     public void doExecute() throws Exception {
         if (isTableLevelTask) {
             getTableStats();
         } else {
-            getTableColumnStats();
+            getOrdinaryColumnStats();
         }
     }
 
     // For test
-    protected void setTable(HMSExternalTable table) {
+    protected void setTable(ExternalTable table) {
         this.table = table;
     }
 
     /**
     * Get table row count
     */
-    private void getTableStats() throws Exception {
+    private void getTableStats() {
         Map<String, String> params = buildStatsParams(null);
         List<ResultRow> columnResult =
                 StatisticsUtil.execStatisticQuery(new StringSubstitutor(params)
@@ -86,35 +83,8 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
         job.rowCountDone(this);
     }
 
-    /**
-     * Get column statistics and insert the result to __internal_schema.column_statistics
-     */
-    protected void getTableColumnStats() throws Exception {
-        if (!info.usingSqlForPartitionColumn) {
-            try {
-                if (isPartitionColumn()) {
-                    getPartitionColumnStats();
-                } else {
-                    getHmsColumnStats();
-                }
-            } catch (Exception e) {
-                LOG.warn("Failed to collect stats for {}col {} using metadata, "
-                        + "fallback to normal collection",
-                        isPartitionColumn() ? "partition " : "", col.getName(), e);
-                /* retry using sql way! */
-                getOrdinaryColumnStats();
-            }
-        } else {
-            getOrdinaryColumnStats();
-        }
-    }
-
-    private boolean isPartitionColumn() {
-        return table.getPartitionColumns().stream().anyMatch(c -> c.getName().equals(col.getName()));
-    }
-
-    // Get ordinary column stats. Ordinary column means not partition column.
-    private void getOrdinaryColumnStats() throws Exception {
+    // Get ordinary column stats
+    protected void getOrdinaryColumnStats() throws Exception {
         StringBuilder sb = new StringBuilder();
         Map<String, String> params = buildStatsParams("NULL");
         params.put("min", getMinFunction());
         params.put("max", getMaxFunction());
         params.put("dataSizeFunction", getDataSizeFunction(col, false));
@@ -169,122 +139,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
         runQuery(sql);
     }
 
-    // Collect the partition column stats through HMS metadata.
-    // Get all the partition values and calculate the stats based on the values.
-    private void getPartitionColumnStats() throws Exception {
-        Set<String> partitionNames = table.getPartitionNames();
-        Set<String> ndvPartValues = Sets.newHashSet();
-        long numNulls = 0;
-        long dataSize = 0;
-        String min = null;
-        String max = null;
-        for (String names : partitionNames) {
-            // names is like "date=20230101" for one level partition
-            // and like "date=20230101/hour=12" for two level partition
-            String[] parts = names.split("/");
-            for (String part : parts) {
-                if (part.startsWith(col.getName())) {
-                    String value = HiveUtil.getHivePartitionValue(part);
-                    // HIVE_DEFAULT_PARTITION hive partition value when the partition name is not specified.
-                    if (value == null || value.isEmpty() || value.equals(HiveMetaStoreCache.HIVE_DEFAULT_PARTITION)) {
-                        numNulls += 1;
-                        continue;
-                    }
-                    ndvPartValues.add(value);
-                    dataSize += col.getType().isStringType() ? value.length() : col.getType().getSlotSize();
-                    min = updateMinValue(min, value);
-                    max = updateMaxValue(max, value);
-                }
-            }
-        }
-        // Estimate the row count. This value is inaccurate if the table stats is empty.
-        TableStatsMeta tableStatsStatus = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId());
-        long count = tableStatsStatus == null ? table.estimatedRowCount() : tableStatsStatus.rowCount;
-        dataSize = dataSize * count / partitionNames.size();
-        numNulls = numNulls * count / partitionNames.size();
-        int ndv = ndvPartValues.size();
-
-        Map<String, String> params = buildStatsParams("NULL");
-        params.put("row_count", String.valueOf(count));
-        params.put("ndv", String.valueOf(ndv));
-        params.put("null_count", String.valueOf(numNulls));
-        params.put("min", StatisticsUtil.quote(min));
-        params.put("max", StatisticsUtil.quote(max));
-        params.put("data_size", String.valueOf(dataSize));
-        StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
-        String sql = stringSubstitutor.replace(ANALYZE_PARTITION_COLUMN_TEMPLATE);
-        runQuery(sql);
-    }
-
-    // Collect the spark analyzed column stats through HMS metadata.
-    private void getHmsColumnStats() throws Exception {
-        TableStatsMeta tableStatsStatus = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId());
-        long count = tableStatsStatus == null ? table.estimatedRowCount() : tableStatsStatus.rowCount;
-
-        Map<String, String> params = buildStatsParams("NULL");
-        Map<StatsType, String> statsParams = new HashMap<>();
-        statsParams.put(StatsType.NDV, "ndv");
-        statsParams.put(StatsType.NUM_NULLS, "null_count");
-        statsParams.put(StatsType.MIN_VALUE, "min");
-        statsParams.put(StatsType.MAX_VALUE, "max");
-        statsParams.put(StatsType.AVG_SIZE, "avg_len");
-
-        if (table.fillColumnStatistics(info.colName, statsParams, params)) {
-            throw new AnalysisException("some column stats not available");
-        }
-
-        long dataSize = Long.valueOf(params.get("avg_len")) * count;
-        params.put("row_count", String.valueOf(count));
-        params.put("data_size", String.valueOf(dataSize));
-
-        StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
-        String sql = stringSubstitutor.replace(ANALYZE_PARTITION_COLUMN_TEMPLATE);
-        runQuery(sql);
-    }
-
-    private String updateMinValue(String currentMin, String value) {
-        if (currentMin == null) {
-            return value;
-        }
-        if (col.getType().isFixedPointType()) {
-            if (Long.parseLong(value) < Long.parseLong(currentMin)) {
-                return value;
-            } else {
-                return currentMin;
-            }
-        }
-        if (col.getType().isFloatingPointType() || col.getType().isDecimalV2() || col.getType().isDecimalV3()) {
-            if (Double.parseDouble(value) < Double.parseDouble(currentMin)) {
-                return value;
-            } else {
-                return currentMin;
-            }
-        }
-        return value.compareTo(currentMin) < 0 ? value : currentMin;
-    }
-
-    private String updateMaxValue(String currentMax, String value) {
-        if (currentMax == null) {
-            return value;
-        }
-        if (col.getType().isFixedPointType()) {
-            if (Long.parseLong(value) > Long.parseLong(currentMax)) {
-                return value;
-            } else {
-                return currentMax;
-            }
-        }
-        if (col.getType().isFloatingPointType() || col.getType().isDecimalV2() || col.getType().isDecimalV3()) {
-            if (Double.parseDouble(value) > Double.parseDouble(currentMax)) {
-                return value;
-            } else {
-                return currentMax;
-            }
-        }
-        return value.compareTo(currentMax) > 0 ? value : currentMax;
value : currentMax; - } - - private Map<String, String> buildStatsParams(String partId) { + protected Map<String, String> buildStatsParams(String partId) { Map<String, String> commonParams = new HashMap<>(); String id = StatisticsUtil.constructId(tbl.getId(), -1); if (partId == null) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java index fd0a4c82538..9e8be622824 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java @@ -17,12 +17,10 @@ package org.apache.doris.statistics; -import org.apache.doris.catalog.Column; import org.apache.doris.catalog.Env; +import org.apache.doris.catalog.external.ExternalTable; import org.apache.doris.catalog.external.HMSExternalTable; import org.apache.doris.common.AnalysisException; -import org.apache.doris.common.FeConstants; -import org.apache.doris.common.Pair; import org.apache.doris.datasource.hive.HiveMetaStoreCache; import org.apache.doris.external.hive.util.HiveUtil; import org.apache.doris.statistics.util.StatisticsUtil; @@ -32,64 +30,36 @@ import org.apache.commons.text.StringSubstitutor; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import java.util.Collections; import java.util.HashMap; -import java.util.List; import java.util.Map; -import java.util.Random; import java.util.Set; -public class HMSAnalysisTask extends BaseAnalysisTask { +public class HMSAnalysisTask extends ExternalAnalysisTask { private static final Logger LOG = LogManager.getLogger(HMSAnalysisTask.class); + private HMSExternalTable hmsExternalTable; - private static final String ANALYZE_TABLE_COUNT_TEMPLATE = "SELECT ROUND(COUNT(1) * ${scaleFactor}) as rowCount " - + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleHints}"; - private boolean isTableLevelTask; - private boolean isPartitionOnly; - private HMSExternalTable table; - + // for test public HMSAnalysisTask() { } public HMSAnalysisTask(AnalysisInfo info) { super(info); - isTableLevelTask = info.externalTableLevelTask; - isPartitionOnly = info.partitionOnly; - table = (HMSExternalTable) tbl; + hmsExternalTable = (HMSExternalTable) tbl; } - public void doExecute() throws Exception { - if (isTableLevelTask) { - getTableStats(); - } else { - getTableColumnStats(); - } + private boolean isPartitionColumn() { + return hmsExternalTable.getPartitionColumns().stream().anyMatch(c -> c.getName().equals(col.getName())); } // For test protected void setTable(HMSExternalTable table) { - this.table = table; + setTable((ExternalTable) table); + this.hmsExternalTable = table; } - /** - * Get table row count - */ - private void getTableStats() throws Exception { - Map<String, String> params = buildStatsParams(null); - List<ResultRow> columnResult = - StatisticsUtil.execStatisticQuery(new StringSubstitutor(params) - .replace(ANALYZE_TABLE_COUNT_TEMPLATE)); - String rowCount = columnResult.get(0).get(0); - Env.getCurrentEnv().getAnalysisManager() - .updateTableStatsStatus( - new TableStatsMeta(Long.parseLong(rowCount), info, tbl)); - job.rowCountDone(this); - } - /** - * Get column statistics and insert the result to __internal_schema.column_statistics - */ - protected void getTableColumnStats() throws Exception { + @Override + protected void getOrdinaryColumnStats() throws Exception { if (!info.usingSqlForPartitionColumn) { try { if (isPartitionColumn()) { @@ -102,77 +72,17 
                         + "fallback to normal collection",
                         isPartitionColumn() ? "partition " : "", col.getName(), e);
                 /* retry using sql way! */
-                getOrdinaryColumnStats();
+                super.getOrdinaryColumnStats();
             }
         } else {
-            getOrdinaryColumnStats();
+            super.getOrdinaryColumnStats();
         }
     }
 
-    private boolean isPartitionColumn() {
-        return table.getPartitionColumns().stream().anyMatch(c -> c.getName().equals(col.getName()));
-    }
-
-    // Get ordinary column stats. Ordinary column means not partition column.
-    private void getOrdinaryColumnStats() throws Exception {
-        StringBuilder sb = new StringBuilder();
-        Map<String, String> params = buildStatsParams("NULL");
-        params.put("min", getMinFunction());
-        params.put("max", getMaxFunction());
-        params.put("dataSizeFunction", getDataSizeFunction(col, false));
-        Pair<Double, Long> sampleInfo = getSampleInfo();
-        params.put("scaleFactor", String.valueOf(sampleInfo.first));
-        StringSubstitutor stringSubstitutor;
-        if (tableSample == null) {
-            // Do full analyze
-            LOG.debug("Will do full collection for column {}", col.getName());
-            sb.append(COLLECT_COL_STATISTICS);
-        } else {
-            // Do sample analyze
-            LOG.debug("Will do sample collection for column {}", col.getName());
-            boolean limitFlag = false;
-            boolean bucketFlag = false;
-            // If sample size is too large, use limit to control the sample size.
-            if (needLimit(sampleInfo.second, sampleInfo.first)) {
-                limitFlag = true;
-                long columnSize = 0;
-                for (Column column : table.getFullSchema()) {
-                    columnSize += column.getDataType().getSlotSize();
-                }
-                double targetRows = (double) sampleInfo.second / columnSize;
-                // Estimate the new scaleFactor based on the schema.
-                if (targetRows > StatisticsUtil.getHugeTableSampleRows()) {
-                    params.put("limit", "limit " + StatisticsUtil.getHugeTableSampleRows());
-                    params.put("scaleFactor",
-                            String.valueOf(sampleInfo.first * targetRows / StatisticsUtil.getHugeTableSampleRows()));
-                }
-            }
-            // Single distribution column is not fit for DUJ1 estimator, use linear estimator.
-            Set<String> distributionColumns = tbl.getDistributionColumnNames();
-            if (distributionColumns.size() == 1 && distributionColumns.contains(col.getName().toLowerCase())) {
-                bucketFlag = true;
-                sb.append(LINEAR_ANALYZE_TEMPLATE);
-                params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})");
-                params.put("rowCount", "ROUND(count(1) * ${scaleFactor})");
-            } else {
-                sb.append(DUJ1_ANALYZE_TEMPLATE);
-                params.put("dataSizeFunction", getDataSizeFunction(col, true));
-                params.put("ndvFunction", getNdvFunction("ROUND(SUM(t1.count) * ${scaleFactor})"));
-                params.put("rowCount", "ROUND(SUM(t1.count) * ${scaleFactor})");
-            }
-            LOG.info("Sample for column [{}]. Scale factor [{}], "
-                    + "limited [{}], is distribute column [{}]",
-                    col.getName(), params.get("scaleFactor"), limitFlag, bucketFlag);
-        }
-        stringSubstitutor = new StringSubstitutor(params);
-        String sql = stringSubstitutor.replace(sb.toString());
-        runQuery(sql);
-    }
-
     // Collect the partition column stats through HMS metadata.
     // Get all the partition values and calculate the stats based on the values.
     private void getPartitionColumnStats() throws Exception {
-        Set<String> partitionNames = table.getPartitionNames();
+        Set<String> partitionNames = hmsExternalTable.getPartitionNames();
         Set<String> ndvPartValues = Sets.newHashSet();
         long numNulls = 0;
         long dataSize = 0;
         String min = null;
         String max = null;
@@ -198,8 +108,9 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
             }
         }
         // Estimate the row count. This value is inaccurate if the table stats is empty.
-        TableStatsMeta tableStatsStatus = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId());
-        long count = tableStatsStatus == null ? table.estimatedRowCount() : tableStatsStatus.rowCount;
+        TableStatsMeta tableStatsStatus = Env.getCurrentEnv().getAnalysisManager()
+                .findTableStatsStatus(hmsExternalTable.getId());
+        long count = tableStatsStatus == null ? hmsExternalTable.estimatedRowCount() : tableStatsStatus.rowCount;
         dataSize = dataSize * count / partitionNames.size();
         numNulls = numNulls * count / partitionNames.size();
         int ndv = ndvPartValues.size();
@@ -218,8 +129,9 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
 
     // Collect the spark analyzed column stats through HMS metadata.
     private void getHmsColumnStats() throws Exception {
-        TableStatsMeta tableStatsStatus = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId());
-        long count = tableStatsStatus == null ? table.estimatedRowCount() : tableStatsStatus.rowCount;
+        TableStatsMeta tableStatsStatus = Env.getCurrentEnv().getAnalysisManager()
+                .findTableStatsStatus(hmsExternalTable.getId());
+        long count = tableStatsStatus == null ? hmsExternalTable.estimatedRowCount() : tableStatsStatus.rowCount;
 
         Map<String, String> params = buildStatsParams("NULL");
         Map<StatsType, String> statsParams = new HashMap<>();
@@ -229,7 +141,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
         statsParams.put(StatsType.MAX_VALUE, "max");
         statsParams.put(StatsType.AVG_SIZE, "avg_len");
 
-        if (table.fillColumnStatistics(info.colName, statsParams, params)) {
+        if (hmsExternalTable.fillColumnStatistics(info.colName, statsParams, params)) {
             throw new AnalysisException("some column stats not available");
         }
 
@@ -283,126 +195,4 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
         }
         return value.compareTo(currentMax) > 0 ? value : currentMax;
     }
-
-    private Map<String, String> buildStatsParams(String partId) {
-        Map<String, String> commonParams = new HashMap<>();
-        String id = StatisticsUtil.constructId(tbl.getId(), -1);
-        if (partId == null) {
-            commonParams.put("partId", "NULL");
-        } else {
-            id = StatisticsUtil.constructId(id, partId);
-            commonParams.put("partId", "\'" + partId + "\'");
-        }
-        commonParams.put("internalDB", FeConstants.INTERNAL_DB_NAME);
-        commonParams.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME);
-        commonParams.put("id", id);
-        commonParams.put("catalogId", String.valueOf(catalog.getId()));
-        commonParams.put("dbId", String.valueOf(db.getId()));
-        commonParams.put("tblId", String.valueOf(tbl.getId()));
-        commonParams.put("indexId", "-1");
-        commonParams.put("idxId", "-1");
-        commonParams.put("colName", info.colName);
-        commonParams.put("colId", info.colName);
-        commonParams.put("catalogName", catalog.getName());
-        commonParams.put("dbName", db.getFullName());
-        commonParams.put("tblName", tbl.getName());
-        commonParams.put("sampleHints", getSampleHint());
-        commonParams.put("limit", "");
-        commonParams.put("scaleFactor", "1");
-        if (col != null) {
-            commonParams.put("type", col.getType().toString());
-        }
-        commonParams.put("lastAnalyzeTimeInMs", String.valueOf(System.currentTimeMillis()));
-        return commonParams;
-    }
-
-    protected String getSampleHint() {
-        if (tableSample == null) {
-            return "";
-        }
-        if (tableSample.isPercent()) {
-            return String.format("TABLESAMPLE(%d PERCENT)", tableSample.getSampleValue());
-        } else {
-            return String.format("TABLESAMPLE(%d ROWS)", tableSample.getSampleValue());
-        }
-    }
-
-    /**
-     * Get the pair of sample scale factor and the file size going to sample.
-     * While analyzing, the result of count, null count and data size need to
-     * multiply this scale factor to get more accurate result.
-     * @return Pair of sample scale factor and the file size going to sample.
-     */
-    protected Pair<Double, Long> getSampleInfo() {
-        if (tableSample == null) {
-            return Pair.of(1.0, 0L);
-        }
-        long target;
-        // Get list of all files' size in this HMS table.
-        List<Long> chunkSizes = table.getChunkSizes();
-        Collections.shuffle(chunkSizes, new Random(tableSample.getSeek()));
-        long total = 0;
-        // Calculate the total size of this HMS table.
-        for (long size : chunkSizes) {
-            total += size;
-        }
-        if (total == 0) {
-            return Pair.of(1.0, 0L);
-        }
-        // Calculate the sample target size for percent and rows sample.
-        if (tableSample.isPercent()) {
-            target = total * tableSample.getSampleValue() / 100;
-        } else {
-            int columnSize = 0;
-            for (Column column : table.getFullSchema()) {
-                columnSize += column.getDataType().getSlotSize();
-            }
-            target = columnSize * tableSample.getSampleValue();
-        }
-        // Calculate the actual sample size (cumulate).
-        long cumulate = 0;
-        for (long size : chunkSizes) {
-            cumulate += size;
-            if (cumulate >= target) {
-                break;
-            }
-        }
-        return Pair.of(Math.max(((double) total) / cumulate, 1), cumulate);
-    }
-
-    @Override
-    protected void afterExecution() {
-        // Table level task doesn't need to sync any value to sync stats, it stores the value in metadata.
-        // Partition only task doesn't need to refresh cached.
-        if (isTableLevelTask || isPartitionOnly) {
-            return;
-        }
-        Env.getCurrentEnv().getStatisticsCache().syncLoadColStats(tbl.getId(), -1, col.getName());
-    }
-
-    /**
-     * If the size to sample is larger than LIMIT_SIZE (1GB)
-     * and is much larger (1.2*) than the size user want to sample,
-     * use limit to control the total sample size.
-     * @param sizeToRead The file size to sample.
-     * @param factor sizeToRead * factor = Table total size.
-     * @return True if need to limit.
-     */
-    protected boolean needLimit(long sizeToRead, double factor) {
-        long total = (long) (sizeToRead * factor);
-        long target;
-        if (tableSample.isPercent()) {
-            target = total * tableSample.getSampleValue() / 100;
-        } else {
-            int columnSize = 0;
-            for (Column column : table.getFullSchema()) {
-                columnSize += column.getDataType().getSlotSize();
-            }
-            target = columnSize * tableSample.getSampleValue();
-        }
-        if (sizeToRead > LIMIT_SIZE && sizeToRead > target * LIMIT_FACTOR) {
-            return true;
-        }
-        return false;
-    }
 }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
index 5c8aec3fbf6..6176ec13bd6 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
@@ -84,10 +84,12 @@ import org.apache.iceberg.FileScanTask;
 import org.apache.iceberg.PartitionSpec;
 import org.apache.iceberg.Table;
 import org.apache.iceberg.TableScan;
+import org.apache.iceberg.io.CloseableIterable;
 import org.apache.iceberg.types.Types;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 
+import java.io.IOException;
 import java.net.InetSocketAddress;
 import java.nio.charset.StandardCharsets;
 import java.text.SimpleDateFormat;
@@ -735,8 +737,12 @@ public class StatisticsUtil {
         columnStatisticBuilder.setDataSize(0);
         columnStatisticBuilder.setAvgSizeByte(0);
         columnStatisticBuilder.setNumNulls(0);
-        for (FileScanTask task : tableScan.planFiles()) {
-            processDataFile(task.file(), task.spec(), colName, columnStatisticBuilder);
+        try (CloseableIterable<FileScanTask> fileScanTasks = tableScan.planFiles()) {
+            for (FileScanTask task : fileScanTasks) {
+                processDataFile(task.file(), task.spec(), colName, columnStatisticBuilder);
+            }
+        } catch (IOException e) {
+            LOG.warn("Error to close FileScanTask.", e);
         }
         if (columnStatisticBuilder.getCount() > 0) {
             columnStatisticBuilder.setAvgSizeByte(columnStatisticBuilder.getDataSize()
diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
index fb0a3b3c2ca..e1016864525 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
@@ -252,7 +252,7 @@ public class HMSAnalysisTaskTest {
         analysisInfoBuilder.setUsingSqlForPartitionColumn(true);
         task.info = analysisInfoBuilder.build();
 
-        task.getTableColumnStats();
+        task.getOrdinaryColumnStats();
     }
 
@@ -309,6 +309,6 @@ public class HMSAnalysisTaskTest {
         analysisInfoBuilder.setUsingSqlForPartitionColumn(false);
         task.info = analysisInfoBuilder.build();
 
-        task.getTableColumnStats();
+        task.getOrdinaryColumnStats();
     }
 }
diff --git a/regression-test/conf/regression-conf.groovy b/regression-test/conf/regression-conf.groovy
index 39c41f7c11a..6d17bd032fd 100644
--- a/regression-test/conf/regression-conf.groovy
+++ b/regression-test/conf/regression-conf.groovy
@@ -191,6 +191,7 @@ extArrowFlightSqlPassword= ""
 
 // iceberg rest catalog config
 iceberg_rest_uri_port=18181
+iceberg_minio_port=19001
 
 // If the failure suite num exceeds this config
 // all following suite will be skipped to fast quit the run.
diff --git a/regression-test/data/external_table_p0/iceberg/test_iceberg_statistics.out b/regression-test/data/external_table_p0/iceberg/test_iceberg_statistics.out
new file mode 100644
index 00000000000..c094d171479
--- /dev/null
+++ b/regression-test/data/external_table_p0/iceberg/test_iceberg_statistics.out
@@ -0,0 +1,39 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !s1 --
+city 1000 4 0 Beijing Shanghai 6973
+col_binary 1000 867 0 0 1111101100100001001 15356
+col_boolean 1000 2 0 0 1 1000
+col_byte 1000 251 0 -128 127 4000
+col_char 1000 963 0 ! zy@notj#fkedb($ 9348
+col_date 1000 3 0 1969-09-21 2969-02-03 4000
+col_decimal 1000 1006 0 4.028284 9999.512216 8000
+col_double 1000 990 0 0.005217837593576302 9.996285421163707 8000
+col_float 1000 995 0 0.013126845 9.99709 4000
+col_integer 1000 999 0 -21468189 2108484 4000
+col_long 1000 996 0 -92193877774291102 92127291905311066 8000
+col_short 1000 985 0 -32554 32525 4000
+col_string 1000 992 0 0 zx70Jyeb6TfQ1YUaIGC 10714
+col_timestamp 1000 4 0 1970-01-01 08:00:01.000001 1970-01-04 08:00:01.000001 8000
+col_timestamp_ntz 1000 4 0 2017-12-01 10:12:55.038194 2017-12-04 10:12:55.038194 8000
+col_varchar 1000 988 0 0 zvnZ6bBxh 10764
+id 1000 1001 0 -99567408 99854631 8000
+
+-- !s2 --
+city 1000 4 0 Beijing Shanghai 6973
+col_binary 1000 867 0 0 1111101100100001001 15356
+col_boolean 1000 2 0 0 1 1000
+col_byte 1000 251 0 -128 127 4000
+col_char 1000 973 0 ! zy@notj#fkedb($ 9324
+col_date 1000 3 0 1969-09-21 2969-02-03 4000
+col_decimal 1000 1006 0 4.028284 9999.512216 8000
+col_double 1000 990 0 0.005217837593576302 9.996285421163707 8000
+col_float 1000 995 0 0.013126845 9.99709 4000
+col_integer 1000 999 0 -21468189 2108484 4000
+col_long 1000 996 0 -92193877774291102 92127291905311066 8000
+col_short 1000 985 0 -32554 32525 4000
+col_string 1000 992 0 0 zx70Jyeb6TfQ1YUaIGC 10714
+col_timestamp 1000 4 0 1970-01-01 08:00:01.000001 1970-01-04 08:00:01.000001 8000
+col_timestamp_ntz 1000 4 0 2017-12-01 10:12:55.038194 2017-12-04 10:12:55.038194 8000
+col_varchar 1000 988 0 0 zvnZ6bBxh 10764
+id 1000 1001 0 -99567408 99854631 8000
+
diff --git a/regression-test/data/external_table_p0/paimon/test_paimon_statistics.out b/regression-test/data/external_table_p0/paimon/test_paimon_statistics.out
new file mode 100644
index 00000000000..0f9f20d4782
--- /dev/null
+++ b/regression-test/data/external_table_p0/paimon/test_paimon_statistics.out
@@ -0,0 +1,21 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !s1 --
+c1 2 2 0 1 10 2
+c10 2 2 0 10.1 100.1 16
+c11 2 2 0 11.10 110.10 16
+c12 2 2 0 2020-02-02 2020-03-02 8
+c13 2 2 0 130str 13str 11
+c14 2 2 0 140varchar 14varchar 19
+c15 2 2 0 a b 2
+c16 2 2 0 0 1 2
+c17 2 2 0 aaaa bbbb 8
+c18 2 2 0 2023-08-13 09:32:38.530000 2023-08-14 08:32:52.821000 16
+c2 2 2 0 2 20 2
+c3 2 2 0 3 30 4
+c4 2 2 0 4 40 4
+c5 2 2 0 5 50 8
+c6 2 2 0 6 60 8
+c7 2 2 0 7 70 16
+c8 2 2 0 8 80 16
+c9 2 2 0 9.1 90.1 8
+
diff --git a/regression-test/data/external_table_p2/hive/test_hive_hudi_statistics.out b/regression-test/data/external_table_p2/hive/test_hive_hudi_statistics.out
new file mode 100644
index 00000000000..66a36a81afc
--- /dev/null
+++ b/regression-test/data/external_table_p2/hive/test_hive_hudi_statistics.out
@@ -0,0 +1,16 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !s1 --
+_hoodie_commit_seqno 4 4 0 20230605145009209_0_1 20230801201335031_1_1 84
+_hoodie_commit_time 4 3 0 20230605145009209 20230801201335031 68
+_hoodie_file_name 4 4 0 65ffc5d9-397a-456e-a735-30f3ad37466f-0 e33d645c-6e2f-41f3-b8d6-f658771bd460-0_1-83-220_20230605145403388.parquet 221
+_hoodie_partition_path 4 3 0 partitionId=2011-11-11/versionId=v_1 partitionId=2021-02-01/versionId=v_4 144
+_hoodie_record_key 4 3 0 rowId:row_1 rowId:row_4 44
+inttolong 4 2 0 0 1 16
+longtoint 4 3 0 1000000 1000004 32
+name 4 3 0 ashin john 15
+partitionid 4 3 0 2011-11-11 2021-02-01 40
+precomb 4 3 0 0 4 32
+rowid 4 3 0 row_1 row_4 20
+tobedeletedstr 4 3 0 toBeDel0 toBeDel4 32
+versionid 4 3 0 v_0 v_4 12
+
diff --git a/regression-test/pipeline/p0/conf/regression-conf.groovy b/regression-test/pipeline/p0/conf/regression-conf.groovy
index be70ab0f730..b99e21c4e7c 100644
--- a/regression-test/pipeline/p0/conf/regression-conf.groovy
+++ b/regression-test/pipeline/p0/conf/regression-conf.groovy
@@ -96,6 +96,7 @@ kafka_port=19193
 
 // iceberg test config
 iceberg_rest_uri_port=18181
+iceberg_minio_port=19001
 
 enableEsTest=false
 es_6_port=19200
diff --git a/regression-test/pipeline/tpch/tpch-sf100/conf/regression-conf.groovy b/regression-test/pipeline/tpch/tpch-sf100/conf/regression-conf.groovy
index 364a7103fe8..5234ccc4241 100644
--- a/regression-test/pipeline/tpch/tpch-sf100/conf/regression-conf.groovy
+++ b/regression-test/pipeline/tpch/tpch-sf100/conf/regression-conf.groovy
@@ -94,6 +94,7 @@ kafka_port=19193
 
 // iceberg test config
 iceberg_rest_uri_port=18181
+iceberg_minio_port=19001
 
 enableEsTest=false
 es_6_port=19200
diff --git a/regression-test/plugins/plugins_get_ids_from_proc.groovy b/regression-test/plugins/plugins_get_ids_from_proc.groovy
new file mode 100644
index 00000000000..74a4d4d2010
--- /dev/null
+++ b/regression-test/plugins/plugins_get_ids_from_proc.groovy
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import org.apache.doris.regression.suite.Suite
+
+Suite.metaClass.get_catalog_id = {String catalog_name /* param */ ->
+    String catalog_id;
+    def catalogs = sql """show proc '/catalogs'"""
+    for (catalog in catalogs) {
+        if (catalog[1].equals(catalog_name)) {
+            catalog_id = catalog[0]
+            break
+        }
+    }
+    log.info("get catalogid: " + catalog_id)
+    return catalog_id
+}
+
+
+Suite.metaClass.get_database_id = {String catalog_name, String db_name /* param */ ->
+    String database_id;
+    def catalog_id = get_catalog_id(catalog_name)
+    def dbs = sql """show proc '/catalogs/${catalog_id}'"""
+    for (db in dbs) {
+        if (db[1].equals(db_name)) {
+            database_id = db[0]
+            break
+        }
+    }
+    log.info("get database_id: " + database_id)
+    return database_id
+}
+
+
+Suite.metaClass.get_table_id = {String catalog_name, String db_name, String tb_name /* param */ ->
+    String table_id;
+    def catalog_id = get_catalog_id(catalog_name)
+    def database_id = get_database_id(catalog_name, db_name)
+    def tbs = sql """show proc '/catalogs/${catalog_id}/${database_id}'"""
+    for (tb in tbs) {
+        if (tb[1].equals(tb_name)) {
+            table_id = tb[0]
+            break
+        }
+    }
+    log.info("get table_id: " + table_id)
+    return table_id
+}
diff --git a/regression-test/suites/external_table_p0/iceberg/test_iceberg_statistics.groovy b/regression-test/suites/external_table_p0/iceberg/test_iceberg_statistics.groovy
new file mode 100644
index 00000000000..24b27eb70b4
--- /dev/null
+++ b/regression-test/suites/external_table_p0/iceberg/test_iceberg_statistics.groovy
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+ +suite("test_iceberg_statistics", "p0,external,doris,external_docker,external_docker_doris") { + String enabled = context.config.otherConfigs.get("enableIcebergTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + try { + String rest_port = context.config.otherConfigs.get("iceberg_rest_uri_port") + String minio_port = context.config.otherConfigs.get("iceberg_minio_port") + String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") + String catalog_name = "test_iceberg_rest_catalog" + String db_name = "format_v2" + + sql """drop catalog if exists ${catalog_name}""" + sql """CREATE CATALOG ${catalog_name} PROPERTIES ( + 'type'='iceberg', + 'iceberg.catalog.type'='rest', + 'uri' = 'http://${externalEnvIp}:${rest_port}', + "s3.access_key" = "admin", + "s3.secret_key" = "password", + "s3.endpoint" = "http://${externalEnvIp}:${minio_port}", + "s3.region" = "us-east-1" + );""" + + def table_id_mor = get_table_id(catalog_name, db_name, "sample_mor_parquet") + def table_id_cow = get_table_id(catalog_name, db_name, "sample_cow_parquet") + + // analyze + sql """use `${catalog_name}`.`${db_name}`""" + sql """analyze table sample_mor_parquet with sync""" + sql """analyze table sample_cow_parquet with sync""" + + // select + def s1 = """select col_id,count,ndv,null_count,min,max,data_size_in_bytes from internal.__internal_schema.column_statistics where tbl_id = ${table_id_mor} order by id;""" + def s2 = """select col_id,count,ndv,null_count,min,max,data_size_in_bytes from internal.__internal_schema.column_statistics where tbl_id = ${table_id_cow} order by id;""" + + qt_s1 s1 + qt_s2 s2 + } finally { + } + } +} + diff --git a/regression-test/suites/external_table_p0/paimon/test_paimon_catalog.groovy b/regression-test/suites/external_table_p0/paimon/test_paimon_catalog.groovy index 87ea14ad2fd..ce8c9b5e849 100644 --- a/regression-test/suites/external_table_p0/paimon/test_paimon_catalog.groovy +++ b/regression-test/suites/external_table_p0/paimon/test_paimon_catalog.groovy @@ -52,6 +52,9 @@ suite("test_paimon_catalog", "p0,external,doris,external_docker,external_docker_ ); """ + sql """drop catalog ${file_ctl_name}"""; + sql """drop catalog ${hms_ctl_name}"""; + String enabled = context.config.otherConfigs.get("enablePaimonTest") if (enabled != null && enabled.equalsIgnoreCase("true")) { def all = """select * from all_table order by c1;""" diff --git a/regression-test/suites/external_table_p0/paimon/test_paimon_statistics.groovy b/regression-test/suites/external_table_p0/paimon/test_paimon_statistics.groovy new file mode 100644 index 00000000000..c75e7b797d9 --- /dev/null +++ b/regression-test/suites/external_table_p0/paimon/test_paimon_statistics.groovy @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_paimon_statistics", "p0,external,doris,external_docker,external_docker_doris") { + String enabled = context.config.otherConfigs.get("enablePaimonTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + try { + String hdfs_port = context.config.otherConfigs.get("hdfs_port") + String catalog_name = "paimon1" + String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") + + sql """drop catalog if exists ${catalog_name}""" + sql """create catalog if not exists ${catalog_name} properties ( + "type" = "paimon", + "paimon.catalog.type"="filesystem", + "warehouse" = "hdfs://${externalEnvIp}:${hdfs_port}/user/doris/paimon1" + );""" + + def table_id = get_table_id(catalog_name, "db1", "all_table") + + // analyze + sql """use `${catalog_name}`.`db1`""" + sql """analyze table all_table with sync""" + + // select + def s1 = """select col_id,count,ndv,null_count,min,max,data_size_in_bytes from internal.__internal_schema.column_statistics where tbl_id = ${table_id} order by id;""" + + qt_s1 s1 + } finally { + } + } +} + diff --git a/regression-test/suites/external_table_p0/test_catalog_ddl.groovy b/regression-test/suites/external_table_p0/test_catalog_ddl.groovy index b236567e8bd..a9a67f51853 100644 --- a/regression-test/suites/external_table_p0/test_catalog_ddl.groovy +++ b/regression-test/suites/external_table_p0/test_catalog_ddl.groovy @@ -44,4 +44,6 @@ suite("test_catalog_ddl", "p0,external,external_docker") { result = sql """show create catalog ${catalog1};""" assertEquals(result.size(), 1) assertTrue(result[0][1].contains("COMMENT \"alter_comment\"")) + + sql """drop catalog ${catalog1}""" } diff --git a/regression-test/suites/external_table_p2/hive/test_hive_hudi_statistics.groovy b/regression-test/suites/external_table_p2/hive/test_hive_hudi_statistics.groovy new file mode 100644 index 00000000000..55e5037de45 --- /dev/null +++ b/regression-test/suites/external_table_p2/hive/test_hive_hudi_statistics.groovy @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_hive_hudi_statistics", "p2,external,hive,hudi") { + String enabled = context.config.otherConfigs.get("enableExternalHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost") + String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort") + String catalog_name = "test_hive_hudi_statistics" + String db_name = "hudi_catalog" + + sql """drop catalog if exists ${catalog_name};""" + sql """ + create catalog if not exists ${catalog_name} properties ( + 'hadoop.username'='hadoop', + 'type'='hms', + 'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}' + ); + """ + + def table_id = get_table_id(catalog_name, db_name, "partitioned_mor_rt") + + // analyze + sql """use `${catalog_name}`.`${db_name}`""" + sql """analyze table partitioned_mor_rt with sync""" + + // select + def s1 = """select col_id,count,ndv,null_count,min,max,data_size_in_bytes from internal.__internal_schema.column_statistics where tbl_id = ${table_id} order by id;""" + + qt_s1 s1 + + } +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org