This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 6c2da4c30404a324b53ba97d8e98df1160211a89
Author: Jibing-Li <[email protected]>
AuthorDate: Wed Jul 31 18:05:45 2024 +0800

    [improvement](statistics)Analyze all columns when partition first loaded. 
(#38540)
    
    When a partition loads data for the first time, all columns in the table
    will be analyzed in the next auto analyze iteration.
---
 .../doris/statistics/StatisticsAutoCollector.java  |  11 +-
 .../apache/doris/statistics/TableStatsMeta.java    |   8 --
 .../doris/statistics/util/StatisticsUtil.java      |   2 +-
 .../suites/statistics/analyze_stats.groovy         | 133 +++++++++++++++------
 .../analyze_stats_partition_first_load.groovy      |   5 +-
 5 files changed, 109 insertions(+), 50 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java
 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java
index e0b046befc9..2ca0ab1c265 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java
@@ -143,7 +143,7 @@ public class StatisticsAutoCollector extends MasterDaemon {
     protected void processOneJob(TableIf table, Set<Pair<String, String>> 
columns,
             JobPriority priority) throws DdlException {
         // appendMvColumn(table, columns);
-        appendPartitionColumns(table, columns);
+        appendAllColumns(table, columns);
         columns = columns.stream().filter(c -> 
StatisticsUtil.needAnalyzeColumn(table, c)).collect(Collectors.toSet());
         if (columns.isEmpty()) {
             return;
@@ -161,7 +161,8 @@ public class StatisticsAutoCollector extends MasterDaemon {
         }
     }
 
-    protected void appendPartitionColumns(TableIf table, Set<Pair<String, 
String>> columns) throws DdlException {
+    // If partition changed (partition first loaded, partition dropped and so 
on), need re-analyze all columns.
+    protected void appendAllColumns(TableIf table, Set<Pair<String, String>> 
columns) throws DdlException {
         if (!(table instanceof OlapTable)) {
             return;
         }
@@ -169,7 +170,11 @@ public class StatisticsAutoCollector extends MasterDaemon {
         TableStatsMeta tableStatsStatus = 
manager.findTableStatsStatus(table.getId());
         if (tableStatsStatus != null && 
tableStatsStatus.partitionChanged.get()) {
             OlapTable olapTable = (OlapTable) table;
-            
columns.addAll(olapTable.getColumnIndexPairs(olapTable.getPartitionColumnNames()));
+            Set<String> allColumnPairs = 
olapTable.getSchemaAllIndexes(false).stream()
+                    .filter(c -> 
!StatisticsUtil.isUnsupportedType(c.getType()))
+                    .map(Column::getName)
+                    .collect(Collectors.toSet());
+            columns.addAll(olapTable.getColumnIndexPairs(allColumnPairs));
         }
     }
 
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java
index 4c984394f3c..1048708958b 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java
@@ -19,7 +19,6 @@ package org.apache.doris.statistics;
 
 import org.apache.doris.catalog.Column;
 import org.apache.doris.catalog.OlapTable;
-import org.apache.doris.catalog.PartitionInfo;
 import org.apache.doris.catalog.TableIf;
 import org.apache.doris.common.Pair;
 import org.apache.doris.common.io.Text;
@@ -166,13 +165,6 @@ public class TableStatsMeta implements Writable, 
GsonPostProcessable {
                             
.map(Column::getName).collect(Collectors.toSet())))) {
                 partitionChanged.set(false);
                 userInjected = false;
-            } else if (tableIf instanceof OlapTable) {
-                PartitionInfo partitionInfo = ((OlapTable) 
tableIf).getPartitionInfo();
-                if (partitionInfo != null && analyzedJob.jobColumns
-                        
.containsAll(tableIf.getColumnIndexPairs(partitionInfo.getPartitionColumns().stream()
-                            
.map(Column::getName).collect(Collectors.toSet())))) {
-                    partitionChanged.set(false);
-                }
             }
         }
     }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
index 5947fcc0e60..45b7454adff 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
@@ -1011,7 +1011,7 @@ public class StatisticsUtil {
         if (table instanceof OlapTable) {
             OlapTable olapTable = (OlapTable) table;
             // 0. Check new partition first time loaded flag.
-            if (olapTable.isPartitionColumn(column.second) && 
tableStatsStatus.partitionChanged.get()) {
+            if (tableStatsStatus.partitionChanged.get()) {
                 return true;
             }
             // 1. Check row count.
diff --git a/regression-test/suites/statistics/analyze_stats.groovy 
b/regression-test/suites/statistics/analyze_stats.groovy
index 852dbeb1321..37f59406d25 100644
--- a/regression-test/suites/statistics/analyze_stats.groovy
+++ b/regression-test/suites/statistics/analyze_stats.groovy
@@ -165,6 +165,73 @@ suite("test_analyze") {
         exception = e
     }
 
+    // Test partition load data for the first time.
+    sql """
+     CREATE TABLE `partition_test` (
+      `id` INT NOT NULL,
+      `name` VARCHAR(25) NOT NULL,
+      `comment` VARCHAR(152) NULL
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`id`)
+      COMMENT 'OLAP'
+      PARTITION BY RANGE(`id`)
+      (PARTITION p1 VALUES [("0"), ("100")),
+       PARTITION p2 VALUES [("100"), ("200")),
+       PARTITION p3 VALUES [("200"), ("300")))
+      DISTRIBUTED BY HASH(`id`) BUCKETS 1
+      PROPERTIES (
+       "replication_num" = "1");
+     """
+
+    sql """analyze table partition_test with sync"""
+    sql """insert into partition_test values (1, '1', '1')"""
+    def partition_result = sql """show table stats partition_test"""
+    assertEquals(partition_result[0][6], "true")
+    assertEquals(partition_result[0][0], "1")
+    sql """analyze table partition_test with sync"""
+    partition_result = sql """show table stats partition_test"""
+    assertEquals(partition_result[0][6], "false")
+    sql """insert into partition_test values (101, '1', '1')"""
+    partition_result = sql """show table stats partition_test"""
+    assertEquals(partition_result[0][6], "true")
+    sql """analyze table partition_test(id) with sync"""
+    partition_result = sql """show table stats partition_test"""
+    assertEquals(partition_result[0][6], "true")
+    sql """analyze table partition_test with sync"""
+    partition_result = sql """show table stats partition_test"""
+    assertEquals(partition_result[0][6], "false")
+    sql """insert into partition_test values (102, '1', '1')"""
+    partition_result = sql """show table stats partition_test"""
+    assertEquals(partition_result[0][6], "false")
+    sql """insert into partition_test values (2, '2', '2')"""
+    sql """insert into partition_test values (3, '3', '3')"""
+    sql """insert into partition_test values (4, '4', '4')"""
+    sql """insert into partition_test values (5, '5', '5')"""
+    sql """insert into partition_test values (6, '6', '6')"""
+    sql """insert into partition_test values (7, '7', '7')"""
+    sql """insert into partition_test values (8, '8', '8')"""
+    sql """insert into partition_test values (9, '9', '9')"""
+    sql """insert into partition_test values (10, '10', '10')"""
+    sql """insert into partition_test values (103, '1', '1')"""
+    sql """insert into partition_test values (104, '1', '1')"""
+    sql """insert into partition_test values (105, '1', '1')"""
+    partition_result = sql """show table stats partition_test"""
+    assertEquals(partition_result[0][6], "false")
+    sql """analyze table partition_test with sync"""
+    sql """insert into partition_test values (201, '1', '1')"""
+    partition_result = sql """show table stats partition_test"""
+    assertEquals(partition_result[0][6], "true")
+    partition_result = sql """show column stats partition_test(id)"""
+    assertEquals("id", partition_result[0][0])
+    assertEquals("15.0", partition_result[0][2])
+    partition_result = sql """show column stats partition_test(name)"""
+    assertEquals("name", partition_result[0][0])
+    assertEquals("15.0", partition_result[0][2])
+    partition_result = sql """show column stats partition_test(comment)"""
+    assertEquals("comment", partition_result[0][0])
+    assertEquals("15.0", partition_result[0][2])
+
+
     // Test sample agg table value column. Min max is N/A when zone map is not 
available.
     sql """
      CREATE TABLE `agg_table_test` (
@@ -177,7 +244,7 @@ suite("test_analyze") {
      PROPERTIES (
       "replication_num" = "1"
      );
-   """
+    """
     sql """insert into agg_table_test values (1,'name1'), (2, 'name2')"""
     sql """analyze table agg_table_test with sample rows 100 with sync"""
     def agg_result = sql """show column stats agg_table_test (name)"""
@@ -185,6 +252,34 @@ suite("test_analyze") {
     assertEquals(agg_result[0][7], "N/A")
     assertEquals(agg_result[0][8], "N/A")
 
+    // Continue test partition load data for the first time.
+    def reported = false;
+    for (int i = 0; i < 10; i++) {
+        def data_result = sql """show data from partition_test"""
+        logger.info("show data from partition_test: " + data_result)
+        if (data_result[0][4] == '16') {
+            reported = true;
+            break;
+        }
+        sleep(1000)
+    }
+    if (!reported) {
+        logger.info("partition_test row count is not reported.")
+    } else {
+        sql """analyze table partition_test(name) 
PROPERTIES("use.auto.analyzer"="true")"""
+        partition_result = sql """show table stats partition_test"""
+        assertEquals(partition_result[0][6], "false")
+        partition_result = sql """show column stats partition_test(id)"""
+        assertEquals("id", partition_result[0][0])
+        assertEquals("16.0", partition_result[0][2])
+        partition_result = sql """show column stats partition_test(name)"""
+        assertEquals("name", partition_result[0][0])
+        assertEquals("16.0", partition_result[0][2])
+        partition_result = sql """show column stats partition_test(comment)"""
+        assertEquals("comment", partition_result[0][0])
+        assertEquals("16.0", partition_result[0][2])
+    }
+
     def a_result_1 = sql """
         ANALYZE DATABASE ${db} WITH SYNC WITH SAMPLE PERCENT 10
     """
@@ -2596,42 +2691,6 @@ PARTITION `p599` VALUES IN (599)
     assertEquals("\'name1\'", result[0][7])
     assertEquals("\'name3\'", result[0][8])
 
-    // Test partititon load data for the first time.
-    sql """
-     CREATE TABLE `partition_test` (
-      `id` INT NOT NULL,
-      `name` VARCHAR(25) NOT NULL,
-      `comment` VARCHAR(152) NULL
-      ) ENGINE=OLAP
-      DUPLICATE KEY(`id`)
-      COMMENT 'OLAP'
-      PARTITION BY RANGE(`id`)
-      (PARTITION p1 VALUES [("0"), ("100")),
-       PARTITION p2 VALUES [("100"), ("200")),
-       PARTITION p3 VALUES [("200"), ("300")))
-      DISTRIBUTED BY HASH(`id`) BUCKETS 1
-      PROPERTIES (
-       "replication_num" = "1");
-     """
-
-    sql """analyze table partition_test with sync"""
-    sql """insert into partition_test values (1, '1', '1')"""
-    def partition_result = sql """show table stats partition_test"""
-    assertEquals(partition_result[0][6], "true")
-    assertEquals(partition_result[0][0], "1")
-    sql """analyze table partition_test with sync"""
-    partition_result = sql """show table stats partition_test"""
-    assertEquals(partition_result[0][6], "false")
-    sql """insert into partition_test values (101, '1', '1')"""
-    partition_result = sql """show table stats partition_test"""
-    assertEquals(partition_result[0][6], "true")
-    sql """analyze table partition_test(id) with sync"""
-    partition_result = sql """show table stats partition_test"""
-    assertEquals(partition_result[0][6], "false")
-    sql """insert into partition_test values (102, '1', '1')"""
-    partition_result = sql """show table stats partition_test"""
-    assertEquals(partition_result[0][6], "false")
-
     // Test sample string type min max
     sql """
      CREATE TABLE `string_min_max` (
diff --git 
a/regression-test/suites/statistics/analyze_stats_partition_first_load.groovy 
b/regression-test/suites/statistics/analyze_stats_partition_first_load.groovy
index d25ff8086c4..10a5d98df60 100644
--- 
a/regression-test/suites/statistics/analyze_stats_partition_first_load.groovy
+++ 
b/regression-test/suites/statistics/analyze_stats_partition_first_load.groovy
@@ -57,6 +57,9 @@ suite("test_analyze_partition_first_load") {
     assertEquals(partition_result.new_partition[0], "true")
     sql """analyze table `$tbl`(id) with sync"""
     partition_result = sql_return_maparray """show table stats `$tbl`"""
+    assertEquals(partition_result.new_partition[0], "true")
+    sql """analyze table `$tbl` with sync"""
+    partition_result = sql_return_maparray """show table stats `$tbl`"""
     assertEquals(partition_result.new_partition[0], "false")
     sql """insert into `$tbl` values (102, '1', '1')"""
     partition_result = sql_return_maparray """show table stats `$tbl`"""
@@ -84,7 +87,7 @@ suite("test_analyze_partition_first_load") {
         assertEquals(partition_result.new_partition[0], "true") // last 
chance, still failure?
     }
 
-    sql """analyze table `$tbl`(id) with sync"""
+    sql """analyze table `$tbl` with sync"""
     partition_result = sql_return_maparray """show table stats `$tbl`"""
     assertEquals(partition_result.new_partition[0], "false")
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to