This is an automated email from the ASF dual-hosted git repository. dataroaring pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/doris.git
commit 6c2da4c30404a324b53ba97d8e98df1160211a89 Author: Jibing-Li <[email protected]> AuthorDate: Wed Jul 31 18:05:45 2024 +0800 [improvement](statistics)Analyze all columns when partition first loaded. (#38540) When a partition load data for the first time, will analyze all columns in the table in the next auto analyze iteration. --- .../doris/statistics/StatisticsAutoCollector.java | 11 +- .../apache/doris/statistics/TableStatsMeta.java | 8 -- .../doris/statistics/util/StatisticsUtil.java | 2 +- .../suites/statistics/analyze_stats.groovy | 133 +++++++++++++++------ .../analyze_stats_partition_first_load.groovy | 5 +- 5 files changed, 109 insertions(+), 50 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java index e0b046befc9..2ca0ab1c265 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java @@ -143,7 +143,7 @@ public class StatisticsAutoCollector extends MasterDaemon { protected void processOneJob(TableIf table, Set<Pair<String, String>> columns, JobPriority priority) throws DdlException { // appendMvColumn(table, columns); - appendPartitionColumns(table, columns); + appendAllColumns(table, columns); columns = columns.stream().filter(c -> StatisticsUtil.needAnalyzeColumn(table, c)).collect(Collectors.toSet()); if (columns.isEmpty()) { return; @@ -161,7 +161,8 @@ public class StatisticsAutoCollector extends MasterDaemon { } } - protected void appendPartitionColumns(TableIf table, Set<Pair<String, String>> columns) throws DdlException { + // If partition changed (partition first loaded, partition dropped and so on), need re-analyze all columns. + protected void appendAllColumns(TableIf table, Set<Pair<String, String>> columns) throws DdlException { if (!(table instanceof OlapTable)) { return; } @@ -169,7 +170,11 @@ public class StatisticsAutoCollector extends MasterDaemon { TableStatsMeta tableStatsStatus = manager.findTableStatsStatus(table.getId()); if (tableStatsStatus != null && tableStatsStatus.partitionChanged.get()) { OlapTable olapTable = (OlapTable) table; - columns.addAll(olapTable.getColumnIndexPairs(olapTable.getPartitionColumnNames())); + Set<String> allColumnPairs = olapTable.getSchemaAllIndexes(false).stream() + .filter(c -> !StatisticsUtil.isUnsupportedType(c.getType())) + .map(Column::getName) + .collect(Collectors.toSet()); + columns.addAll(olapTable.getColumnIndexPairs(allColumnPairs)); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java index 4c984394f3c..1048708958b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java @@ -19,7 +19,6 @@ package org.apache.doris.statistics; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.OlapTable; -import org.apache.doris.catalog.PartitionInfo; import org.apache.doris.catalog.TableIf; import org.apache.doris.common.Pair; import org.apache.doris.common.io.Text; @@ -166,13 +165,6 @@ public class TableStatsMeta implements Writable, GsonPostProcessable { .map(Column::getName).collect(Collectors.toSet())))) { partitionChanged.set(false); userInjected = false; - } else if (tableIf instanceof OlapTable) { - PartitionInfo partitionInfo = ((OlapTable) tableIf).getPartitionInfo(); - if (partitionInfo != null && analyzedJob.jobColumns - .containsAll(tableIf.getColumnIndexPairs(partitionInfo.getPartitionColumns().stream() - .map(Column::getName).collect(Collectors.toSet())))) { - partitionChanged.set(false); - } } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java index 5947fcc0e60..45b7454adff 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java @@ -1011,7 +1011,7 @@ public class StatisticsUtil { if (table instanceof OlapTable) { OlapTable olapTable = (OlapTable) table; // 0. Check new partition first time loaded flag. - if (olapTable.isPartitionColumn(column.second) && tableStatsStatus.partitionChanged.get()) { + if (tableStatsStatus.partitionChanged.get()) { return true; } // 1. Check row count. diff --git a/regression-test/suites/statistics/analyze_stats.groovy b/regression-test/suites/statistics/analyze_stats.groovy index 852dbeb1321..37f59406d25 100644 --- a/regression-test/suites/statistics/analyze_stats.groovy +++ b/regression-test/suites/statistics/analyze_stats.groovy @@ -165,6 +165,73 @@ suite("test_analyze") { exception = e } + // Test partition load data for the first time. + sql """ + CREATE TABLE `partition_test` ( + `id` INT NOT NULL, + `name` VARCHAR(25) NOT NULL, + `comment` VARCHAR(152) NULL + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT 'OLAP' + PARTITION BY RANGE(`id`) + (PARTITION p1 VALUES [("0"), ("100")), + PARTITION p2 VALUES [("100"), ("200")), + PARTITION p3 VALUES [("200"), ("300"))) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1"); + """ + + sql """analyze table partition_test with sync""" + sql """insert into partition_test values (1, '1', '1')""" + def partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "true") + assertEquals(partition_result[0][0], "1") + sql """analyze table partition_test with sync""" + partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "false") + sql """insert into partition_test values (101, '1', '1')""" + partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "true") + sql """analyze table partition_test(id) with sync""" + partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "true") + sql """analyze table partition_test with sync""" + partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "false") + sql """insert into partition_test values (102, '1', '1')""" + partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "false") + sql """insert into partition_test values (2, '2', '2')""" + sql """insert into partition_test values (3, '3', '3')""" + sql """insert into partition_test values (4, '4', '4')""" + sql """insert into partition_test values (5, '5', '5')""" + sql """insert into partition_test values (6, '6', '6')""" + sql """insert into partition_test values (7, '7', '7')""" + sql """insert into partition_test values (8, '8', '8')""" + sql """insert into partition_test values (9, '9', '9')""" + sql """insert into partition_test values (10, '10', '10')""" + sql """insert into partition_test values (103, '1', '1')""" + sql """insert into partition_test values (104, '1', '1')""" + sql """insert into partition_test values (105, '1', '1')""" + partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "false") + sql """analyze table partition_test with sync""" + sql """insert into partition_test values (201, '1', '1')""" + partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "true") + partition_result = sql """show column stats partition_test(id)""" + assertEquals("id", partition_result[0][0]) + assertEquals("15.0", partition_result[0][2]) + partition_result = sql """show column stats partition_test(name)""" + assertEquals("name", partition_result[0][0]) + assertEquals("15.0", partition_result[0][2]) + partition_result = sql """show column stats partition_test(comment)""" + assertEquals("comment", partition_result[0][0]) + assertEquals("15.0", partition_result[0][2]) + + // Test sample agg table value column. Min max is N/A when zone map is not available. sql """ CREATE TABLE `agg_table_test` ( @@ -177,7 +244,7 @@ suite("test_analyze") { PROPERTIES ( "replication_num" = "1" ); - """ + """ sql """insert into agg_table_test values (1,'name1'), (2, 'name2')""" sql """analyze table agg_table_test with sample rows 100 with sync""" def agg_result = sql """show column stats agg_table_test (name)""" @@ -185,6 +252,34 @@ suite("test_analyze") { assertEquals(agg_result[0][7], "N/A") assertEquals(agg_result[0][8], "N/A") + // Continue test partition load data for the first time. + def reported = false; + for (int i = 0; i < 10; i++) { + def data_result = sql """show data from partition_test""" + logger.info("show data from partition_test: " + data_result) + if (data_result[0][4] == '16') { + reported = true; + break; + } + sleep(1000) + } + if (!reported) { + logger.info("partition_test row count is not reported.") + } else { + sql """analyze table partition_test(name) PROPERTIES("use.auto.analyzer"="true")""" + partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "false") + partition_result = sql """show column stats partition_test(id)""" + assertEquals("id", partition_result[0][0]) + assertEquals("16.0", partition_result[0][2]) + partition_result = sql """show column stats partition_test(name)""" + assertEquals("name", partition_result[0][0]) + assertEquals("16.0", partition_result[0][2]) + partition_result = sql """show column stats partition_test(comment)""" + assertEquals("comment", partition_result[0][0]) + assertEquals("16.0", partition_result[0][2]) + } + def a_result_1 = sql """ ANALYZE DATABASE ${db} WITH SYNC WITH SAMPLE PERCENT 10 """ @@ -2596,42 +2691,6 @@ PARTITION `p599` VALUES IN (599) assertEquals("\'name1\'", result[0][7]) assertEquals("\'name3\'", result[0][8]) - // Test partititon load data for the first time. - sql """ - CREATE TABLE `partition_test` ( - `id` INT NOT NULL, - `name` VARCHAR(25) NOT NULL, - `comment` VARCHAR(152) NULL - ) ENGINE=OLAP - DUPLICATE KEY(`id`) - COMMENT 'OLAP' - PARTITION BY RANGE(`id`) - (PARTITION p1 VALUES [("0"), ("100")), - PARTITION p2 VALUES [("100"), ("200")), - PARTITION p3 VALUES [("200"), ("300"))) - DISTRIBUTED BY HASH(`id`) BUCKETS 1 - PROPERTIES ( - "replication_num" = "1"); - """ - - sql """analyze table partition_test with sync""" - sql """insert into partition_test values (1, '1', '1')""" - def partition_result = sql """show table stats partition_test""" - assertEquals(partition_result[0][6], "true") - assertEquals(partition_result[0][0], "1") - sql """analyze table partition_test with sync""" - partition_result = sql """show table stats partition_test""" - assertEquals(partition_result[0][6], "false") - sql """insert into partition_test values (101, '1', '1')""" - partition_result = sql """show table stats partition_test""" - assertEquals(partition_result[0][6], "true") - sql """analyze table partition_test(id) with sync""" - partition_result = sql """show table stats partition_test""" - assertEquals(partition_result[0][6], "false") - sql """insert into partition_test values (102, '1', '1')""" - partition_result = sql """show table stats partition_test""" - assertEquals(partition_result[0][6], "false") - // Test sample string type min max sql """ CREATE TABLE `string_min_max` ( diff --git a/regression-test/suites/statistics/analyze_stats_partition_first_load.groovy b/regression-test/suites/statistics/analyze_stats_partition_first_load.groovy index d25ff8086c4..10a5d98df60 100644 --- a/regression-test/suites/statistics/analyze_stats_partition_first_load.groovy +++ b/regression-test/suites/statistics/analyze_stats_partition_first_load.groovy @@ -57,6 +57,9 @@ suite("test_analyze_partition_first_load") { assertEquals(partition_result.new_partition[0], "true") sql """analyze table `$tbl`(id) with sync""" partition_result = sql_return_maparray """show table stats `$tbl`""" + assertEquals(partition_result.new_partition[0], "true") + sql """analyze table `$tbl` with sync""" + partition_result = sql_return_maparray """show table stats `$tbl`""" assertEquals(partition_result.new_partition[0], "false") sql """insert into `$tbl` values (102, '1', '1')""" partition_result = sql_return_maparray """show table stats `$tbl`""" @@ -84,7 +87,7 @@ suite("test_analyze_partition_first_load") { assertEquals(partition_result.new_partition[0], "true") // last chance, still failure? } - sql """analyze table `$tbl`(id) with sync""" + sql """analyze table `$tbl` with sync""" partition_result = sql_return_maparray """show table stats `$tbl`""" assertEquals(partition_result.new_partition[0], "false") } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
