This is an automated email from the ASF dual-hosted git repository.

lijibing pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new 002232b2da3 [improvement](statistics)Analyze all columns when partition first loaded. (#38606)
002232b2da3 is described below

commit 002232b2da3caf2eb5c239b585111f1bd0f01448
Author: Jibing-Li <64681310+jibing...@users.noreply.github.com>
AuthorDate: Wed Jul 31 19:59:41 2024 +0800

    [improvement](statistics)Analyze all columns when partition first loaded. (#38606)

    backport: https://github.com/apache/doris/pull/38540
---
 .../apache/doris/analysis/AnalyzeProperties.java   |   2 +
 .../java/org/apache/doris/catalog/OlapTable.java   |   4 +
 .../apache/doris/statistics/AnalysisManager.java   |  16 +-
 .../doris/statistics/StatisticsAutoCollector.java  |   7 -
 .../apache/doris/statistics/TableStatsMeta.java    |  12 +-
 .../doris/statistics/AnalysisManagerTest.java      |   8 +-
 .../suites/statistics/analyze_stats.groovy         | 264 +++++++++++++--------
 7 files changed, 198 insertions(+), 115 deletions(-)

diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeProperties.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeProperties.java
index 4b5f161d2be..11197cfe62e 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeProperties.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeProperties.java
@@ -44,6 +44,7 @@ public class AnalyzeProperties {
     public static final String PROPERTY_PERIOD_SECONDS = "period.seconds";
     public static final String PROPERTY_FORCE_FULL = "force.full";
     public static final String PROPERTY_PARTITION_COLUMN_FROM_SQL = "partition.column.from.sql";
+    public static final String PROPERTY_USE_AUTO_ANALYZER = "use.auto.analyzer";
 
     public static final AnalyzeProperties DEFAULT_PROP = new AnalyzeProperties(new HashMap<String, String>() {
         {
@@ -72,6 +73,7 @@ public class AnalyzeProperties {
             .add(PROPERTY_PERIOD_CRON)
             .add(PROPERTY_FORCE_FULL)
             .add(PROPERTY_PARTITION_COLUMN_FROM_SQL)
+            .add(PROPERTY_USE_AUTO_ANALYZER)
             .build();
 
     public AnalyzeProperties(Map<String, String> properties) {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java
index 55ec9938d9b..12550899a46 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java
@@ -1304,6 +1304,10 @@ public class OlapTable extends Table implements MTMVRelatedTableIf {
             return true;
         }
 
+        // Check new partition first loaded.
+        if (tblStats.newPartitionLoaded != null && tblStats.newPartitionLoaded.get()) {
+            return true;
+        }
         // 1 Check row count.
         long currentRowCount = getRowCount();
         long lastAnalyzeRowCount = tblStats.rowCount;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java
index 71f5ce0fa87..d4d0231505f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java
@@ -204,7 +204,21 @@ public class AnalysisManager implements Writable {
     }
 
     // Each analyze stmt corresponding to an analysis job.
-    public void createAnalysisJob(AnalyzeTblStmt stmt, boolean proxy) throws DdlException {
+    public void createAnalysisJob(AnalyzeTblStmt stmt, boolean proxy) throws DdlException, AnalysisException {
+        // Using auto analyzer if user specifies.
+        if (stmt.getAnalyzeProperties().getProperties().containsKey("use.auto.analyzer")) {
+            StatisticsAutoCollector autoCollector = Env.getCurrentEnv().getStatisticsAutoCollector();
+            if (autoCollector.skip(stmt.getTable())) {
+                return;
+            }
+            List<AnalysisInfo> jobs = new ArrayList<>();
+            autoCollector.createAnalyzeJobForTbl(stmt.getDb(), jobs, stmt.getTable());
+            AnalysisInfo job = autoCollector.getNeedAnalyzeColumns(jobs.get(0));
+            if (job != null) {
+                Env.getCurrentEnv().getStatisticsAutoCollector().createSystemAnalysisJob(job);
+            }
+            return;
+        }
         AnalysisInfo jobInfo = buildAndAssignJob(stmt);
         if (jobInfo == null) {
             return;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java
index 4408a0d9255..f76037d80cb 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java
@@ -21,7 +21,6 @@ import org.apache.doris.catalog.Column;
 import org.apache.doris.catalog.DatabaseIf;
 import org.apache.doris.catalog.Env;
 import org.apache.doris.catalog.OlapTable;
-import org.apache.doris.catalog.Partition;
 import org.apache.doris.catalog.TableIf;
 import org.apache.doris.common.Config;
 import org.apache.doris.common.DdlException;
@@ -41,7 +40,6 @@ import org.apache.logging.log4j.Logger;
 import java.time.LocalTime;
 import java.util.ArrayList;
 import java.util.List;
-import java.util.Set;
 import java.util.StringJoiner;
 import java.util.concurrent.TimeUnit;
 import java.util.stream.Collectors;
@@ -221,11 +219,6 @@ public class StatisticsAutoCollector extends StatisticsCollector {
         if (table.needReAnalyzeTable(tblStats)) {
             needRunColumns = table.getColumnIndexPairs(table.getSchemaAllIndexes(false)
                     .stream().map(Column::getName).collect(Collectors.toSet()));
-        } else if (table instanceof OlapTable && tblStats.newPartitionLoaded.get()) {
-            OlapTable olapTable = (OlapTable) table;
-            Set<String> partitionNames = olapTable.getAllPartitions().stream()
-                    .map(Partition::getName).collect(Collectors.toSet());
-            needRunColumns = olapTable.getColumnIndexPairs(partitionNames);
         }
 
         if (needRunColumns == null || needRunColumns.isEmpty()) {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java
index bb2f19ac25c..2d38b7ccd34 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java
@@ -19,7 +19,6 @@ package org.apache.doris.statistics;
 
 import org.apache.doris.catalog.Column;
 import org.apache.doris.catalog.OlapTable;
-import org.apache.doris.catalog.PartitionInfo;
 import org.apache.doris.catalog.TableIf;
 import org.apache.doris.common.Pair;
 import org.apache.doris.common.io.Text;
@@ -160,14 +159,6 @@ public class TableStatsMeta implements Writable, GsonPostProcessable {
                 updatedRows.set(0);
                 newPartitionLoaded.set(false);
             }
-            if (tableIf instanceof OlapTable) {
-                PartitionInfo partitionInfo = ((OlapTable) tableIf).getPartitionInfo();
-                if (partitionInfo != null && analyzedJob.jobColumns
-                        .containsAll(tableIf.getColumnIndexPairs(partitionInfo.getPartitionColumns().stream()
-                                .map(Column::getName).collect(Collectors.toSet())))) {
-                    newPartitionLoaded.set(false);
-                }
-            }
         }
     }
 
@@ -180,6 +171,9 @@ public class TableStatsMeta implements Writable, GsonPostProcessable {
         if (indexesRowCount == null) {
             indexesRowCount = new ConcurrentHashMap<>();
         }
+        if (newPartitionLoaded == null) {
+            newPartitionLoaded = new AtomicBoolean(false);
+        }
     }
 
     public long getRowCount(long indexId) {
diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisManagerTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisManagerTest.java
index 8a803bd2a30..012ab383fd7 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisManagerTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisManagerTest.java
@@ -285,12 +285,18 @@ public class AnalysisManagerTest {
         };
 
         OlapTable olapTable = new OlapTable();
+        TableStatsMeta stats0 = new TableStatsMeta(
+                50, new AnalysisInfoBuilder().setJobColumns(new ArrayList<>())
+                .setColName("col1").setRowCount(100).build(), olapTable);
+        stats0.newPartitionLoaded.set(true);
+        Assertions.assertTrue(olapTable.needReAnalyzeTable(stats0));
+
         TableStatsMeta stats1 = new TableStatsMeta(
                 50, new AnalysisInfoBuilder().setJobColumns(new ArrayList<>())
                 .setColName("col1").setRowCount(100).build(), olapTable);
         stats1.updatedRows.addAndGet(70);
-        Assertions.assertTrue(olapTable.needReAnalyzeTable(stats1));
+
         TableStatsMeta stats2 = new TableStatsMeta(
                 190, new AnalysisInfoBuilder().setJobColumns(new ArrayList<>())
                 .setColName("col1").setRowCount(200).build(), olapTable);
diff --git a/regression-test/suites/statistics/analyze_stats.groovy b/regression-test/suites/statistics/analyze_stats.groovy
index 5a6e753e0a3..09a8ecbcad3 100644
--- a/regression-test/suites/statistics/analyze_stats.groovy
+++ b/regression-test/suites/statistics/analyze_stats.groovy
@@ -164,7 +164,73 @@ suite("test_analyze") {
         exception = e
     }
 
-    // Test sample agg table value column. Min max is N/A when zone map is not available.
+    // Test partition load data for the first time.
+ sql """ + CREATE TABLE `partition_test` ( + `id` INT NOT NULL, + `name` VARCHAR(25) NOT NULL, + `comment` VARCHAR(152) NULL + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT 'OLAP' + PARTITION BY RANGE(`id`) + (PARTITION p1 VALUES [("0"), ("100")), + PARTITION p2 VALUES [("100"), ("200")), + PARTITION p3 VALUES [("200"), ("300"))) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1"); + """ + + sql """analyze table partition_test with sync""" + sql """insert into partition_test values (1, '1', '1')""" + def partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "true") + assertEquals(partition_result[0][0], "1") + sql """analyze table partition_test with sync""" + partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "false") + sql """insert into partition_test values (101, '1', '1')""" + partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "true") + sql """analyze table partition_test(id) with sync""" + partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "true") + sql """analyze table partition_test with sync""" + partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "false") + sql """insert into partition_test values (102, '1', '1')""" + partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "false") + sql """insert into partition_test values (2, '2', '2')""" + sql """insert into partition_test values (3, '3', '3')""" + sql """insert into partition_test values (4, '4', '4')""" + sql """insert into partition_test values (5, '5', '5')""" + sql """insert into partition_test values (6, '6', '6')""" + sql """insert into partition_test values (7, '7', '7')""" + sql """insert into partition_test values (8, '8', '8')""" + sql """insert into partition_test values (9, '9', '9')""" + sql """insert into partition_test values (10, '10', '10')""" + sql """insert into partition_test values (103, '1', '1')""" + sql """insert into partition_test values (104, '1', '1')""" + sql """insert into partition_test values (105, '1', '1')""" + partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "false") + sql """analyze table partition_test with sync""" + sql """insert into partition_test values (201, '1', '1')""" + partition_result = sql """show table stats partition_test""" + assertEquals(partition_result[0][6], "true") + partition_result = sql """show column stats partition_test(id)""" + assertEquals("id", partition_result[0][0]) + assertEquals("15.0", partition_result[0][2]) + partition_result = sql """show column stats partition_test(name)""" + assertEquals("name", partition_result[0][0]) + assertEquals("15.0", partition_result[0][2]) + partition_result = sql """show column stats partition_test(comment)""" + assertEquals("comment", partition_result[0][0]) + assertEquals("15.0", partition_result[0][2]) + + // Test sample agg table value column sql """ CREATE TABLE `agg_table_test` ( `id` BIGINT NOT NULL, @@ -184,6 +250,43 @@ suite("test_analyze") { assertEquals(agg_result[0][7], "N/A") assertEquals(agg_result[0][8], "N/A") + // Continue test partition load data for the first time. 
+    def reported = false;
+    for (int i = 0; i < 10; i++) {
+        def data_result = sql """show data from partition_test"""
+        logger.info("show data from partition_test: " + data_result)
+        if (data_result[0][4] == '16') {
+            reported = true;
+            break;
+        }
+        sleep(1000)
+    }
+    if (!reported) {
+        logger.info("partition_test row count is not reported.")
+    } else {
+        sql """analyze table partition_test PROPERTIES("use.auto.analyzer"="true")"""
+        for (int i = 0; i < 10; i++) {
+            def auto_analyze_result = sql """show auto analyze partition_test"""
+            logger.info("show auto analyze partition_test result : " + auto_analyze_result)
+            if (auto_analyze_result[0][9] == 'FINISHED') {
+                logger.info("Auto analyze finished.")
+                auto_analyze_result = sql """show table stats partition_test"""
+                assertEquals(auto_analyze_result[0][6], "false")
+                auto_analyze_result = sql """show column stats partition_test(id)"""
+                assertEquals("id", auto_analyze_result[0][0])
+                assertEquals("16.0", auto_analyze_result[0][2])
+                auto_analyze_result = sql """show column stats partition_test(name)"""
+                assertEquals("name", auto_analyze_result[0][0])
+                assertEquals("16.0", auto_analyze_result[0][2])
+                auto_analyze_result = sql """show column stats partition_test(comment)"""
+                assertEquals("comment", auto_analyze_result[0][0])
+                assertEquals("16.0", auto_analyze_result[0][2])
+                break
+            }
+            sleep(1000)
+        }
+    }
+
     def a_result_1 = sql """
         ANALYZE DATABASE ${db} WITH SYNC WITH SAMPLE PERCENT 10
     """
@@ -2595,42 +2698,6 @@ PARTITION `p599` VALUES IN (599)
     assertEquals("\'name1\'", result[0][7])
     assertEquals("\'name3\'", result[0][8])
 
-    // Test partititon load data for the first time.
-    sql """
-    CREATE TABLE `partition_test` (
-        `id` INT NOT NULL,
-        `name` VARCHAR(25) NOT NULL,
-        `comment` VARCHAR(152) NULL
-    ) ENGINE=OLAP
-    DUPLICATE KEY(`id`)
-    COMMENT 'OLAP'
-    PARTITION BY RANGE(`id`)
-    (PARTITION p1 VALUES [("0"), ("100")),
-    PARTITION p2 VALUES [("100"), ("200")),
-    PARTITION p3 VALUES [("200"), ("300")))
-    DISTRIBUTED BY HASH(`id`) BUCKETS 1
-    PROPERTIES (
-        "replication_num" = "1");
-    """
-
-    sql """analyze table partition_test with sync"""
-    sql """insert into partition_test values (1, '1', '1')"""
-    def partition_result = sql """show table stats partition_test"""
-    assertEquals(partition_result[0][6], "true")
-    assertEquals(partition_result[0][0], "1")
-    sql """analyze table partition_test with sync"""
-    partition_result = sql """show table stats partition_test"""
-    assertEquals(partition_result[0][6], "false")
-    sql """insert into partition_test values (101, '1', '1')"""
-    partition_result = sql """show table stats partition_test"""
-    assertEquals(partition_result[0][6], "true")
-    sql """analyze table partition_test(id) with sync"""
-    partition_result = sql """show table stats partition_test"""
-    assertEquals(partition_result[0][6], "false")
-    sql """insert into partition_test values (102, '1', '1')"""
-    partition_result = sql """show table stats partition_test"""
-    assertEquals(partition_result[0][6], "false")
-
     // Test sample string type min max
     sql """
         CREATE TABLE `string_min_max` (
@@ -2646,6 +2713,7 @@ PARTITION `p599` VALUES IN (599)
     """
     sql """insert into string_min_max values (1,'name1'), (2, 'name2')"""
     sql """analyze table string_min_max with sync"""
+
     explain {
         sql("select min(name), max(name) from string_min_max")
         contains "pushAggOp=NONE"
@@ -2656,19 +2724,21 @@ PARTITION `p599` VALUES IN (599)
         sql("select min(name), max(name) from string_min_max where id > 0")
        contains "pushAggOp=MINMAX"
     }
 
+    sql """set forbid_unknown_col_stats=true"""
+
     // Test alter
     sql """
-        CREATE TABLE alter_test(
-            `id` int NOT NULL,
-            `name` VARCHAR(25) NOT NULL
-        )ENGINE=OLAP
-        DUPLICATE KEY(`id`)
-        COMMENT "OLAP"
-        DISTRIBUTED BY HASH(`id`) BUCKETS 1
-        PROPERTIES (
-            "replication_num" = "1"
-        );
-    """
+       CREATE TABLE alter_test(
+           `id` int NOT NULL,
+           `name` VARCHAR(25) NOT NULL
+       )ENGINE=OLAP
+       DUPLICATE KEY(`id`)
+       COMMENT "OLAP"
+       DISTRIBUTED BY HASH(`id`) BUCKETS 1
+       PROPERTIES (
+           "replication_num" = "1"
+       );
+       """
     sql """ANALYZE TABLE alter_test WITH SYNC"""
     def alter_result = sql """show table stats alter_test"""
     assertEquals("false", alter_result[0][7])
@@ -2710,17 +2780,17 @@ PARTITION `p599` VALUES IN (599)
     sql """CREATE DATABASE IF NOT EXISTS trigger"""
     sql """USE trigger"""
     sql """
-        CREATE TABLE if not exists trigger_test(
-            `id` int NOT NULL,
-            `name` VARCHAR(152)
-        )ENGINE=OLAP
-        DUPLICATE KEY(`id`)
-        COMMENT "OLAP"
-        DISTRIBUTED BY HASH(`id`) BUCKETS 1
-        PROPERTIES (
-            "replication_num" = "1"
-        );
-    """
+       CREATE TABLE if not exists trigger_test(
+           `id` int NOT NULL,
+           `name` VARCHAR(152)
+       )ENGINE=OLAP
+       DUPLICATE KEY(`id`)
+       COMMENT "OLAP"
+       DISTRIBUTED BY HASH(`id`) BUCKETS 1
+       PROPERTIES (
+           "replication_num" = "1"
+       );
+       """
     // Test sample empty table
     def result_sample = sql """analyze table trigger_test with sample percent 10 with sync"""
     result_sample = sql """show column stats trigger_test"""
@@ -2797,43 +2867,43 @@ PARTITION `p599` VALUES IN (599)
 
     // Test inject stats avg_size.
     sql """CREATE TABLE `date_dim` (
-        `d_date_sk` BIGINT NOT NULL,
-        `d_date_id` CHAR(16) NOT NULL,
-        `d_date` DATE NULL,
-        `d_month_seq` INT NULL,
-        `d_week_seq` INT NULL,
-        `d_quarter_seq` INT NULL,
-        `d_year` INT NULL,
-        `d_dow` INT NULL,
-        `d_moy` INT NULL,
-        `d_dom` INT NULL,
-        `d_qoy` INT NULL,
-        `d_fy_year` INT NULL,
-        `d_fy_quarter_seq` INT NULL,
-        `d_fy_week_seq` INT NULL,
-        `d_day_name` CHAR(9) NULL,
-        `d_quarter_name` CHAR(6) NULL,
-        `d_holiday` CHAR(1) NULL,
-        `d_weekend` CHAR(1) NULL,
-        `d_following_holiday` CHAR(1) NULL,
-        `d_first_dom` INT NULL,
-        `d_last_dom` INT NULL,
-        `d_same_day_ly` INT NULL,
-        `d_same_day_lq` INT NULL,
-        `d_current_day` CHAR(1) NULL,
-        `d_current_week` CHAR(1) NULL,
-        `d_current_month` CHAR(1) NULL,
-        `d_current_quarter` CHAR(1) NULL,
-        `d_current_year` CHAR(1) NULL
-    ) ENGINE=OLAP
-    DUPLICATE KEY(`d_date_sk`)
-    DISTRIBUTED BY HASH(`d_date_sk`) BUCKETS 12
-    PROPERTIES (
-        "replication_allocation" = "tag.location.default: 1")
+            `d_date_sk` BIGINT NOT NULL,
+            `d_date_id` CHAR(16) NOT NULL,
+            `d_date` DATE NULL,
+            `d_month_seq` INT NULL,
+            `d_week_seq` INT NULL,
+            `d_quarter_seq` INT NULL,
+            `d_year` INT NULL,
+            `d_dow` INT NULL,
+            `d_moy` INT NULL,
+            `d_dom` INT NULL,
+            `d_qoy` INT NULL,
+            `d_fy_year` INT NULL,
+            `d_fy_quarter_seq` INT NULL,
+            `d_fy_week_seq` INT NULL,
+            `d_day_name` CHAR(9) NULL,
+            `d_quarter_name` CHAR(6) NULL,
+            `d_holiday` CHAR(1) NULL,
+            `d_weekend` CHAR(1) NULL,
+            `d_following_holiday` CHAR(1) NULL,
+            `d_first_dom` INT NULL,
+            `d_last_dom` INT NULL,
+            `d_same_day_ly` INT NULL,
+            `d_same_day_lq` INT NULL,
+            `d_current_day` CHAR(1) NULL,
+            `d_current_week` CHAR(1) NULL,
+            `d_current_month` CHAR(1) NULL,
+            `d_current_quarter` CHAR(1) NULL,
+            `d_current_year` CHAR(1) NULL
+        ) ENGINE=OLAP
+        DUPLICATE KEY(`d_date_sk`)
+        DISTRIBUTED BY HASH(`d_date_sk`) BUCKETS 12
+        PROPERTIES (
+            "replication_allocation" = "tag.location.default: 1")
     """
     sql """
-        alter table date_dim modify column d_day_name set stats ('row_count'='73049', 'ndv'='7', 'num_nulls'='0', 'min_value'='Friday', 'max_value'='Wednesday', 'data_size'='521779')
+        alter table date_dim modify
+        column d_day_name set stats ('row_count'='73049', 'ndv'='7', 'num_nulls'='0', 'min_value'='Friday', 'max_value'='Wednesday', 'data_size'='521779')
     """
 
     alter_result = sql """show column cached stats date_dim"""
@@ -2847,17 +2917,17 @@ PARTITION `p599` VALUES IN (599)
 
     // Test analyze after new empty partition created.
     sql """CREATE TABLE `part` (
-    `id` INT NULL,
-    `colint` INT NULL
+        `id` INT NULL,
+        `colint` INT NULL
     ) ENGINE=OLAP
     DUPLICATE KEY(`id`)
     COMMENT 'OLAP'
     PARTITION BY RANGE(`id`)
     (PARTITION p1 VALUES [("-2147483648"), ("10000")),
     PARTITION p2 VALUES [("10000"), ("20000")))
-    DISTRIBUTED BY HASH(`id`) BUCKETS 3
+        DISTRIBUTED BY HASH(`id`) BUCKETS 3
     PROPERTIES (
-    "replication_allocation" = "tag.location.default: 1"
+        "replication_allocation" = "tag.location.default: 1"
     );
     """

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org
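
For a quick hands-on check of the behavior this commit changes, the statements below condense what the new regression test above exercises. This is a minimal sketch against the test's own `partition_test` table from the diff; `use.auto.analyzer` is the property introduced by this commit, and the "new partition" flag referred to in the comments is the value the test reads as partition_result[0][6] from SHOW TABLE STATS.

    -- Loading a partition for the first time marks the table, so the next analyze covers all columns.
    INSERT INTO partition_test VALUES (201, '1', '1');
    SHOW TABLE STATS partition_test;

    -- Analyzing a single column leaves the flag set; a full-table analyze clears it.
    ANALYZE TABLE partition_test(id) WITH SYNC;
    ANALYZE TABLE partition_test WITH SYNC;

    -- The new property routes a manual ANALYZE through the auto analyzer.
    ANALYZE TABLE partition_test PROPERTIES("use.auto.analyzer"="true");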