This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new f4c5ce260b4 [fix](statistics)Fix rowCount==0 while analyzing bug (#28969) f4c5ce260b4 is described below commit f4c5ce260b4ac4b3974bcd8ef7dcd059ecfdd78b Author: Jibing-Li <64681310+jibing...@users.noreply.github.com> AuthorDate: Wed Dec 27 23:04:37 2023 +0800 [fix](statistics)Fix rowCount==0 while analyzing bug (#28969) Sample analysis needs to get the row count by using table.getRowCount(). This value is not updated in real time, which may cause the sample task to scan the whole table. This PR fixes that: set a flag that indicates the analyze job is for an empty table and skip scanning the table. Meanwhile, don't reset updatedRows in this case. Set hugeTableAutoAnalyzeIntervalInMillis = 0 because the default huge table size thresholds have all been set to 0. --- docs/en/docs/query-acceleration/statistics.md | 4 ++-- docs/zh-CN/docs/query-acceleration/statistics.md | 4 ++-- .../src/main/java/org/apache/doris/qe/SessionVariable.java | 2 +- .../main/java/org/apache/doris/statistics/AnalysisInfo.java | 7 ++++++- .../java/org/apache/doris/statistics/AnalysisInfoBuilder.java | 10 ++++++++-- .../main/java/org/apache/doris/statistics/AnalysisManager.java | 1 + .../java/org/apache/doris/statistics/OlapAnalysisTask.java | 3 ++- .../java/org/apache/doris/statistics/StatisticConstants.java | 2 +- .../org/apache/doris/statistics/StatisticsAutoCollector.java | 1 + .../main/java/org/apache/doris/statistics/TableStatsMeta.java | 2 +- .../apache/doris/statistics/StatisticsAutoCollectorTest.java | 2 +- regression-test/suites/statistics/analyze_stats.groovy | 2 +- 12 files changed, 27 insertions(+), 13 deletions(-) diff --git a/docs/en/docs/query-acceleration/statistics.md b/docs/en/docs/query-acceleration/statistics.md index c7a58277580..4cb0891172d 100644 --- a/docs/en/docs/query-acceleration/statistics.md +++ b/docs/en/docs/query-acceleration/statistics.md @@ -295,8 +295,8 @@ mysql> KILL ANALYZE 52357; |auto_analyze_end_time|End
time for automatic statistics collection|23:59:59| |enable_auto_analyze|Enable automatic collection functionality|true| |huge_table_default_sample_rows|Sampling rows for large tables|4194304| -|huge_table_lower_bound_size_in_bytes|Tables with size greater than this value will be automatically sampled during collection of statistics|5368709120| -|huge_table_auto_analyze_interval_in_millis|Controls the minimum time interval for automatic ANALYZE on large tables. Tables with sizes greater than `huge_table_lower_bound_size_in_bytes * 5` will be ANALYZEed only once within this time interval.|43200000| +|huge_table_lower_bound_size_in_bytes|Tables with size greater than this value will be automatically sampled during collection of statistics|0| +|huge_table_auto_analyze_interval_in_millis|Controls the minimum time interval for automatic ANALYZE on large tables. Tables with sizes greater than `huge_table_lower_bound_size_in_bytes * 5` will be ANALYZEed only once within this time interval.|0| |table_stats_health_threshold|Ranges from 0 to 100. If data updates since the last statistics collection exceed `(100 - table_stats_health_threshold)%`, the table's statistics are considered outdated.|60| |analyze_timeout|Controls the timeout for synchronous ANALYZE in seconds|43200| |auto_analyze_table_width_threshold|Controls the maximum width of table that will be auto analyzed. 
Table with more columns than this value will not be auto analyzed.|70| diff --git a/docs/zh-CN/docs/query-acceleration/statistics.md b/docs/zh-CN/docs/query-acceleration/statistics.md index 20b535e357b..bff100fa98a 100644 --- a/docs/zh-CN/docs/query-acceleration/statistics.md +++ b/docs/zh-CN/docs/query-acceleration/statistics.md @@ -299,8 +299,8 @@ mysql> KILL ANALYZE 52357; |auto_analyze_end_time|自动统计信息收集结束时间|23:59:59| |enable_auto_analyze|开启自动收集功能|true| |huge_table_default_sample_rows|对大表的采样行数|4194304| -|huge_table_lower_bound_size_in_bytes|大小超过该值的的表,在自动收集时将会自动通过采样收集统计信息|5368709120| -|huge_table_auto_analyze_interval_in_millis|控制对大表的自动ANALYZE的最小时间间隔,在该时间间隔内大小超过huge_table_lower_bound_size_in_bytes * 5的表仅ANALYZE一次|43200000| +|huge_table_lower_bound_size_in_bytes|大小超过该值的的表,在自动收集时将会自动通过采样收集统计信息|0| +|huge_table_auto_analyze_interval_in_millis|控制对大表的自动ANALYZE的最小时间间隔,在该时间间隔内大小超过huge_table_lower_bound_size_in_bytes * 5的表仅ANALYZE一次|0| |table_stats_health_threshold|取值在0-100之间,当自上次统计信息收集操作之后,数据更新量达到 (100 - table_stats_health_threshold)% ,认为该表的统计信息已过时|60| |analyze_timeout|控制ANALYZE超时时间,单位为秒|43200| |auto_analyze_table_width_threshold|控制自动统计信息收集处理的最大表宽度,列数大于该值的表不会参与自动统计信息收集|70| diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 35545059901..c1ea2f29ff2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -1450,7 +1450,7 @@ public class SessionVariable implements Serializable, Writable { "This controls the minimum time interval for automatic ANALYZE on large tables." 
+ "Within this interval," + "tables larger than huge_table_lower_bound_size_in_bytes are analyzed only once."}) - public long hugeTableAutoAnalyzeIntervalInMillis = TimeUnit.HOURS.toMillis(12); + public long hugeTableAutoAnalyzeIntervalInMillis = TimeUnit.HOURS.toMillis(0); @VariableMgr.VarAttr(name = EXTERNAL_TABLE_AUTO_ANALYZE_INTERVAL_IN_MILLIS, flag = VariableMgr.GLOBAL, description = {"控制对外表的自动ANALYZE的最小时间间隔,在该时间间隔内的外表仅ANALYZE一次", diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java index 65bb4a5dd95..aaff9e59927 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java @@ -188,6 +188,9 @@ public class AnalysisInfo implements Writable { @SerializedName("endTime") public long endTime; + + @SerializedName("emptyJob") + public final boolean emptyJob; /** * * Used to store the newest partition version of tbl when creating this job. 
@@ -202,7 +205,7 @@ public class AnalysisInfo implements Writable { long lastExecTimeInMs, long timeCostInMs, AnalysisState state, ScheduleType scheduleType, boolean isExternalTableLevelTask, boolean partitionOnly, boolean samplingPartition, boolean isAllPartition, long partitionCount, CronExpression cronExpression, boolean forceFull, - boolean usingSqlForPartitionColumn, long tblUpdateTime) { + boolean usingSqlForPartitionColumn, long tblUpdateTime, boolean emptyJob) { this.jobId = jobId; this.taskId = taskId; this.taskIds = taskIds; @@ -238,6 +241,7 @@ public class AnalysisInfo implements Writable { this.forceFull = forceFull; this.usingSqlForPartitionColumn = usingSqlForPartitionColumn; this.tblUpdateTime = tblUpdateTime; + this.emptyJob = emptyJob; } @Override @@ -279,6 +283,7 @@ public class AnalysisInfo implements Writable { } sj.add("forceFull: " + forceFull); sj.add("usingSqlForPartitionColumn: " + usingSqlForPartitionColumn); + sj.add("emptyJob: " + emptyJob); return sj.toString(); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java index 204aba6d0f8..310b7816ecd 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java @@ -61,8 +61,8 @@ public class AnalysisInfoBuilder { private CronExpression cronExpression; private boolean forceFull; private boolean usingSqlForPartitionColumn; - private long tblUpdateTime; + private boolean emptyJob; public AnalysisInfoBuilder() { } @@ -100,6 +100,7 @@ public class AnalysisInfoBuilder { forceFull = info.forceFull; usingSqlForPartitionColumn = info.usingSqlForPartitionColumn; tblUpdateTime = info.tblUpdateTime; + emptyJob = info.emptyJob; } public AnalysisInfoBuilder setJobId(long jobId) { @@ -262,12 +263,17 @@ public class AnalysisInfoBuilder { return this; } + public 
AnalysisInfoBuilder setEmptyJob(boolean emptyJob) { + this.emptyJob = emptyJob; + return this; + } + public AnalysisInfo build() { return new AnalysisInfo(jobId, taskId, taskIds, catalogId, dbId, tblId, colToPartitions, partitionNames, colName, indexId, jobType, analysisMode, analysisMethod, analysisType, samplePercent, sampleRows, maxBucketNum, periodTimeInMs, message, lastExecTimeInMs, timeCostInMs, state, scheduleType, externalTableLevelTask, partitionOnly, samplingPartition, isAllPartition, partitionCount, - cronExpression, forceFull, usingSqlForPartitionColumn, tblUpdateTime); + cronExpression, forceFull, usingSqlForPartitionColumn, tblUpdateTime, emptyJob); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java index 0bf24e0c288..39ae191d45a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java @@ -519,6 +519,7 @@ public class AnalysisManager implements Writable { infoBuilder.setColToPartitions(colToPartitions); infoBuilder.setTaskIds(Lists.newArrayList()); infoBuilder.setTblUpdateTime(table.getUpdateTime()); + infoBuilder.setEmptyJob(table instanceof OlapTable && table.getRowCount() == 0); return infoBuilder.build(); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java index e062e4eef85..81348c1f948 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java @@ -61,7 +61,8 @@ public class OlapAnalysisTask extends BaseAnalysisTask { public void doExecute() throws Exception { Set<String> partitionNames = info.colToPartitions.get(info.colName); - if (partitionNames == null || partitionNames.isEmpty()) { + if 
((info.emptyJob && info.analysisMethod.equals(AnalysisInfo.AnalysisMethod.SAMPLE)) + || partitionNames == null || partitionNames.isEmpty()) { if (partitionNames == null) { LOG.warn("Table {}.{}.{}, partitionNames for column {} is null. ColToPartitions:[{}]", info.catalogId, info.dbId, info.tblId, info.colName, info.colToPartitions); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java index 3d6d2fe52aa..857a50e234c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java @@ -88,7 +88,7 @@ public class StatisticConstants { public static final long HUGE_TABLE_DEFAULT_SAMPLE_ROWS = 4194304; public static final long HUGE_TABLE_LOWER_BOUND_SIZE_IN_BYTES = 0; - public static final long HUGE_TABLE_AUTO_ANALYZE_INTERVAL_IN_MILLIS = TimeUnit.HOURS.toMillis(12); + public static final long HUGE_TABLE_AUTO_ANALYZE_INTERVAL_IN_MILLIS = TimeUnit.HOURS.toMillis(0); public static final long EXTERNAL_TABLE_AUTO_ANALYZE_INTERVAL_IN_MILLIS = TimeUnit.HOURS.toMillis(24); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java index ee50471175d..f799da56206 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java @@ -170,6 +170,7 @@ public class StatisticsAutoCollector extends StatisticsCollector { .setLastExecTimeInMs(System.currentTimeMillis()) .setJobType(JobType.SYSTEM) .setTblUpdateTime(table.getUpdateTime()) + .setEmptyJob(table instanceof OlapTable && table.getRowCount() == 0) .build(); analysisInfos.add(jobInfo); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java index f500ab09f0b..eb6672ffe18 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java @@ -149,7 +149,7 @@ public class TableStatsMeta implements Writable { if (tableIf instanceof OlapTable) { rowCount = tableIf.getRowCount(); } - if (analyzedJob.colToPartitions.keySet() + if (!analyzedJob.emptyJob && analyzedJob.colToPartitions.keySet() .containsAll(tableIf.getBaseSchema().stream() .filter(c -> !StatisticsUtil.isUnsupportedType(c.getType())) .map(Column::getName).collect(Collectors.toSet()))) { diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatisticsAutoCollectorTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatisticsAutoCollectorTest.java index 0b4b2203d0d..87342202fb2 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatisticsAutoCollectorTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatisticsAutoCollectorTest.java @@ -299,7 +299,7 @@ public class StatisticsAutoCollectorTest { // A very huge table has been updated recently, so we should skip it this time stats.updatedTime = System.currentTimeMillis() - 1000; StatisticsAutoCollector autoCollector = new StatisticsAutoCollector(); - Assertions.assertTrue(autoCollector.skip(olapTable)); + Assertions.assertFalse(autoCollector.skip(olapTable)); // The update of this huge table is long time ago, so we shouldn't skip it this time stats.updatedTime = System.currentTimeMillis() - StatisticsUtil.getHugeTableAutoAnalyzeIntervalInMillis() - 10000; diff --git a/regression-test/suites/statistics/analyze_stats.groovy b/regression-test/suites/statistics/analyze_stats.groovy index e7e89f858fb..64967280ce9 100644 --- a/regression-test/suites/statistics/analyze_stats.groovy +++ b/regression-test/suites/statistics/analyze_stats.groovy @@ -1168,7 +1168,7 @@ PARTITION `p599` 
VALUES IN (599) sql """ INSERT INTO test_updated_rows SELECT * FROM test_updated_rows """ sql """ANALYZE TABLE test_updated_rows WITH SYNC""" def cnt2 = sql """ SHOW TABLE STATS test_updated_rows """ - assertEquals(Integer.valueOf(cnt2[0][0]), 0) + assertTrue(Integer.valueOf(cnt2[0][0]) == 0 || Integer.valueOf(cnt2[0][0]) == 8) // test analyze specific column sql """CREATE TABLE test_analyze_specific_column (col1 varchar(11451) not null, col2 int not null, col3 int not null) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org