Jibing-Li commented on code in PR #48757: URL: https://github.com/apache/doris/pull/48757#discussion_r1984754028
########## fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java: ########## @@ -372,12 +396,64 @@ protected String getIndex() { } } - protected long pickSamplePartition(List<Partition> partitions, List<Long> pickedTabletIds) { - long averageRowsPerPartition = tbl.getRowCount() / partitions.size(); + // For partition tables with single time type partition column, we'd better to skip sampling the partition + // that contains all the history data. Because this partition may contain many old data which is not + // visited by most queries. To sample this partition may cause the statistics not accurate. + // For example, one table has 366 partitions, partition 1 ~ 365 store date for each day of the year from now. + // Partition 0 stores all the history data earlier than 1 year. We want to skip sampling partition 0. + protected long getSkipPartitionId(List<Partition> partitions) { + if (partitions == null || partitions.size() < StatisticsUtil.getPartitionSampleCount()) { + return NO_SKIP_TABLET_ID; + } + PartitionInfo partitionInfo = ((OlapTable) tbl).getPartitionInfo(); + if (!PartitionType.RANGE.equals(partitionInfo.getType())) { + return NO_SKIP_TABLET_ID; + } + if (partitionInfo.getPartitionColumns().size() != 1) { + return NO_SKIP_TABLET_ID; + } + Column column = partitionInfo.getPartitionColumns().get(0); + if (!column.getType().isDateType()) { + return NO_SKIP_TABLET_ID; + } + PartitionKey lowestKey = PartitionKey.createMaxPartitionKey(); + long lowestPartitionId = -1; + for (Partition p : partitions) { + RangePartitionItem item = (RangePartitionItem) partitionInfo.getItem(p.getId()); + Range<PartitionKey> items = item.getItems(); + if (!items.hasLowerBound()) { + lowestPartitionId = p.getId(); + break; + } + if (items.lowerEndpoint().compareTo(lowestKey) < 0) { + lowestKey = items.lowerEndpoint(); + lowestPartitionId = p.getId(); + } + } + return lowestPartitionId; + } + + protected long pickSamplePartition(List<Partition> partitions, List<Long> pickedTabletIds, long skipPartition) { + Partition partition = ((OlapTable) tbl).getPartition(skipPartition); + long averageRowsPerPartition; + if (partition != null) { + LOG.info("Going to skip partition {} in table {}", skipPartition, tbl.getName()); + // If we want to skip the oldest partition, calculate the average rows per partition value without + // the oldest partition, otherwise if the oldest partition is very large, we may skip all partitions. + // Because we only pick partitions partitionRowCount >= averageRowsPerPartition. + Preconditions.checkState(partitions.size() > 1); Review Comment: In getSkipPartitionId function. We have minimum number of partitions threshold to skip the oldest partition. Or the result may not good enough. For example, if there are only 2 partitions and we skip one of them. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org