yiguolei commented on code in PR #35383: URL: https://github.com/apache/doris/pull/35383#discussion_r1613692288
########## be/src/olap/parallel_scanner_builder.cpp: ########## @@ -32,13 +32,120 @@ template <typename ParentType> Status ParallelScannerBuilder<ParentType>::build_scanners(std::list<VScannerSPtr>& scanners) { RETURN_IF_ERROR(_load()); if (_is_dup_mow_key) { + if (_split_by_segment_size) { + return _build_scanners_by_segment_size(scanners); + } return _build_scanners_by_rowid(scanners); } else { // TODO: support to split by key range return Status::NotSupported("split by key range not supported yet."); } } +template <typename ParentType> +Status ParallelScannerBuilder<ParentType>::_build_scanners_by_segment_size( + std::list<VScannerSPtr>& scanners) { + DCHECK_GE(_bytes_per_scanner, 0); + + for (auto&& [tablet, version] : _tablets) { + DCHECK(_all_rowsets.contains(tablet->tablet_id())); + auto& rowsets = _all_rowsets[tablet->tablet_id()]; + + TabletReader::ReadSource reade_source_with_delete_info; + + if (config::is_cloud_mode()) { + // FIXME(plat1ko): Avoid pointer cast + ExecEnv::GetInstance()->storage_engine().to_cloud().tablet_hotspot().count(*tablet); + } + + if (!_state->skip_delete_predicate()) { + RETURN_IF_ERROR(tablet->capture_rs_readers( + {0, version}, &reade_source_with_delete_info.rs_splits, false)); + reade_source_with_delete_info.fill_delete_predicates(); + } + + TabletReader::ReadSource read_source; + + size_t bytes_collected = 0; + for (auto& rowset : rowsets) { + auto beta_rowset = std::dynamic_pointer_cast<BetaRowset>(rowset); + RowsetReaderSharedPtr reader; + RETURN_IF_ERROR(beta_rowset->create_reader(&reader)); + const auto rowset_id = beta_rowset->rowset_id(); + + if (beta_rowset->num_rows() == 0) { + continue; + } + + DCHECK(_segments_size.contains(rowset_id)); + auto& segments_size = _segments_size[rowset_id]; + + int segment_start = 0; + auto split = RowSetSplits(reader->clone()); + + for (size_t i = 0; i != segments_size.size(); ++i) { + const auto segment_size = segments_size[i]; + RowRanges row_ranges; + + row_ranges.add({0, 
std::numeric_limits<int64_t>::max()}); + split.segment_row_ranges.emplace_back(std::move(row_ranges)); + Review Comment: If segment1 = 31MB and segment2 = 31MB, then the collected size is 62MB, which may be too large. I think maybe we should sort the segments by size and then use some algorithm to make the splits more evenly distributed between scanners. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org