yiguolei commented on code in PR #35383:
URL: https://github.com/apache/doris/pull/35383#discussion_r1613692288


##########
be/src/olap/parallel_scanner_builder.cpp:
##########
@@ -32,13 +32,120 @@ template <typename ParentType>
 Status 
ParallelScannerBuilder<ParentType>::build_scanners(std::list<VScannerSPtr>& 
scanners) {
     RETURN_IF_ERROR(_load());
     if (_is_dup_mow_key) {
+        if (_split_by_segment_size) {
+            return _build_scanners_by_segment_size(scanners);
+        }
         return _build_scanners_by_rowid(scanners);
     } else {
         // TODO: support to split by key range
         return Status::NotSupported("split by key range not supported yet.");
     }
 }
 
+template <typename ParentType>
+Status ParallelScannerBuilder<ParentType>::_build_scanners_by_segment_size(
+        std::list<VScannerSPtr>& scanners) {
+    DCHECK_GE(_bytes_per_scanner, 0);
+
+    for (auto&& [tablet, version] : _tablets) {
+        DCHECK(_all_rowsets.contains(tablet->tablet_id()));
+        auto& rowsets = _all_rowsets[tablet->tablet_id()];
+
+        TabletReader::ReadSource reade_source_with_delete_info;
+
+        if (config::is_cloud_mode()) {
+            // FIXME(plat1ko): Avoid pointer cast
+            
ExecEnv::GetInstance()->storage_engine().to_cloud().tablet_hotspot().count(*tablet);
+        }
+
+        if (!_state->skip_delete_predicate()) {
+            RETURN_IF_ERROR(tablet->capture_rs_readers(
+                    {0, version}, &reade_source_with_delete_info.rs_splits, 
false));
+            reade_source_with_delete_info.fill_delete_predicates();
+        }
+
+        TabletReader::ReadSource read_source;
+
+        size_t bytes_collected = 0;
+        for (auto& rowset : rowsets) {
+            auto beta_rowset = std::dynamic_pointer_cast<BetaRowset>(rowset);
+            RowsetReaderSharedPtr reader;
+            RETURN_IF_ERROR(beta_rowset->create_reader(&reader));
+            const auto rowset_id = beta_rowset->rowset_id();
+
+            if (beta_rowset->num_rows() == 0) {
+                continue;
+            }
+
+            DCHECK(_segments_size.contains(rowset_id));
+            auto& segments_size = _segments_size[rowset_id];
+
+            int segment_start = 0;
+            auto split = RowSetSplits(reader->clone());
+
+            for (size_t i = 0; i != segments_size.size(); ++i) {
+                const auto segment_size = segments_size[i];
+                RowRanges row_ranges;
+
+                row_ranges.add({0, std::numeric_limits<int64_t>::max()});
+                split.segment_row_ranges.emplace_back(std::move(row_ranges));
+

Review Comment:
   If segment1 = 31MB and segment2 = 31MB, then the collected size is 62MB, which 
may be too large.
   
   
   I think maybe we should sort the segments by size, and then we could use some 
algorithm to distribute the splits more evenly between scanners.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to