This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch opt_perf
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/opt_perf by this push:
     new 3d2a73c028 [improvement](scan) merge scan keys based on the number of 
scanners (#12884)
3d2a73c028 is described below

commit 3d2a73c028802bfcdeeba0ff5851cfded6d548e4
Author: Jerry Hu <mrh...@gmail.com>
AuthorDate: Thu Sep 22 20:10:42 2022 +0800

    [improvement](scan) merge scan keys based on the number of scanners (#12884)
---
 be/src/exec/olap_common.cpp                 | 113 +++++++++++++++++++++++++++
 be/src/exec/olap_common.h                   | 116 ++++++++++++++++++++++++----
 be/src/runtime/datetime_value.h             |  21 +++++
 be/src/vec/exec/scan/new_olap_scan_node.cpp |  22 ++++--
 be/src/vec/runtime/vdatetime_value.h        |  11 +++
 5 files changed, 262 insertions(+), 21 deletions(-)

diff --git a/be/src/exec/olap_common.cpp b/be/src/exec/olap_common.cpp
index 8069c47a17..087a62928c 100644
--- a/be/src/exec/olap_common.cpp
+++ b/be/src/exec/olap_common.cpp
@@ -59,6 +59,42 @@ void 
ColumnValueRange<PrimitiveType::TYPE_LARGEINT>::convert_to_fixed_value() {
     return;
 }
 
+template <>
+std::vector<ColumnValueRange<PrimitiveType::TYPE_STRING>>
+ColumnValueRange<PrimitiveType::TYPE_STRING>::split(size_t count) {
+    __builtin_unreachable(); // split() must never be called for TYPE_STRING ranges
+}
+
+template <>
+std::vector<ColumnValueRange<PrimitiveType::TYPE_CHAR>>
+ColumnValueRange<PrimitiveType::TYPE_CHAR>::split(size_t count) {
+    __builtin_unreachable(); // split() must never be called for TYPE_CHAR ranges
+}
+
+template <>
+std::vector<ColumnValueRange<PrimitiveType::TYPE_VARCHAR>>
+ColumnValueRange<PrimitiveType::TYPE_VARCHAR>::split(size_t count) {
+    __builtin_unreachable(); // split() must never be called for TYPE_VARCHAR ranges
+}
+
+template <>
+std::vector<ColumnValueRange<PrimitiveType::TYPE_HLL>>
+ColumnValueRange<PrimitiveType::TYPE_HLL>::split(size_t count) {
+    __builtin_unreachable(); // split() must never be called for TYPE_HLL ranges
+}
+
+template <>
+std::vector<ColumnValueRange<PrimitiveType::TYPE_DECIMALV2>>
+ColumnValueRange<PrimitiveType::TYPE_DECIMALV2>::split(size_t count) {
+    __builtin_unreachable(); // split() must never be called for TYPE_DECIMALV2 ranges
+}
+
+template <>
+std::vector<ColumnValueRange<PrimitiveType::TYPE_LARGEINT>>
+ColumnValueRange<PrimitiveType::TYPE_LARGEINT>::split(size_t count) {
+    __builtin_unreachable(); // split() must never be called for TYPE_LARGEINT ranges
+}
+
 Status 
OlapScanKeys::get_key_range(std::vector<std::unique_ptr<OlapScanRange>>* 
key_range) {
     key_range->clear();
 
@@ -74,6 +110,83 @@ Status 
OlapScanKeys::get_key_range(std::vector<std::unique_ptr<OlapScanRange>>*
     return Status::OK();
 }
 
+Status OlapScanKeys::extend_scan_splitted_keys(std::vector<ColumnValueRangeType>& ranges) {
+    // Appends one key column built from `ranges` (the pieces of one split column range).
+    // Each existing (begin, end) key pair is copied once per piece; the piece's min value
+    // is appended to the begin key and its max value to the end key. When no keys exist
+    // yet, every piece produces a single one-column pair. Always returns Status::OK().
+    DCHECK(!_has_range_value);
+
+    std::vector<OlapTuple> new_begin_keys;
+    std::vector<OlapTuple> new_end_keys;
+    for (auto& range_variant : ranges) {
+        std::visit(
+                [&](auto&& range) {
+                    using RangeType = std::decay_t<decltype(range)>;
+                    using CppType = typename RangeType::CppType;
+                    const std::string min_value = cast_to_string<RangeType::Type, CppType>(
+                            range.get_range_min_value(), range.scale());
+                    const std::string max_value = cast_to_string<RangeType::Type, CppType>(
+                            range.get_range_max_value(), range.scale());
+
+                    // Work on copies: every piece extends the same original prefix keys.
+                    auto begin_keys = _begin_scan_keys;
+                    auto end_keys = _end_scan_keys;
+                    if (begin_keys.empty()) {
+                        // No prefix yet: seed one empty tuple so the loops below fill it.
+                        begin_keys.emplace_back();
+                        end_keys.emplace_back();
+                    }
+                    // Range-for keeps these loops free of an index that could shadow an
+                    // outer one or be compared, as a signed int, against size().
+                    for (auto& key : begin_keys) {
+                        key.add_value(min_value, range.contain_null());
+                    }
+                    for (auto& key : end_keys) {
+                        key.add_value(max_value);
+                    }
+                    new_begin_keys.insert(new_begin_keys.end(), begin_keys.begin(),
+                                          begin_keys.end());
+                    new_end_keys.insert(new_end_keys.end(), end_keys.begin(), end_keys.end());
+                },
+                range_variant);
+    }
+    _begin_scan_keys.swap(new_begin_keys);
+    _end_scan_keys.swap(new_end_keys);
+    return Status::OK();
+}
+
+OlapScanKeys OlapScanKeys::merge(size_t to_ranges_count) {
+    // Returns a coarser copy of these scan keys: every recorded column range except the
+    // one with the most fixed values is re-applied as-is, and that one is split() so the
+    // result holds roughly `to_ranges_count` keys. NOTE(review): the split column is
+    // appended last whatever its original position -- confirm that ordering is intended.
+    DCHECK(_index_of_max_size_range < _column_ranges.size());
+    OlapScanKeys merged;
+    merged.set_is_convertible(_is_convertible);
+    merged.set_max_scan_key_num(_max_scan_key_num);
+    bool exact_value = false;
+    for (size_t i = 0; i != _column_ranges.size(); ++i) {
+        if (i == _index_of_max_size_range) {
+            continue;
+        }
+        Status st = std::visit(
+                [&](auto&& range) { return merged.extend_scan_key(range, &exact_value); },
+                _column_ranges[i]);
+        DCHECK(st.ok()) << "failed to re-extend scan key for column range " << i;
+    }
+    size_t size_of_ranges = std::max(size_t(1), merged.size());
+    size_t split_to_count = (to_ranges_count + size_of_ranges - 1) / size_of_ranges;
+    std::vector<ColumnValueRangeType> splitted = std::visit(
+            [&](auto&& range) {
+                auto pieces = range.split(split_to_count);
+                return std::vector<ColumnValueRangeType>(pieces.cbegin(), pieces.cend());
+            },
+            _column_ranges[_index_of_max_size_range]);
+    merged.extend_scan_splitted_keys(splitted); // currently always returns Status::OK()
+    return merged;
+}
+
 } // namespace doris
 
 /* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
diff --git a/be/src/exec/olap_common.h b/be/src/exec/olap_common.h
index 2f37b05633..87cac1e4ac 100644
--- a/be/src/exec/olap_common.h
+++ b/be/src/exec/olap_common.h
@@ -72,6 +72,7 @@ std::string cast_to_string(T value, int scale) {
 template <PrimitiveType primitive_type>
 class ColumnValueRange {
 public:
+    constexpr static auto Type = primitive_type;
     using CppType = typename PrimitiveTypeTraits<primitive_type>::CppType;
     using IteratorType = typename std::set<CppType>::iterator;
 
@@ -109,6 +110,9 @@ public:
 
     void convert_to_fixed_value();
 
+    // Split this fixed-value range into up to `count` contiguous sub-ranges.
+    std::vector<ColumnValueRange> split(size_t count);
+
     void convert_to_range_value();
 
     bool has_intersection(ColumnValueRange<primitive_type>& range);
@@ -302,22 +306,45 @@ private:
     int _scale;
 };
 
+using ColumnValueRangeType =
+        std::variant<ColumnValueRange<TYPE_TINYINT>, 
ColumnValueRange<TYPE_SMALLINT>,
+                     ColumnValueRange<TYPE_INT>, ColumnValueRange<TYPE_BIGINT>,
+                     ColumnValueRange<TYPE_LARGEINT>, 
ColumnValueRange<TYPE_CHAR>,
+                     ColumnValueRange<TYPE_VARCHAR>, 
ColumnValueRange<TYPE_STRING>,
+                     ColumnValueRange<TYPE_DATE>, 
ColumnValueRange<TYPE_DATEV2>,
+                     ColumnValueRange<TYPE_DATETIME>, 
ColumnValueRange<TYPE_DATETIMEV2>,
+                     ColumnValueRange<TYPE_DECIMALV2>, 
ColumnValueRange<TYPE_BOOLEAN>,
+                     ColumnValueRange<TYPE_HLL>, 
ColumnValueRange<TYPE_DECIMAL32>,
+                     ColumnValueRange<TYPE_DECIMAL64>, 
ColumnValueRange<TYPE_DECIMAL128>>;
 class OlapScanKeys {
 public:
     OlapScanKeys()
-            : _has_range_value(false),
+            : _index_of_max_size_range(0),
+              _max_size_of_range(0),
+              _max_scan_key_num(1024),
+              _has_range_value(false),
               _begin_include(true),
               _end_include(true),
-              _is_convertible(true) {}
+              _is_convertible(true),
+              _is_mergeable(false) {}
 
     template <PrimitiveType primitive_type>
     Status extend_scan_key(ColumnValueRange<primitive_type>& range, int32_t 
max_scan_key_num,
                            bool* exact_value);
 
+    template <PrimitiveType primitive_type>
+    Status extend_scan_key(ColumnValueRange<primitive_type>& range, bool* 
exact_value) {
+        return extend_scan_key(range, _max_scan_key_num, exact_value); // uses the stored limit
+    }
+
+    Status extend_scan_splitted_keys(std::vector<ColumnValueRangeType>& 
ranges);
+
     Status get_key_range(std::vector<std::unique_ptr<OlapScanRange>>* 
key_range);
 
     bool has_range_value() const { return _has_range_value; }
 
+    void set_max_scan_key_num(int32_t val) { _max_scan_key_num = val; } // limit for extend_scan_key
+
     void clear() {
         _has_range_value = false;
         _begin_scan_keys.clear();
@@ -351,6 +378,10 @@ public:
 
     void set_is_convertible(bool is_convertible) { _is_convertible = 
is_convertible; }
 
+    bool is_mergeable() const { return _is_mergeable; } // set by extend_scan_key on conversion
+
+    OlapScanKeys merge(size_t to_ranges_count);
+
     // now, only use in UT
     static std::string to_print_key(const OlapTuple& scan_keys) {
         std::stringstream sstream;
@@ -361,22 +392,19 @@ public:
 private:
     std::vector<OlapTuple> _begin_scan_keys;
     std::vector<OlapTuple> _end_scan_keys;
+    std::vector<ColumnValueRangeType> _column_ranges;
+    size_t _index_of_max_size_range;
+    size_t _max_size_of_range;
+    int32_t _max_scan_key_num;
     bool _has_range_value;
     bool _begin_include;
     bool _end_include;
     bool _is_convertible;
-};
 
-using ColumnValueRangeType =
-        std::variant<ColumnValueRange<TYPE_TINYINT>, 
ColumnValueRange<TYPE_SMALLINT>,
-                     ColumnValueRange<TYPE_INT>, ColumnValueRange<TYPE_BIGINT>,
-                     ColumnValueRange<TYPE_LARGEINT>, 
ColumnValueRange<TYPE_CHAR>,
-                     ColumnValueRange<TYPE_VARCHAR>, 
ColumnValueRange<TYPE_STRING>,
-                     ColumnValueRange<TYPE_DATE>, 
ColumnValueRange<TYPE_DATEV2>,
-                     ColumnValueRange<TYPE_DATETIME>, 
ColumnValueRange<TYPE_DATETIMEV2>,
-                     ColumnValueRange<TYPE_DECIMALV2>, 
ColumnValueRange<TYPE_BOOLEAN>,
-                     ColumnValueRange<TYPE_HLL>, 
ColumnValueRange<TYPE_DECIMAL32>,
-                     ColumnValueRange<TYPE_DECIMAL64>, 
ColumnValueRange<TYPE_DECIMAL128>>;
+    // If the tablet is too small, there is no need for so many scan keys,
+    // so the scan keys should be merged.
+    bool _is_mergeable;
+};
 
 template <PrimitiveType primitive_type>
 const typename ColumnValueRange<primitive_type>::CppType
@@ -542,6 +570,61 @@ void 
ColumnValueRange<primitive_type>::convert_to_fixed_value() {
     }
 }
 
+template <>
+[[noreturn]] std::vector<ColumnValueRange<PrimitiveType::TYPE_STRING>>
+ColumnValueRange<PrimitiveType::TYPE_STRING>::split(size_t count); // defined in olap_common.cpp
+
+template <>
+[[noreturn]] std::vector<ColumnValueRange<PrimitiveType::TYPE_CHAR>>
+ColumnValueRange<PrimitiveType::TYPE_CHAR>::split(size_t count); // defined in olap_common.cpp
+
+template <>
+[[noreturn]] std::vector<ColumnValueRange<PrimitiveType::TYPE_VARCHAR>>
+ColumnValueRange<PrimitiveType::TYPE_VARCHAR>::split(size_t count); // defined in olap_common.cpp
+
+template <>
+[[noreturn]] std::vector<ColumnValueRange<PrimitiveType::TYPE_HLL>>
+ColumnValueRange<PrimitiveType::TYPE_HLL>::split(size_t count); // defined in olap_common.cpp
+
+template <>
+[[noreturn]] std::vector<ColumnValueRange<PrimitiveType::TYPE_DECIMALV2>>
+ColumnValueRange<PrimitiveType::TYPE_DECIMALV2>::split(size_t count); // defined in olap_common.cpp
+
+template <>
+[[noreturn]] std::vector<ColumnValueRange<PrimitiveType::TYPE_LARGEINT>>
+ColumnValueRange<PrimitiveType::TYPE_LARGEINT>::split(size_t count); // defined in olap_common.cpp
+
+template <PrimitiveType primitive_type>
+std::vector<ColumnValueRange<primitive_type>> ColumnValueRange<primitive_type>::split(
+        size_t count) {
+    // Covers [smallest, largest] fixed value with at most `count` contiguous sub-ranges.
+    if (!is_fixed_value_range()) {
+        LOG(FATAL) << "should be converted to fixed value range";
+    }
+
+    // Incrementing boolean is denied in C++17, So we use int as bool type
+    using type = std::conditional_t<std::is_same<bool, CppType>::value, int, CppType>;
+    type low_value = *_fixed_values.begin();
+    type high_value = *_fixed_values.rbegin();
+
+    count = count > 0 ? count : 1; // a zero count would divide by zero below
+    int64_t step = int64_t((high_value - low_value + count - 1) / count);
+    std::vector<ColumnValueRange> splitted;
+    type v = low_value;
+    for (bool last = false; !last; --count) {
+        ColumnValueRange<primitive_type> range(_column_name, _precision, _scale);
+        range.set_contain_null(_contain_null);
+        range.add_range(FILTER_LARGER_OR_EQUAL, v);
+        // Cut again only while the next boundary stays strictly below high_value, so
+        // rounding `step` up can never push a piece past the end of the span.
+        last = count == 1 || step <= 0 || int64_t(high_value - v) <= step ||
+               !((v += step) < high_value);
+        range.add_range(last ? FILTER_LESS_OR_EQUAL : FILTER_LESS, last ? high_value : v);
+        splitted.emplace_back(range);
+    }
+    return splitted;
+}
+
 template <PrimitiveType primitive_type>
 void ColumnValueRange<primitive_type>::convert_to_range_value() {
     if (!is_range_value_convertible()) {
@@ -866,6 +949,12 @@ Status 
OlapScanKeys::extend_scan_key(ColumnValueRange<primitive_type>& range,
         if (range.is_fixed_value_convertible() && _is_convertible) {
             if (range.get_convertible_fixed_value_size() < max_scan_key_num / 
scan_keys_size) {
                 range.convert_to_fixed_value();
+                size_t size = range.get_fixed_value_size();
+                if (size > _max_size_of_range) {
+                    _max_size_of_range = size;
+                    _index_of_max_size_range = _column_ranges.size();
+                }
+                _is_mergeable = true;
             }
         }
     }
@@ -960,6 +1049,7 @@ Status 
OlapScanKeys::extend_scan_key(ColumnValueRange<primitive_type>& range,
         _begin_include = range.is_begin_include();
         _end_include = range.is_end_include();
     }
+    _column_ranges.emplace_back(range);
 
     return Status::OK();
 }
diff --git a/be/src/runtime/datetime_value.h b/be/src/runtime/datetime_value.h
index b34f0d6dd6..e8389b8d8c 100644
--- a/be/src/runtime/datetime_value.h
+++ b/be/src/runtime/datetime_value.h
@@ -516,6 +516,27 @@ public:
         return *this;
     }
 
+    DateTimeValue& operator+=(int64_t val) {
+        // Advances this value in place by `val` units and returns *this. The unit follows
+        // _type: days for TIME_DATE, seconds for TIME_DATETIME and TIME_TIME. Any other
+        // _type leaves the value untouched. The result of date_add_interval() is not
+        // inspected here.
+        switch (_type) {
+        case TIME_DATE: {
+            TimeInterval interval(DAY, val, false);
+            date_add_interval(interval, DAY);
+            break;
+        }
+        case TIME_DATETIME:
+        case TIME_TIME: {
+            TimeInterval interval(SECOND, val, false);
+            date_add_interval(interval, SECOND);
+            break;
+        }
+        }
+        return *this;
+    }
+
     void to_datetime_val(doris_udf::DateTimeVal* tv) const {
         tv->packed_time = to_int64_datetime_packed();
         tv->type = _type;
diff --git a/be/src/vec/exec/scan/new_olap_scan_node.cpp 
b/be/src/vec/exec/scan/new_olap_scan_node.cpp
index 8242abef77..14f9bc94e1 100644
--- a/be/src/vec/exec/scan/new_olap_scan_node.cpp
+++ b/be/src/vec/exec/scan/new_olap_scan_node.cpp
@@ -149,6 +149,7 @@ Status NewOlapScanNode::_build_key_ranges_and_filters() {
 
     // 1. construct scan key except last olap engine short key
     _scan_keys.set_is_convertible(limit() == -1);
+    _scan_keys.set_max_scan_key_num(_max_scan_key_num);
 
     // we use `exact_range` to identify a key range is an exact range or not 
when we convert
     // it to `_scan_keys`. If `exact_range` is true, we can just discard it 
from `_olap_filters`.
@@ -162,8 +163,7 @@ Status NewOlapScanNode::_build_key_ranges_and_filters() {
 
         RETURN_IF_ERROR(std::visit(
                 [&](auto&& range) {
-                    RETURN_IF_ERROR(
-                            _scan_keys.extend_scan_key(range, 
_max_scan_key_num, &exact_range));
+                    RETURN_IF_ERROR(_scan_keys.extend_scan_key(range, 
&exact_range));
                     if (exact_range) {
                         _colname_to_value_range.erase(iter->first);
                     }
@@ -265,11 +265,6 @@ Status 
NewOlapScanNode::_init_scanners(std::list<VScanner*>* scanners) {
 
     // ranges constructed from scan keys
     std::vector<std::unique_ptr<doris::OlapScanRange>> cond_ranges;
-    RETURN_IF_ERROR(_scan_keys.get_key_range(&cond_ranges));
-    // if we can't get ranges from conditions, we give it a total range
-    if (cond_ranges.empty()) {
-        cond_ranges.emplace_back(new doris::OlapScanRange());
-    }
     int scanners_per_tablet = std::max(1, 64 / (int)_scan_ranges.size());
 
     std::unordered_set<std::string> disk_set;
@@ -285,7 +280,6 @@ Status 
NewOlapScanNode::_init_scanners(std::list<VScanner*>* scanners) {
             return Status::InternalError(ss.str());
         }
 
-        std::vector<std::unique_ptr<doris::OlapScanRange>>* ranges = 
&cond_ranges;
         int size_based_scanners_per_tablet = 1;
 
         if (config::doris_scan_range_max_mb > 0) {
@@ -293,6 +287,18 @@ Status 
NewOlapScanNode::_init_scanners(std::list<VScanner*>* scanners) {
                     1, (int)(tablet->tablet_footprint() / 
(config::doris_scan_range_max_mb << 20)));
         }
 
+        if (_scan_keys.size() > size_based_scanners_per_tablet && 
_scan_keys.is_mergeable()) {
+            auto scan_keys = _scan_keys.merge(size_based_scanners_per_tablet);
+            RETURN_IF_ERROR(scan_keys.get_key_range(&cond_ranges));
+        } else {
+            RETURN_IF_ERROR(_scan_keys.get_key_range(&cond_ranges));
+        }
+        // if we can't get ranges from conditions, we give it a total range
+        if (cond_ranges.empty()) {
+            cond_ranges.emplace_back(new doris::OlapScanRange());
+        }
+        std::vector<std::unique_ptr<doris::OlapScanRange>>* ranges = 
&cond_ranges;
+
         int ranges_per_scanner =
                 std::max(1, (int)ranges->size() /
                                     std::min(scanners_per_tablet, 
size_based_scanners_per_tablet));
diff --git a/be/src/vec/runtime/vdatetime_value.h 
b/be/src/vec/runtime/vdatetime_value.h
index 42ce3a173c..17c391558d 100644
--- a/be/src/vec/runtime/vdatetime_value.h
+++ b/be/src/vec/runtime/vdatetime_value.h
@@ -1026,6 +1026,17 @@ public:
         return *this;
     }
 
+    DateV2Value<T>& operator+=(int64_t val) {
+        // Advances this value in place by `val` units and returns *this.
+        // The unit is fixed at compile time: seconds when is_datetime is true,
+        // days otherwise.
+        constexpr auto unit = is_datetime ? SECOND : DAY;
+        TimeInterval interval(unit, val, false);
+        // The result of date_add_interval() is not inspected.
+        date_add_interval<unit>(interval);
+        return *this;
+    }
+
     uint32_t hash(int seed) const { return HashUtil::hash(this, sizeof(*this), 
seed); }
 
     int day_of_year() const { return daynr() - calc_daynr(this->year(), 1, 1) 
+ 1; }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to