This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch opt_perf in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/opt_perf by this push: new 3d2a73c028 [improvement](scan) merge scan keys based on the number of scanners (#12884) 3d2a73c028 is described below commit 3d2a73c028802bfcdeeba0ff5851cfded6d548e4 Author: Jerry Hu <mrh...@gmail.com> AuthorDate: Thu Sep 22 20:10:42 2022 +0800 [improvement](scan) merge scan keys based on the number of scanners (#12884) --- be/src/exec/olap_common.cpp | 113 +++++++++++++++++++++++++++ be/src/exec/olap_common.h | 116 ++++++++++++++++++++++++---- be/src/runtime/datetime_value.h | 21 +++++ be/src/vec/exec/scan/new_olap_scan_node.cpp | 22 ++++-- be/src/vec/runtime/vdatetime_value.h | 11 +++ 5 files changed, 262 insertions(+), 21 deletions(-) diff --git a/be/src/exec/olap_common.cpp b/be/src/exec/olap_common.cpp index 8069c47a17..087a62928c 100644 --- a/be/src/exec/olap_common.cpp +++ b/be/src/exec/olap_common.cpp @@ -59,6 +59,42 @@ void ColumnValueRange<PrimitiveType::TYPE_LARGEINT>::convert_to_fixed_value() { return; } +template <> +std::vector<ColumnValueRange<PrimitiveType::TYPE_STRING>> +ColumnValueRange<PrimitiveType::TYPE_STRING>::split(size_t count) { + __builtin_unreachable(); +} + +template <> +std::vector<ColumnValueRange<PrimitiveType::TYPE_CHAR>> +ColumnValueRange<PrimitiveType::TYPE_CHAR>::split(size_t count) { + __builtin_unreachable(); +} + +template <> +std::vector<ColumnValueRange<PrimitiveType::TYPE_VARCHAR>> +ColumnValueRange<PrimitiveType::TYPE_VARCHAR>::split(size_t count) { + __builtin_unreachable(); +} + +template <> +std::vector<ColumnValueRange<PrimitiveType::TYPE_HLL>> +ColumnValueRange<PrimitiveType::TYPE_HLL>::split(size_t count) { + __builtin_unreachable(); +} + +template <> +std::vector<ColumnValueRange<PrimitiveType::TYPE_DECIMALV2>> +ColumnValueRange<PrimitiveType::TYPE_DECIMALV2>::split(size_t count) { + __builtin_unreachable(); +} + +template <> +std::vector<ColumnValueRange<PrimitiveType::TYPE_LARGEINT>> +ColumnValueRange<PrimitiveType::TYPE_LARGEINT>::split(size_t count) { + __builtin_unreachable(); +} + Status OlapScanKeys::get_key_range(std::vector<std::unique_ptr<OlapScanRange>>* key_range) { key_range->clear(); @@ -74,6 +110,83 @@ Status OlapScanKeys::get_key_range(std::vector<std::unique_ptr<OlapScanRange>>* return Status::OK(); } +Status OlapScanKeys::extend_scan_splitted_keys(std::vector<ColumnValueRangeType>& ranges) { + using namespace std; + DCHECK(!_has_range_value); + + std::vector<OlapTuple> new_begin_keys; + std::vector<OlapTuple> new_end_keys; + for (size_t i = 0; i != ranges.size(); ++i) { + std::visit( + [&](auto&& range) { + using RangeType = std::decay_t<decltype(range)>; + using CppType = typename RangeType::CppType; + auto begin_keys = _begin_scan_keys; + auto end_keys = _end_scan_keys; + if (begin_keys.empty()) { + begin_keys.emplace_back(); + begin_keys.back().add_value( + cast_to_string<RangeType::Type, CppType>( + range.get_range_min_value(), range.scale()), + range.contain_null()); + end_keys.emplace_back(); + end_keys.back().add_value(cast_to_string<RangeType::Type, CppType>( + range.get_range_max_value(), range.scale())); + } else { + for (int i = 0; i < begin_keys.size(); ++i) { + begin_keys[i].add_value( + cast_to_string<RangeType::Type, CppType>( + range.get_range_min_value(), range.scale()), + range.contain_null()); + } + + for (int i = 0; i < end_keys.size(); ++i) { + end_keys[i].add_value(cast_to_string<RangeType::Type, CppType>( + range.get_range_max_value(), range.scale())); + } + } + new_begin_keys.insert(new_begin_keys.end(), begin_keys.begin(), + begin_keys.end()); + new_end_keys.insert(new_end_keys.end(), end_keys.begin(), end_keys.end()); + }, + ranges[i]); + } + _begin_scan_keys = new_begin_keys; + _end_scan_keys = new_end_keys; + return Status::OK(); +} + +OlapScanKeys OlapScanKeys::merge(size_t to_ranges_count) { + OlapScanKeys merged; + merged.set_is_convertible(_is_convertible); + merged.set_max_scan_key_num(_max_scan_key_num); + bool exact_value = false; + for (size_t i = 0; i != _column_ranges.size(); ++i) { + std::visit( + [&](auto&& range) { + if (i == _index_of_max_size_range) { + return; + } + merged.extend_scan_key(range, &exact_value); + }, + _column_ranges[i]); + } + + size_t size_of_ranges = std::max(size_t(1), merged.size()); + size_t split_to_count = (to_ranges_count + size_of_ranges - 1) / size_of_ranges; + std::vector<ColumnValueRangeType> splitted = std::visit( + [&](auto&& range) { + auto splitted = range.split(split_to_count); + std::vector<ColumnValueRangeType> splitted_variants; + splitted_variants.assign(splitted.cbegin(), splitted.cend()); + return splitted_variants; + }, + _column_ranges[_index_of_max_size_range]); + + merged.extend_scan_splitted_keys(splitted); + return merged; +} + } // namespace doris /* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ diff --git a/be/src/exec/olap_common.h b/be/src/exec/olap_common.h index 2f37b05633..87cac1e4ac 100644 --- a/be/src/exec/olap_common.h +++ b/be/src/exec/olap_common.h @@ -72,6 +72,7 @@ std::string cast_to_string(T value, int scale) { template <PrimitiveType primitive_type> class ColumnValueRange { public: + constexpr static auto Type = primitive_type; using CppType = typename PrimitiveTypeTraits<primitive_type>::CppType; using IteratorType = typename std::set<CppType>::iterator; @@ -109,6 +110,9 @@ public: void convert_to_fixed_value(); + // split this range to small ranges which are step_size. + std::vector<ColumnValueRange> split(size_t count); + void convert_to_range_value(); bool has_intersection(ColumnValueRange<primitive_type>& range); @@ -302,22 +306,45 @@ private: int _scale; }; +using ColumnValueRangeType = + std::variant<ColumnValueRange<TYPE_TINYINT>, ColumnValueRange<TYPE_SMALLINT>, + ColumnValueRange<TYPE_INT>, ColumnValueRange<TYPE_BIGINT>, + ColumnValueRange<TYPE_LARGEINT>, ColumnValueRange<TYPE_CHAR>, + ColumnValueRange<TYPE_VARCHAR>, ColumnValueRange<TYPE_STRING>, + ColumnValueRange<TYPE_DATE>, ColumnValueRange<TYPE_DATEV2>, + ColumnValueRange<TYPE_DATETIME>, ColumnValueRange<TYPE_DATETIMEV2>, + ColumnValueRange<TYPE_DECIMALV2>, ColumnValueRange<TYPE_BOOLEAN>, + ColumnValueRange<TYPE_HLL>, ColumnValueRange<TYPE_DECIMAL32>, + ColumnValueRange<TYPE_DECIMAL64>, ColumnValueRange<TYPE_DECIMAL128>>; class OlapScanKeys { public: OlapScanKeys() - : _has_range_value(false), + : _index_of_max_size_range(0), + _max_size_of_range(0), + _max_scan_key_num(1024), + _has_range_value(false), _begin_include(true), _end_include(true), - _is_convertible(true) {} + _is_convertible(true), + _is_mergeable(false) {} template <PrimitiveType primitive_type> Status extend_scan_key(ColumnValueRange<primitive_type>& range, int32_t max_scan_key_num, bool* exact_value); + template <PrimitiveType primitive_type> + Status extend_scan_key(ColumnValueRange<primitive_type>& range, bool* exact_value) { + return extend_scan_key(range, _max_scan_key_num, exact_value); + } + + Status extend_scan_splitted_keys(std::vector<ColumnValueRangeType>& ranges); + Status get_key_range(std::vector<std::unique_ptr<OlapScanRange>>* key_range); bool has_range_value() const { return _has_range_value; } + void set_max_scan_key_num(int32_t val) { _max_scan_key_num = val; } + void clear() { _has_range_value = false; _begin_scan_keys.clear(); @@ -351,6 +378,10 @@ public: void set_is_convertible(bool is_convertible) { _is_convertible = is_convertible; } + bool is_mergeable() const { return _is_mergeable; } + + OlapScanKeys merge(size_t to_ranges_count); + // now, only use in UT static std::string to_print_key(const OlapTuple& scan_keys) { std::stringstream sstream; @@ -361,22 +392,19 @@ public: private: std::vector<OlapTuple> _begin_scan_keys; std::vector<OlapTuple> _end_scan_keys; + std::vector<ColumnValueRangeType> _column_ranges; + size_t _index_of_max_size_range; + size_t _max_size_of_range; + int32_t _max_scan_key_num; bool _has_range_value; bool _begin_include; bool _end_include; bool _is_convertible; -}; -using ColumnValueRangeType = - std::variant<ColumnValueRange<TYPE_TINYINT>, ColumnValueRange<TYPE_SMALLINT>, - ColumnValueRange<TYPE_INT>, ColumnValueRange<TYPE_BIGINT>, - ColumnValueRange<TYPE_LARGEINT>, ColumnValueRange<TYPE_CHAR>, - ColumnValueRange<TYPE_VARCHAR>, ColumnValueRange<TYPE_STRING>, - ColumnValueRange<TYPE_DATE>, ColumnValueRange<TYPE_DATEV2>, - ColumnValueRange<TYPE_DATETIME>, ColumnValueRange<TYPE_DATETIMEV2>, - ColumnValueRange<TYPE_DECIMALV2>, ColumnValueRange<TYPE_BOOLEAN>, - ColumnValueRange<TYPE_HLL>, ColumnValueRange<TYPE_DECIMAL32>, - ColumnValueRange<TYPE_DECIMAL64>, ColumnValueRange<TYPE_DECIMAL128>>; + // if tablet is too same, there is no need too many scan keys, + // we should merge the scan keys + bool _is_mergeable; +}; template <PrimitiveType primitive_type> const typename ColumnValueRange<primitive_type>::CppType @@ -542,6 +570,61 @@ void ColumnValueRange<primitive_type>::convert_to_fixed_value() { } } +template <> +[[noreturn]] std::vector<ColumnValueRange<PrimitiveType::TYPE_STRING>> +ColumnValueRange<PrimitiveType::TYPE_STRING>::split(size_t count); + +template <> +[[noreturn]] std::vector<ColumnValueRange<PrimitiveType::TYPE_CHAR>> +ColumnValueRange<PrimitiveType::TYPE_CHAR>::split(size_t count); + +template <> +[[noreturn]] std::vector<ColumnValueRange<PrimitiveType::TYPE_VARCHAR>> +ColumnValueRange<PrimitiveType::TYPE_VARCHAR>::split(size_t count); + +template <> +[[noreturn]] std::vector<ColumnValueRange<PrimitiveType::TYPE_HLL>> +ColumnValueRange<PrimitiveType::TYPE_HLL>::split(size_t count); + +template <> +[[noreturn]] std::vector<ColumnValueRange<PrimitiveType::TYPE_DECIMALV2>> +ColumnValueRange<PrimitiveType::TYPE_DECIMALV2>::split(size_t count); + +template <> +[[noreturn]] std::vector<ColumnValueRange<PrimitiveType::TYPE_LARGEINT>> +ColumnValueRange<PrimitiveType::TYPE_LARGEINT>::split(size_t count); + +template <PrimitiveType primitive_type> +std::vector<ColumnValueRange<primitive_type>> ColumnValueRange<primitive_type>::split( + size_t count) { + if (!is_fixed_value_range()) { + LOG(FATAL) << "should be converted to fixed value range"; + } + + // Incrementing boolean is denied in C++17, So we use int as bool type + using type = std::conditional_t<std::is_same<bool, CppType>::value, int, CppType>; + type low_value = *_fixed_values.begin(); + type high_value = *_fixed_values.rbegin(); + + int64_t step = int64_t((high_value - low_value + count - 1) / count); + std::vector<ColumnValueRange> splitted; + + for (auto v = low_value; count > 0;) { + ColumnValueRange<primitive_type> range(_column_name, _precision, _scale); + range.set_contain_null(_contain_null); + range.add_range(FILTER_LARGER_OR_EQUAL, v); + if (count > 1) { + v += step; + range.add_range(FILTER_LESS, v); + } else { + range.add_range(FILTER_LESS_OR_EQUAL, high_value); + } + count--; + splitted.emplace_back(range); + } + return splitted; +} + template <PrimitiveType primitive_type> void ColumnValueRange<primitive_type>::convert_to_range_value() { if (!is_range_value_convertible()) { @@ -866,6 +949,12 @@ Status OlapScanKeys::extend_scan_key(ColumnValueRange<primitive_type>& range, if (range.is_fixed_value_convertible() && _is_convertible) { if (range.get_convertible_fixed_value_size() < max_scan_key_num / scan_keys_size) { range.convert_to_fixed_value(); + size_t size = range.get_fixed_value_size(); + if (size > _max_size_of_range) { + _max_size_of_range = size; + _index_of_max_size_range = _column_ranges.size(); + } + _is_mergeable = true; } } } @@ -960,6 +1049,7 @@ Status OlapScanKeys::extend_scan_key(ColumnValueRange<primitive_type>& range, _begin_include = range.is_begin_include(); _end_include = range.is_end_include(); } + _column_ranges.emplace_back(range); return Status::OK(); } diff --git a/be/src/runtime/datetime_value.h b/be/src/runtime/datetime_value.h index b34f0d6dd6..e8389b8d8c 100644 --- a/be/src/runtime/datetime_value.h +++ b/be/src/runtime/datetime_value.h @@ -516,6 +516,27 @@ public: return *this; } + DateTimeValue& operator+=(int64_t val) { + switch (_type) { + case TIME_DATE: { + TimeInterval interval(DAY, val, false); + date_add_interval(interval, DAY); + break; + } + case TIME_DATETIME: { + TimeInterval interval(SECOND, val, false); + date_add_interval(interval, SECOND); + break; + } + case TIME_TIME: { + TimeInterval interval(SECOND, val, false); + date_add_interval(interval, SECOND); + break; + } + } + return *this; + } + void to_datetime_val(doris_udf::DateTimeVal* tv) const { tv->packed_time = to_int64_datetime_packed(); tv->type = _type; diff --git a/be/src/vec/exec/scan/new_olap_scan_node.cpp b/be/src/vec/exec/scan/new_olap_scan_node.cpp index 8242abef77..14f9bc94e1 100644 --- a/be/src/vec/exec/scan/new_olap_scan_node.cpp +++ b/be/src/vec/exec/scan/new_olap_scan_node.cpp @@ -149,6 +149,7 @@ Status NewOlapScanNode::_build_key_ranges_and_filters() { // 1. construct scan key except last olap engine short key _scan_keys.set_is_convertible(limit() == -1); + _scan_keys.set_max_scan_key_num(_max_scan_key_num); // we use `exact_range` to identify a key range is an exact range or not when we convert // it to `_scan_keys`. If `exact_range` is true, we can just discard it from `_olap_filters`. @@ -162,8 +163,7 @@ Status NewOlapScanNode::_build_key_ranges_and_filters() { RETURN_IF_ERROR(std::visit( [&](auto&& range) { - RETURN_IF_ERROR( - _scan_keys.extend_scan_key(range, _max_scan_key_num, &exact_range)); + RETURN_IF_ERROR(_scan_keys.extend_scan_key(range, &exact_range)); if (exact_range) { _colname_to_value_range.erase(iter->first); } @@ -265,11 +265,6 @@ Status NewOlapScanNode::_init_scanners(std::list<VScanner*>* scanners) { // ranges constructed from scan keys std::vector<std::unique_ptr<doris::OlapScanRange>> cond_ranges; - RETURN_IF_ERROR(_scan_keys.get_key_range(&cond_ranges)); - // if we can't get ranges from conditions, we give it a total range - if (cond_ranges.empty()) { - cond_ranges.emplace_back(new doris::OlapScanRange()); - } int scanners_per_tablet = std::max(1, 64 / (int)_scan_ranges.size()); std::unordered_set<std::string> disk_set; @@ -285,7 +280,6 @@ Status NewOlapScanNode::_init_scanners(std::list<VScanner*>* scanners) { return Status::InternalError(ss.str()); } - std::vector<std::unique_ptr<doris::OlapScanRange>>* ranges = &cond_ranges; int size_based_scanners_per_tablet = 1; if (config::doris_scan_range_max_mb > 0) { @@ -293,6 +287,18 @@ Status NewOlapScanNode::_init_scanners(std::list<VScanner*>* scanners) { 1, (int)(tablet->tablet_footprint() / (config::doris_scan_range_max_mb << 20))); } + if (_scan_keys.size() > size_based_scanners_per_tablet && _scan_keys.is_mergeable()) { + auto scan_keys = _scan_keys.merge(size_based_scanners_per_tablet); + RETURN_IF_ERROR(scan_keys.get_key_range(&cond_ranges)); + } else { + RETURN_IF_ERROR(_scan_keys.get_key_range(&cond_ranges)); + } + // if we can't get ranges from conditions, we give it a total range + if (cond_ranges.empty()) { + cond_ranges.emplace_back(new doris::OlapScanRange()); + } + std::vector<std::unique_ptr<doris::OlapScanRange>>* ranges = &cond_ranges; + int ranges_per_scanner = std::max(1, (int)ranges->size() / std::min(scanners_per_tablet, size_based_scanners_per_tablet)); diff --git a/be/src/vec/runtime/vdatetime_value.h b/be/src/vec/runtime/vdatetime_value.h index 42ce3a173c..17c391558d 100644 --- a/be/src/vec/runtime/vdatetime_value.h +++ b/be/src/vec/runtime/vdatetime_value.h @@ -1026,6 +1026,17 @@ public: return *this; } + DateV2Value<T>& operator+=(int64_t val) { + if constexpr (is_datetime) { + TimeInterval interval(SECOND, val, false); + date_add_interval<SECOND>(interval); + } else { + TimeInterval interval(DAY, val, false); + date_add_interval<DAY>(interval); + } + return *this; + } + uint32_t hash(int seed) const { return HashUtil::hash(this, sizeof(*this), seed); } int day_of_year() const { return daynr() - calc_daynr(this->year(), 1, 1) + 1; } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org