This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push: new 0d51acd5426 [improvement](scanner) Remove the predicate that is always true for the segment (#25582) 0d51acd5426 is described below commit 0d51acd54263ebb13508cc70e8caa56314025bc8 Author: Jerry Hu <mrh...@gmail.com> AuthorDate: Wed Oct 18 20:36:06 2023 +0800 [improvement](scanner) Remove the predicate that is always true for the segment (#25582) * [improvement](scanner) Remove the predicate that is always true for the segment (#25366) (#25427) By utilizing the zonemap index of the segment, we can ascertain if a predicate is always true. For example, if the segment’s maximum value is 100 and the predicate is col < 101, then this predicate is always true for this segment. * [fix](scanner) coredump caused by 'prune_predicates_by_zone_map' (#25555) --- be/src/common/config.cpp | 2 + be/src/common/config.h | 3 + be/src/olap/column_predicate.h | 4 ++ be/src/olap/comparison_predicate.h | 27 +++++++++ be/src/olap/rowset/segment_v2/column_reader.cpp | 26 +++++++++ be/src/olap/rowset/segment_v2/column_reader.h | 3 + be/src/olap/rowset/segment_v2/segment.cpp | 22 ++++++- .../query_p0/test_select_with_predicate_prune.out | 25 ++++++++ .../test_select_with_predicate_prune.groovy | 67 ++++++++++++++++++++++ 9 files changed, 178 insertions(+), 1 deletion(-) diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index f82bc241c29..265a2232d68 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1085,6 +1085,8 @@ DEFINE_mInt32(tablet_schema_cache_recycle_interval, "86400"); DEFINE_Bool(exit_on_exception, "false") +DEFINE_Bool(ignore_always_true_predicate_for_segment, "true"); + // clang-format off #ifdef BE_TEST // test s3 diff --git a/be/src/common/config.h b/be/src/common/config.h index 7c4ab39a40e..06f9a18fcd5 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1141,6 +1141,9 @@ DECLARE_mInt32(tablet_schema_cache_recycle_interval); // Use `LOG(FATAL)` to replace `throw` when true DECLARE_mBool(exit_on_exception); +// Remove predicate that is always true for a segment. +DECLARE_Bool(ignore_always_true_predicate_for_segment); + #ifdef BE_TEST // test s3 DECLARE_String(test_s3_resource); diff --git a/be/src/olap/column_predicate.h b/be/src/olap/column_predicate.h index b98156f5fb8..05e84999a83 100644 --- a/be/src/olap/column_predicate.h +++ b/be/src/olap/column_predicate.h @@ -173,6 +173,10 @@ public: return true; } + virtual bool is_always_true(const std::pair<WrapperField*, WrapperField*>& statistic) const { + return false; + } + virtual bool evaluate_del(const std::pair<WrapperField*, WrapperField*>& statistic) const { return false; } diff --git a/be/src/olap/comparison_predicate.h b/be/src/olap/comparison_predicate.h index 04dfd5dc5c3..53149ea7ed4 100644 --- a/be/src/olap/comparison_predicate.h +++ b/be/src/olap/comparison_predicate.h @@ -158,6 +158,8 @@ public: return _operator(*reinterpret_cast<const T*>(statistic.ELE->cell_ptr()), _value); \ } + using WarpperFieldType = std::conditional_t<Type == TYPE_DATE, uint24_t, T>; + bool evaluate_and(const std::pair<WrapperField*, WrapperField*>& statistic) const override { if (statistic.first->is_null()) { return true; @@ -202,6 +204,31 @@ public: } } + bool is_always_true(const std::pair<WrapperField*, WrapperField*>& statistic) const override { + if (statistic.first->is_null() || statistic.second->is_null()) { + return false; + } + + DCHECK_LE(sizeof(T), statistic.first->size()); + + T tmp_min_value {}; + T tmp_max_value {}; + memcpy((char*)(&tmp_min_value), statistic.first->cell_ptr(), sizeof(WarpperFieldType)); + memcpy((char*)(&tmp_max_value), statistic.second->cell_ptr(), sizeof(WarpperFieldType)); + + if constexpr (PT == PredicateType::LT) { + return _value > tmp_max_value; + } else if constexpr (PT == PredicateType::LE) { + return _value >= tmp_max_value; + } else if constexpr (PT == PredicateType::GT) { + return _value < tmp_min_value; + } else if constexpr (PT == PredicateType::GE) { + return _value <= tmp_min_value; + } + + return false; + } + bool evaluate_del(const std::pair<WrapperField*, WrapperField*>& statistic) const override { if (statistic.first->is_null() || statistic.second->is_null()) { return false; diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index b1b817f545a..d9a074e2904 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -30,6 +30,7 @@ #include "io/fs/file_reader.h" #include "olap/block_column_predicate.h" #include "olap/column_predicate.h" +#include "olap/comparison_predicate.h" #include "olap/decimal12.h" #include "olap/inverted_index_parser.h" #include "olap/iterators.h" @@ -339,6 +340,31 @@ bool ColumnReader::match_condition(const AndBlockColumnPredicate* col_predicates col_predicates); } +bool ColumnReader::prune_predicates_by_zone_map(std::vector<ColumnPredicate*>& predicates, + const int column_id) const { + if (_zone_map_index == nullptr) { + return false; + } + + FieldType type = _type_info->type(); + std::unique_ptr<WrapperField> min_value(WrapperField::create_by_type(type, _meta_length)); + std::unique_ptr<WrapperField> max_value(WrapperField::create_by_type(type, _meta_length)); + _parse_zone_map(*_segment_zone_map, min_value.get(), max_value.get()); + + auto pruned = false; + for (auto it = predicates.begin(); it != predicates.end();) { + auto predicate = *it; + if (predicate->column_id() == column_id && + predicate->is_always_true({min_value.get(), max_value.get()})) { + pruned = true; + it = predicates.erase(it); + } else { + ++it; + } + } + return pruned; +} + void ColumnReader::_parse_zone_map(const ZoneMapPB& zone_map, WrapperField* min_value_container, WrapperField* max_value_container) const { // min value and max value are valid if has_not_null is true diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h index 174aabdefa8..7964555adeb 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.h +++ b/be/src/olap/rowset/segment_v2/column_reader.h @@ -162,6 +162,9 @@ public: bool is_empty() const { return _num_rows == 0; } + bool prune_predicates_by_zone_map(std::vector<ColumnPredicate*>& predicates, + const int column_id) const; + CompressionTypePB get_compression() const { return _meta_compression; } uint64_t num_rows() const { return _num_rows; } diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp index 153ed925176..991518347c9 100644 --- a/be/src/olap/rowset/segment_v2/segment.cpp +++ b/be/src/olap/rowset/segment_v2/segment.cpp @@ -147,7 +147,6 @@ Status Segment::new_iterator(SchemaSPtr schema, const StorageReadOptions& read_o return Status::OK(); } } - if (read_options.use_topn_opt) { auto query_ctx = read_options.runtime_state->get_query_ctx(); auto runtime_predicate = query_ctx->get_runtime_predicate().get_predictate(); @@ -175,6 +174,27 @@ Status Segment::new_iterator(SchemaSPtr schema, const StorageReadOptions& read_o iter->reset(new SegmentIterator(this->shared_from_this(), schema)); } + if (config::ignore_always_true_predicate_for_segment && + read_options.io_ctx.reader_type == ReaderType::READER_QUERY && + !read_options.column_predicates.empty()) { + auto pruned_predicates = read_options.column_predicates; + auto pruned = false; + for (auto& it : _column_readers) { + const auto uid = it.first; + const auto column_id = read_options.tablet_schema->field_index(uid); + if (it.second->prune_predicates_by_zone_map(pruned_predicates, column_id)) { + pruned = true; + } + } + + if (pruned) { + auto options_with_pruned_predicates = read_options; + options_with_pruned_predicates.column_predicates = pruned_predicates; + LOG(INFO) << "column_predicates pruned from " << read_options.column_predicates.size() + << " to " << pruned_predicates.size(); + return iter->get()->init(options_with_pruned_predicates); + } + } return iter->get()->init(read_options); } diff --git a/regression-test/data/query_p0/test_select_with_predicate_prune.out b/regression-test/data/query_p0/test_select_with_predicate_prune.out new file mode 100644 index 00000000000..2e1fad87499 --- /dev/null +++ b/regression-test/data/query_p0/test_select_with_predicate_prune.out @@ -0,0 +1,25 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select1 -- +1 jerry 2020-10-01 +2 tom 2020-10-02 +3 jack 2020-10-01 +4 tony 2020-10-02 + +-- !select2 -- +1 jerry 2020-10-01 +3 jack 2020-10-01 + +-- !select3 -- + +-- !select4 -- +1 jerry 2020-10-01 +2 tom 2020-10-02 +3 jack 2020-10-01 +4 tony 2020-10-02 + +-- !select5 -- +2 tom 2020-10-02 +4 tony 2020-10-02 + +-- !select6 -- + diff --git a/regression-test/suites/query_p0/test_select_with_predicate_prune.groovy b/regression-test/suites/query_p0/test_select_with_predicate_prune.groovy new file mode 100644 index 00000000000..768e04b4c32 --- /dev/null +++ b/regression-test/suites/query_p0/test_select_with_predicate_prune.groovy @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +suite("test_select_with_predicate_prune") { + sql """ + drop table if exists `test_select_with_predicate_prune`; + """ + sql """ + CREATE TABLE IF NOT EXISTS `test_select_with_predicate_prune` ( + id int, + name string, + birthday date not null + ) + duplicate key(`id`) + AUTO PARTITION BY LIST (`birthday`)() + DISTRIBUTED BY HASH(`id`) buckets 1 + PROPERTIES + ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql """ + insert into test_select_with_predicate_prune values (1, 'jerry', '2020-10-01'), (2, 'tom', '2020-10-02'); + """ + sql """ + insert into test_select_with_predicate_prune values (3, 'jack', '2020-10-01'), (4, 'tony', '2020-10-02'); + """ + + qt_select1 """ + select * from test_select_with_predicate_prune where birthday < '2020-10-03' order by id; + """ + + qt_select2 """ + select * from test_select_with_predicate_prune where birthday < '2020-10-02' order by id; + """ + + qt_select3 """ + select * from test_select_with_predicate_prune where birthday < '2020-10-01' order by id; + """ + + + qt_select4 """ + select * from test_select_with_predicate_prune where birthday > '2020-09-30' order by id; + """ + + qt_select5 """ + select * from test_select_with_predicate_prune where birthday > '2020-10-01' order by id; + """ + + qt_select6 """ + select * from test_select_with_predicate_prune where birthday > '2020-10-02' order by id; + """ +} \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org