This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new 0d51acd5426 [improvement](scanner) Remove the predicate that is always true for the segment (#25582)
0d51acd5426 is described below

commit 0d51acd54263ebb13508cc70e8caa56314025bc8
Author: Jerry Hu <mrh...@gmail.com>
AuthorDate: Wed Oct 18 20:36:06 2023 +0800

    [improvement](scanner) Remove the predicate that is always true for the segment (#25582)
    
    * [improvement](scanner) Remove the predicate that is always true for the segment (#25366) (#25427)
    
    By utilizing the zonemap index of the segment, we can ascertain if a
    predicate is always true. For example, if the segment’s maximum value is
    100 and the predicate is col < 101, then this predicate is always true for
    this segment.
    
    * [fix](scanner) coredump caused by 'prune_predicates_by_zone_map' (#25555)
---
 be/src/common/config.cpp                           |  2 +
 be/src/common/config.h                             |  3 +
 be/src/olap/column_predicate.h                     |  4 ++
 be/src/olap/comparison_predicate.h                 | 27 +++++++++
 be/src/olap/rowset/segment_v2/column_reader.cpp    | 26 +++++++++
 be/src/olap/rowset/segment_v2/column_reader.h      |  3 +
 be/src/olap/rowset/segment_v2/segment.cpp          | 22 ++++++-
 .../query_p0/test_select_with_predicate_prune.out  | 25 ++++++++
 .../test_select_with_predicate_prune.groovy        | 67 ++++++++++++++++++++++
 9 files changed, 178 insertions(+), 1 deletion(-)

diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index f82bc241c29..265a2232d68 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1085,6 +1085,8 @@ DEFINE_mInt32(tablet_schema_cache_recycle_interval, 
"86400");
 
 DEFINE_Bool(exit_on_exception, "false")
 
+DEFINE_Bool(ignore_always_true_predicate_for_segment, "true");
+
 // clang-format off
 #ifdef BE_TEST
 // test s3
diff --git a/be/src/common/config.h b/be/src/common/config.h
index 7c4ab39a40e..06f9a18fcd5 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1141,6 +1141,9 @@ DECLARE_mInt32(tablet_schema_cache_recycle_interval);
 // Use `LOG(FATAL)` to replace `throw` when true
 DECLARE_mBool(exit_on_exception);
 
+// Remove predicate that is always true for a segment.
+DECLARE_Bool(ignore_always_true_predicate_for_segment);
+
 #ifdef BE_TEST
 // test s3
 DECLARE_String(test_s3_resource);
diff --git a/be/src/olap/column_predicate.h b/be/src/olap/column_predicate.h
index b98156f5fb8..05e84999a83 100644
--- a/be/src/olap/column_predicate.h
+++ b/be/src/olap/column_predicate.h
@@ -173,6 +173,10 @@ public:
         return true;
     }
 
+    virtual bool is_always_true(const std::pair<WrapperField*, WrapperField*>& 
statistic) const {
+        return false;
+    }
+
     virtual bool evaluate_del(const std::pair<WrapperField*, WrapperField*>& 
statistic) const {
         return false;
     }
diff --git a/be/src/olap/comparison_predicate.h 
b/be/src/olap/comparison_predicate.h
index 04dfd5dc5c3..53149ea7ed4 100644
--- a/be/src/olap/comparison_predicate.h
+++ b/be/src/olap/comparison_predicate.h
@@ -158,6 +158,8 @@ public:
         return _operator(*reinterpret_cast<const 
T*>(statistic.ELE->cell_ptr()), _value); \
     }
 
+    using WarpperFieldType = std::conditional_t<Type == TYPE_DATE, uint24_t, 
T>;
+
     bool evaluate_and(const std::pair<WrapperField*, WrapperField*>& 
statistic) const override {
         if (statistic.first->is_null()) {
             return true;
@@ -202,6 +204,31 @@ public:
         }
     }
 
+    bool is_always_true(const std::pair<WrapperField*, WrapperField*>& 
statistic) const override {
+        if (statistic.first->is_null() || statistic.second->is_null()) {
+            return false;
+        }
+
+        DCHECK_LE(sizeof(T), statistic.first->size());
+
+        T tmp_min_value {};
+        T tmp_max_value {};
+        memcpy((char*)(&tmp_min_value), statistic.first->cell_ptr(), 
sizeof(WarpperFieldType));
+        memcpy((char*)(&tmp_max_value), statistic.second->cell_ptr(), 
sizeof(WarpperFieldType));
+
+        if constexpr (PT == PredicateType::LT) {
+            return _value > tmp_max_value;
+        } else if constexpr (PT == PredicateType::LE) {
+            return _value >= tmp_max_value;
+        } else if constexpr (PT == PredicateType::GT) {
+            return _value < tmp_min_value;
+        } else if constexpr (PT == PredicateType::GE) {
+            return _value <= tmp_min_value;
+        }
+
+        return false;
+    }
+
     bool evaluate_del(const std::pair<WrapperField*, WrapperField*>& 
statistic) const override {
         if (statistic.first->is_null() || statistic.second->is_null()) {
             return false;
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp 
b/be/src/olap/rowset/segment_v2/column_reader.cpp
index b1b817f545a..d9a074e2904 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -30,6 +30,7 @@
 #include "io/fs/file_reader.h"
 #include "olap/block_column_predicate.h"
 #include "olap/column_predicate.h"
+#include "olap/comparison_predicate.h"
 #include "olap/decimal12.h"
 #include "olap/inverted_index_parser.h"
 #include "olap/iterators.h"
@@ -339,6 +340,31 @@ bool ColumnReader::match_condition(const 
AndBlockColumnPredicate* col_predicates
                                      col_predicates);
 }
 
+bool ColumnReader::prune_predicates_by_zone_map(std::vector<ColumnPredicate*>& 
predicates,
+                                                const int column_id) const {
+    if (_zone_map_index == nullptr) {
+        return false;
+    }
+
+    FieldType type = _type_info->type();
+    std::unique_ptr<WrapperField> min_value(WrapperField::create_by_type(type, 
_meta_length));
+    std::unique_ptr<WrapperField> max_value(WrapperField::create_by_type(type, 
_meta_length));
+    _parse_zone_map(*_segment_zone_map, min_value.get(), max_value.get());
+
+    auto pruned = false;
+    for (auto it = predicates.begin(); it != predicates.end();) {
+        auto predicate = *it;
+        if (predicate->column_id() == column_id &&
+            predicate->is_always_true({min_value.get(), max_value.get()})) {
+            pruned = true;
+            it = predicates.erase(it);
+        } else {
+            ++it;
+        }
+    }
+    return pruned;
+}
+
 void ColumnReader::_parse_zone_map(const ZoneMapPB& zone_map, WrapperField* 
min_value_container,
                                    WrapperField* max_value_container) const {
     // min value and max value are valid if has_not_null is true
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h 
b/be/src/olap/rowset/segment_v2/column_reader.h
index 174aabdefa8..7964555adeb 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -162,6 +162,9 @@ public:
 
     bool is_empty() const { return _num_rows == 0; }
 
+    bool prune_predicates_by_zone_map(std::vector<ColumnPredicate*>& 
predicates,
+                                      const int column_id) const;
+
     CompressionTypePB get_compression() const { return _meta_compression; }
 
     uint64_t num_rows() const { return _num_rows; }
diff --git a/be/src/olap/rowset/segment_v2/segment.cpp 
b/be/src/olap/rowset/segment_v2/segment.cpp
index 153ed925176..991518347c9 100644
--- a/be/src/olap/rowset/segment_v2/segment.cpp
+++ b/be/src/olap/rowset/segment_v2/segment.cpp
@@ -147,7 +147,6 @@ Status Segment::new_iterator(SchemaSPtr schema, const 
StorageReadOptions& read_o
             return Status::OK();
         }
     }
-
     if (read_options.use_topn_opt) {
         auto query_ctx = read_options.runtime_state->get_query_ctx();
         auto runtime_predicate = 
query_ctx->get_runtime_predicate().get_predictate();
@@ -175,6 +174,27 @@ Status Segment::new_iterator(SchemaSPtr schema, const 
StorageReadOptions& read_o
         iter->reset(new SegmentIterator(this->shared_from_this(), schema));
     }
 
+    if (config::ignore_always_true_predicate_for_segment &&
+        read_options.io_ctx.reader_type == ReaderType::READER_QUERY &&
+        !read_options.column_predicates.empty()) {
+        auto pruned_predicates = read_options.column_predicates;
+        auto pruned = false;
+        for (auto& it : _column_readers) {
+            const auto uid = it.first;
+            const auto column_id = 
read_options.tablet_schema->field_index(uid);
+            if (it.second->prune_predicates_by_zone_map(pruned_predicates, 
column_id)) {
+                pruned = true;
+            }
+        }
+
+        if (pruned) {
+            auto options_with_pruned_predicates = read_options;
+            options_with_pruned_predicates.column_predicates = 
pruned_predicates;
+            LOG(INFO) << "column_predicates pruned from " << 
read_options.column_predicates.size()
+                      << " to " << pruned_predicates.size();
+            return iter->get()->init(options_with_pruned_predicates);
+        }
+    }
     return iter->get()->init(read_options);
 }
 
diff --git a/regression-test/data/query_p0/test_select_with_predicate_prune.out 
b/regression-test/data/query_p0/test_select_with_predicate_prune.out
new file mode 100644
index 00000000000..2e1fad87499
--- /dev/null
+++ b/regression-test/data/query_p0/test_select_with_predicate_prune.out
@@ -0,0 +1,25 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !select1 --
+1      jerry   2020-10-01
+2      tom     2020-10-02
+3      jack    2020-10-01
+4      tony    2020-10-02
+
+-- !select2 --
+1      jerry   2020-10-01
+3      jack    2020-10-01
+
+-- !select3 --
+
+-- !select4 --
+1      jerry   2020-10-01
+2      tom     2020-10-02
+3      jack    2020-10-01
+4      tony    2020-10-02
+
+-- !select5 --
+2      tom     2020-10-02
+4      tony    2020-10-02
+
+-- !select6 --
+
diff --git 
a/regression-test/suites/query_p0/test_select_with_predicate_prune.groovy 
b/regression-test/suites/query_p0/test_select_with_predicate_prune.groovy
new file mode 100644
index 00000000000..768e04b4c32
--- /dev/null
+++ b/regression-test/suites/query_p0/test_select_with_predicate_prune.groovy
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+suite("test_select_with_predicate_prune") {
+    sql """
+        drop table if exists `test_select_with_predicate_prune`;
+    """
+    sql """
+        CREATE TABLE IF NOT EXISTS `test_select_with_predicate_prune` (
+            id int,
+            name string,
+            birthday date not null
+        )
+        duplicate key(`id`)
+        AUTO PARTITION BY LIST (`birthday`)()
+        DISTRIBUTED BY HASH(`id`) buckets 1
+        PROPERTIES
+        (
+            "replication_allocation" = "tag.location.default: 1"
+        );
+    """
+
+    sql """
+        insert into test_select_with_predicate_prune values (1, 'jerry', 
'2020-10-01'), (2, 'tom', '2020-10-02');
+    """
+    sql """
+        insert into test_select_with_predicate_prune values (3, 'jack', 
'2020-10-01'), (4, 'tony', '2020-10-02');
+    """
+
+    qt_select1 """
+        select * from test_select_with_predicate_prune where birthday < 
'2020-10-03' order by id;
+    """
+
+    qt_select2 """
+        select * from test_select_with_predicate_prune where birthday < 
'2020-10-02' order by id;
+    """
+
+    qt_select3 """
+        select * from test_select_with_predicate_prune where birthday < 
'2020-10-01' order by id;
+    """
+
+
+    qt_select4 """
+        select * from test_select_with_predicate_prune where birthday > 
'2020-09-30' order by id;
+    """
+
+    qt_select5 """
+        select * from test_select_with_predicate_prune where birthday > 
'2020-10-01' order by id;
+    """
+
+    qt_select6 """
+        select * from test_select_with_predicate_prune where birthday > 
'2020-10-02' order by id;
+    """
+}
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to