This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new abc21f5d77 [bugfix](ngram bf index) process differently for normal bloom filter index and ngram bf index (#21310)
abc21f5d77 is described below

commit abc21f5d77f82ce7ca4a95baf780cffba4108acf
Author: Kang <kxiao.ti...@gmail.com>
AuthorDate: Thu Jul 13 17:31:45 2023 +0800

    [bugfix](ngram bf index) process differently for normal bloom filter index and ngram bf index (#21310)
    
    * process differently for normal bloom filter index and ngram bf index
    
    * fix review comments for readability
    
    * add test case
    
    * add testcase for delete condition
---
 be/src/olap/accept_null_predicate.h                |  4 +-
 be/src/olap/block_column_predicate.h               | 10 ++--
 be/src/olap/column_predicate.h                     |  2 +-
 be/src/olap/comparison_predicate.h                 |  6 ++-
 be/src/olap/in_list_predicate.h                    |  6 ++-
 be/src/olap/like_column_predicate.h                |  4 +-
 be/src/olap/null_predicate.h                       |  4 +-
 be/src/olap/rowset/segment_v2/column_reader.cpp    |  3 +-
 be/src/olap/rowset/segment_v2/column_reader.h      | 10 +++-
 .../data/index_p0/test_ngram_bloomfilter_index.out | 33 ++++++++++++
 .../index_p0/test_ngram_bloomfilter_index.groovy   | 62 ++++++++++++++++++++++
 11 files changed, 132 insertions(+), 12 deletions(-)

diff --git a/be/src/olap/accept_null_predicate.h 
b/be/src/olap/accept_null_predicate.h
index bfff2910ca..1a5f586ed5 100644
--- a/be/src/olap/accept_null_predicate.h
+++ b/be/src/olap/accept_null_predicate.h
@@ -148,7 +148,9 @@ public:
 
     bool evaluate_and(const BloomFilter* bf) const override { return 
_nested->evaluate_and(bf); }
 
-    bool can_do_bloom_filter() const override { return 
_nested->can_do_bloom_filter(); }
+    bool can_do_bloom_filter(bool ngram) const override {
+        return _nested->can_do_bloom_filter(ngram);
+    }
 
     void evaluate_vec(const vectorized::IColumn& column, uint16_t size,
                       bool* flags) const override {
diff --git a/be/src/olap/block_column_predicate.h 
b/be/src/olap/block_column_predicate.h
index 0069a62d29..c91dc0c367 100644
--- a/be/src/olap/block_column_predicate.h
+++ b/be/src/olap/block_column_predicate.h
@@ -87,7 +87,7 @@ public:
         return true;
     }
 
-    virtual bool can_do_bloom_filter() const { return false; }
+    virtual bool can_do_bloom_filter(bool ngram) const { return false; }
 
     //evaluate predicate on inverted
     virtual Status evaluate(const std::string& column_name, 
InvertedIndexIterator* iterator,
@@ -121,7 +121,9 @@ public:
 
     void evaluate_vec(vectorized::MutableColumns& block, uint16_t size, bool* 
flags) const override;
 
-    bool can_do_bloom_filter() const override { return 
_predicate->can_do_bloom_filter(); }
+    bool can_do_bloom_filter(bool ngram) const override {
+        return _predicate->can_do_bloom_filter(ngram);
+    }
 
 private:
     const ColumnPredicate* _predicate;
@@ -188,9 +190,9 @@ public:
 
     bool evaluate_and(const StringRef* dict_words, const size_t dict_num) 
const override;
 
-    bool can_do_bloom_filter() const override {
+    bool can_do_bloom_filter(bool ngram) const override {
         for (auto& pred : _block_column_predicate_vec) {
-            if (!pred->can_do_bloom_filter()) {
+            if (!pred->can_do_bloom_filter(ngram)) {
                 return false;
             }
         }
diff --git a/be/src/olap/column_predicate.h b/be/src/olap/column_predicate.h
index 88f40c92c1..cad253ac1a 100644
--- a/be/src/olap/column_predicate.h
+++ b/be/src/olap/column_predicate.h
@@ -183,7 +183,7 @@ public:
         return true;
     }
 
-    virtual bool can_do_bloom_filter() const { return false; }
+    virtual bool can_do_bloom_filter(bool ngram) const { return false; }
 
     // used to evaluate pre read column in lazy materialization
     // now only support integer/float
diff --git a/be/src/olap/comparison_predicate.h 
b/be/src/olap/comparison_predicate.h
index 6524fdfc7d..04dfd5dc5c 100644
--- a/be/src/olap/comparison_predicate.h
+++ b/be/src/olap/comparison_predicate.h
@@ -244,6 +244,8 @@ public:
 
     bool evaluate_and(const segment_v2::BloomFilter* bf) const override {
         if constexpr (PT == PredicateType::EQ) {
+            // EQ predicate can not use ngram bf, just return true to accept
+            if (bf->is_ngram_bf()) return true;
             if constexpr (std::is_same_v<T, StringRef>) {
                 return bf->test_bytes(_value.data, _value.size);
             } else if constexpr (Type == TYPE_DATE) {
@@ -272,7 +274,9 @@ public:
         return true;
     }
 
-    bool can_do_bloom_filter() const override { return PT == 
PredicateType::EQ; }
+    bool can_do_bloom_filter(bool ngram) const override {
+        return PT == PredicateType::EQ && !ngram;
+    }
 
     void evaluate_or(const vectorized::IColumn& column, const uint16_t* sel, 
uint16_t size,
                      bool* flags) const override {
diff --git a/be/src/olap/in_list_predicate.h b/be/src/olap/in_list_predicate.h
index 5f0f99f7eb..f4e432cf28 100644
--- a/be/src/olap/in_list_predicate.h
+++ b/be/src/olap/in_list_predicate.h
@@ -381,6 +381,8 @@ public:
 
     bool evaluate_and(const segment_v2::BloomFilter* bf) const override {
         if constexpr (PT == PredicateType::IN_LIST) {
+            // IN predicate can not use ngram bf, just return true to accept
+            if (bf->is_ngram_bf()) return true;
             HybridSetBase::IteratorBase* iter = _values->begin();
             while (iter->has_next()) {
                 if constexpr (std::is_same_v<T, StringRef>) {
@@ -408,7 +410,9 @@ public:
         }
     }
 
-    bool can_do_bloom_filter() const override { return PT == 
PredicateType::IN_LIST; }
+    bool can_do_bloom_filter(bool ngram) const override {
+        return PT == PredicateType::IN_LIST && !ngram;
+    }
 
 private:
     template <typename LeftT, typename RightT>
diff --git a/be/src/olap/like_column_predicate.h 
b/be/src/olap/like_column_predicate.h
index ddbe892303..f97ff46453 100644
--- a/be/src/olap/like_column_predicate.h
+++ b/be/src/olap/like_column_predicate.h
@@ -76,12 +76,14 @@ public:
         _page_ng_bf = std::move(src);
     }
     bool evaluate_and(const BloomFilter* bf) const override {
+        // like predicate can not use normal bf, just return true to accept
+        if (!bf->is_ngram_bf()) return true;
         if (_page_ng_bf) {
             return bf->contains(*_page_ng_bf);
         }
         return true;
     }
-    bool can_do_bloom_filter() const override { return true; }
+    bool can_do_bloom_filter(bool ngram) const override { return ngram; }
 
 private:
     template <bool is_and>
diff --git a/be/src/olap/null_predicate.h b/be/src/olap/null_predicate.h
index ed81cc6f1b..4313adea11 100644
--- a/be/src/olap/null_predicate.h
+++ b/be/src/olap/null_predicate.h
@@ -84,6 +84,8 @@ public:
     }
 
     bool evaluate_and(const segment_v2::BloomFilter* bf) const override {
+        // null predicate can not use ngram bf, just return true to accept
+        if (bf->is_ngram_bf()) return true;
         if (_is_null) {
             return bf->test_bytes(nullptr, 0);
         } else {
@@ -92,7 +94,7 @@ public:
         }
     }
 
-    bool can_do_bloom_filter() const override { return _is_null; }
+    bool can_do_bloom_filter(bool ngram) const override { return _is_null && 
!ngram; }
 
     void evaluate_vec(const vectorized::IColumn& column, uint16_t size, bool* 
flags) const override;
 
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp 
b/be/src/olap/rowset/segment_v2/column_reader.cpp
index a27d999169..2e4db26c50 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -1213,7 +1213,8 @@ Status FileColumnIterator::get_row_ranges_by_zone_map(
 
 Status FileColumnIterator::get_row_ranges_by_bloom_filter(
         const AndBlockColumnPredicate* col_predicates, RowRanges* row_ranges) {
-    if (col_predicates->can_do_bloom_filter() && 
_reader->has_bloom_filter_index()) {
+    if ((col_predicates->can_do_bloom_filter(false) && 
_reader->has_bloom_filter_index(false)) ||
+        (col_predicates->can_do_bloom_filter(true) && 
_reader->has_bloom_filter_index(true))) {
         
RETURN_IF_ERROR(_reader->get_row_ranges_by_bloom_filter(col_predicates, 
row_ranges));
     }
     return Status::OK();
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h 
b/be/src/olap/rowset/segment_v2/column_reader.h
index a6d23ac950..fb212ef33d 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -137,7 +137,15 @@ public:
 
     bool has_zone_map() const { return _zone_map_index_meta != nullptr; }
     bool has_bitmap_index() const { return _bitmap_index_meta != nullptr; }
-    bool has_bloom_filter_index() const { return _bf_index_meta != nullptr; }
+    bool has_bloom_filter_index(bool ngram) const {
+        if (_bf_index_meta == nullptr) return false;
+
+        if (ngram) {
+            return _bf_index_meta->algorithm() == 
BloomFilterAlgorithmPB::NGRAM_BLOOM_FILTER;
+        } else {
+            return _bf_index_meta->algorithm() != 
BloomFilterAlgorithmPB::NGRAM_BLOOM_FILTER;
+        }
+    }
 
     // Check if this column could match `cond' using segment zone map.
     // Since segment zone map is stored in metadata, this function is fast 
without I/O.
diff --git a/regression-test/data/index_p0/test_ngram_bloomfilter_index.out 
b/regression-test/data/index_p0/test_ngram_bloomfilter_index.out
new file mode 100644
index 0000000000..7849739f42
--- /dev/null
+++ b/regression-test/data/index_p0/test_ngram_bloomfilter_index.out
@@ -0,0 +1,33 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !select_all_1 --
+1      dt_bjn001       p9-webcast-sign.douyinpic.com   test    
/%/7212503657802320699% /test   100     false
+1      dt_bjn001       p9-webcast-sign.douyinpic.com   test    
/%/7212503657802320699%xxx      /test   100     false
+
+-- !select_eq_1 --
+1      dt_bjn001       p9-webcast-sign.douyinpic.com   test    
/%/7212503657802320699% /test   100     false
+
+-- !select_in_1 --
+1      dt_bjn001       p9-webcast-sign.douyinpic.com   test    
/%/7212503657802320699% /test   100     false
+
+-- !select_like_1 --
+1      dt_bjn001       p9-webcast-sign.douyinpic.com   test    
/%/7212503657802320699% /test   100     false
+1      dt_bjn001       p9-webcast-sign.douyinpic.com   test    
/%/7212503657802320699%xxx      /test   100     false
+
+-- !select_all_2 --
+1      dt_bjn001       p9-webcast-sign.douyinpic.com   test    
/%/7212503657802320699%xxx      /test   100     false
+
+-- !select_eq_2 --
+
+-- !select_in_2 --
+
+-- !select_like_2 --
+1      dt_bjn001       p9-webcast-sign.douyinpic.com   test    
/%/7212503657802320699%xxx      /test   100     false
+
+-- !select_all_3 --
+
+-- !select_eq_3 --
+
+-- !select_in_3 --
+
+-- !select_like_3 --
+
diff --git 
a/regression-test/suites/index_p0/test_ngram_bloomfilter_index.groovy 
b/regression-test/suites/index_p0/test_ngram_bloomfilter_index.groovy
new file mode 100644
index 0000000000..7619adedc3
--- /dev/null
+++ b/regression-test/suites/index_p0/test_ngram_bloomfilter_index.groovy
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+suite("test_ngram_bloomfilter_index") {
+    // todo: test bitmap index, such as create, drop, alter table index
+    def tableName = 'test_ngram_bloomfilter_index'
+    sql "DROP TABLE IF EXISTS ${tableName}"
+    sql """
+    CREATE TABLE IF NOT EXISTS ${tableName} (
+        `key_id` bigint(20) NULL COMMENT '',
+        `category` varchar(200) NULL COMMENT '',
+        `https_url` varchar(300) NULL COMMENT '',
+        `hostname` varchar(300) NULL,
+        `http_url` text NULL COMMENT '',
+        `url_path` varchar(2000) NULL COMMENT '',
+        `cnt` bigint(20) NULL COMMENT '',
+        `host_flag` boolean NULL COMMENT '',
+        INDEX idx_ngrambf (`http_url`) USING NGRAM_BF PROPERTIES("gram_size" = 
"2", "bf_size" = "512")
+    ) ENGINE=OLAP
+    DUPLICATE KEY(`key_id`, `category`)
+    COMMENT 'OLAP'
+    DISTRIBUTED BY HASH(`key_id`) BUCKETS 3
+    PROPERTIES("replication_num" = "1");
+    """
+
+    sql "INSERT INTO ${tableName} values (1, 'dt_bjn001', 
'p9-webcast-sign.douyinpic.com', 'test', '/%/7212503657802320699%', '/test', 
100, false);"
+    sql "INSERT INTO ${tableName} values (1, 'dt_bjn001', 
'p9-webcast-sign.douyinpic.com', 'test', '/%/7212503657802320699%xxx', '/test', 
100, false);"
+
+
+    sql "SET enable_function_pushdown = true"
+
+    qt_select_all_1 "SELECT * FROM ${tableName}"
+    qt_select_eq_1 "SELECT * FROM ${tableName} WHERE http_url = 
'/%/7212503657802320699%'"
+    qt_select_in_1 "SELECT * FROM ${tableName} WHERE http_url IN 
('/%/7212503657802320699%')"
+    qt_select_like_1 "SELECT * FROM ${tableName} WHERE http_url like 
'/%/7212503657802320699%'"
+
+    // delete and then select
+    sql "DELETE FROM ${tableName} WHERE http_url IN 
('/%/7212503657802320699%')"
+    qt_select_all_2 "SELECT * FROM ${tableName}"
+    qt_select_eq_2 "SELECT * FROM ${tableName} WHERE http_url = 
'/%/7212503657802320699%'"
+    qt_select_in_2 "SELECT * FROM ${tableName} WHERE http_url IN 
('/%/7212503657802320699%')"
+    qt_select_like_2 "SELECT * FROM ${tableName} WHERE http_url like 
'/%/7212503657802320699%'"
+
+    sql "DELETE FROM ${tableName} WHERE http_url = 
'/%/7212503657802320699%xxx'"
+    qt_select_all_3 "SELECT * FROM ${tableName}"
+    qt_select_eq_3 "SELECT * FROM ${tableName} WHERE http_url = 
'/%/7212503657802320699%'"
+    qt_select_in_3 "SELECT * FROM ${tableName} WHERE http_url IN 
('/%/7212503657802320699%')"
+    qt_select_like_3 "SELECT * FROM ${tableName} WHERE http_url like 
'/%/7212503657802320699%'"
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to