This is an automated email from the ASF dual-hosted git repository. eldenmoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new abc21f5d77 [bugfix](ngram bf index) process differently for normal bloom filter index and ngram bf index (#21310) abc21f5d77 is described below commit abc21f5d77f82ce7ca4a95baf780cffba4108acf Author: Kang <kxiao.ti...@gmail.com> AuthorDate: Thu Jul 13 17:31:45 2023 +0800 [bugfix](ngram bf index) process differently for normal bloom filter index and ngram bf index (#21310) * process differently for normal bloom filter index and ngram bf index * fix review comments for readability * add test case * add testcase for delete condition --- be/src/olap/accept_null_predicate.h | 4 +- be/src/olap/block_column_predicate.h | 10 ++-- be/src/olap/column_predicate.h | 2 +- be/src/olap/comparison_predicate.h | 6 ++- be/src/olap/in_list_predicate.h | 6 ++- be/src/olap/like_column_predicate.h | 4 +- be/src/olap/null_predicate.h | 4 +- be/src/olap/rowset/segment_v2/column_reader.cpp | 3 +- be/src/olap/rowset/segment_v2/column_reader.h | 10 +++- .../data/index_p0/test_ngram_bloomfilter_index.out | 33 ++++++++++++ .../index_p0/test_ngram_bloomfilter_index.groovy | 62 ++++++++++++++++++++++ 11 files changed, 132 insertions(+), 12 deletions(-) diff --git a/be/src/olap/accept_null_predicate.h b/be/src/olap/accept_null_predicate.h index bfff2910ca..1a5f586ed5 100644 --- a/be/src/olap/accept_null_predicate.h +++ b/be/src/olap/accept_null_predicate.h @@ -148,7 +148,9 @@ public: bool evaluate_and(const BloomFilter* bf) const override { return _nested->evaluate_and(bf); } - bool can_do_bloom_filter() const override { return _nested->can_do_bloom_filter(); } + bool can_do_bloom_filter(bool ngram) const override { + return _nested->can_do_bloom_filter(ngram); + } void evaluate_vec(const vectorized::IColumn& column, uint16_t size, bool* flags) const override { diff --git a/be/src/olap/block_column_predicate.h b/be/src/olap/block_column_predicate.h index 0069a62d29..c91dc0c367 100644 --- 
a/be/src/olap/block_column_predicate.h +++ b/be/src/olap/block_column_predicate.h @@ -87,7 +87,7 @@ public: return true; } - virtual bool can_do_bloom_filter() const { return false; } + virtual bool can_do_bloom_filter(bool ngram) const { return false; } //evaluate predicate on inverted virtual Status evaluate(const std::string& column_name, InvertedIndexIterator* iterator, @@ -121,7 +121,9 @@ public: void evaluate_vec(vectorized::MutableColumns& block, uint16_t size, bool* flags) const override; - bool can_do_bloom_filter() const override { return _predicate->can_do_bloom_filter(); } + bool can_do_bloom_filter(bool ngram) const override { + return _predicate->can_do_bloom_filter(ngram); + } private: const ColumnPredicate* _predicate; @@ -188,9 +190,9 @@ public: bool evaluate_and(const StringRef* dict_words, const size_t dict_num) const override; - bool can_do_bloom_filter() const override { + bool can_do_bloom_filter(bool ngram) const override { for (auto& pred : _block_column_predicate_vec) { - if (!pred->can_do_bloom_filter()) { + if (!pred->can_do_bloom_filter(ngram)) { return false; } } diff --git a/be/src/olap/column_predicate.h b/be/src/olap/column_predicate.h index 88f40c92c1..cad253ac1a 100644 --- a/be/src/olap/column_predicate.h +++ b/be/src/olap/column_predicate.h @@ -183,7 +183,7 @@ public: return true; } - virtual bool can_do_bloom_filter() const { return false; } + virtual bool can_do_bloom_filter(bool ngram) const { return false; } // used to evaluate pre read column in lazy materialization // now only support integer/float diff --git a/be/src/olap/comparison_predicate.h b/be/src/olap/comparison_predicate.h index 6524fdfc7d..04dfd5dc5c 100644 --- a/be/src/olap/comparison_predicate.h +++ b/be/src/olap/comparison_predicate.h @@ -244,6 +244,8 @@ public: bool evaluate_and(const segment_v2::BloomFilter* bf) const override { if constexpr (PT == PredicateType::EQ) { + // EQ predicate can not use ngram bf, just return true to accept + if (bf->is_ngram_bf()) 
return true; if constexpr (std::is_same_v<T, StringRef>) { return bf->test_bytes(_value.data, _value.size); } else if constexpr (Type == TYPE_DATE) { @@ -272,7 +274,9 @@ public: return true; } - bool can_do_bloom_filter() const override { return PT == PredicateType::EQ; } + bool can_do_bloom_filter(bool ngram) const override { + return PT == PredicateType::EQ && !ngram; + } void evaluate_or(const vectorized::IColumn& column, const uint16_t* sel, uint16_t size, bool* flags) const override { diff --git a/be/src/olap/in_list_predicate.h b/be/src/olap/in_list_predicate.h index 5f0f99f7eb..f4e432cf28 100644 --- a/be/src/olap/in_list_predicate.h +++ b/be/src/olap/in_list_predicate.h @@ -381,6 +381,8 @@ public: bool evaluate_and(const segment_v2::BloomFilter* bf) const override { if constexpr (PT == PredicateType::IN_LIST) { + // IN predicate can not use ngram bf, just return true to accept + if (bf->is_ngram_bf()) return true; HybridSetBase::IteratorBase* iter = _values->begin(); while (iter->has_next()) { if constexpr (std::is_same_v<T, StringRef>) { @@ -408,7 +410,9 @@ public: } } - bool can_do_bloom_filter() const override { return PT == PredicateType::IN_LIST; } + bool can_do_bloom_filter(bool ngram) const override { + return PT == PredicateType::IN_LIST && !ngram; + } private: template <typename LeftT, typename RightT> diff --git a/be/src/olap/like_column_predicate.h b/be/src/olap/like_column_predicate.h index ddbe892303..f97ff46453 100644 --- a/be/src/olap/like_column_predicate.h +++ b/be/src/olap/like_column_predicate.h @@ -76,12 +76,14 @@ public: _page_ng_bf = std::move(src); } bool evaluate_and(const BloomFilter* bf) const override { + // like predicate can not use normal bf, just return true to accept + if (!bf->is_ngram_bf()) return true; if (_page_ng_bf) { return bf->contains(*_page_ng_bf); } return true; } - bool can_do_bloom_filter() const override { return true; } + bool can_do_bloom_filter(bool ngram) const override { return ngram; } private: template 
<bool is_and> diff --git a/be/src/olap/null_predicate.h b/be/src/olap/null_predicate.h index ed81cc6f1b..4313adea11 100644 --- a/be/src/olap/null_predicate.h +++ b/be/src/olap/null_predicate.h @@ -84,6 +84,8 @@ public: } bool evaluate_and(const segment_v2::BloomFilter* bf) const override { + // null predicate can not use ngram bf, just return true to accept + if (bf->is_ngram_bf()) return true; if (_is_null) { return bf->test_bytes(nullptr, 0); } else { @@ -92,7 +94,7 @@ public: } } - bool can_do_bloom_filter() const override { return _is_null; } + bool can_do_bloom_filter(bool ngram) const override { return _is_null && !ngram; } void evaluate_vec(const vectorized::IColumn& column, uint16_t size, bool* flags) const override; diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index a27d999169..2e4db26c50 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -1213,7 +1213,8 @@ Status FileColumnIterator::get_row_ranges_by_zone_map( Status FileColumnIterator::get_row_ranges_by_bloom_filter( const AndBlockColumnPredicate* col_predicates, RowRanges* row_ranges) { - if (col_predicates->can_do_bloom_filter() && _reader->has_bloom_filter_index()) { + if ((col_predicates->can_do_bloom_filter(false) && _reader->has_bloom_filter_index(false)) || + (col_predicates->can_do_bloom_filter(true) && _reader->has_bloom_filter_index(true))) { RETURN_IF_ERROR(_reader->get_row_ranges_by_bloom_filter(col_predicates, row_ranges)); } return Status::OK(); diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h index a6d23ac950..fb212ef33d 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.h +++ b/be/src/olap/rowset/segment_v2/column_reader.h @@ -137,7 +137,15 @@ public: bool has_zone_map() const { return _zone_map_index_meta != nullptr; } bool has_bitmap_index() const { return _bitmap_index_meta != nullptr; } - 
bool has_bloom_filter_index() const { return _bf_index_meta != nullptr; } + bool has_bloom_filter_index(bool ngram) const { + if (_bf_index_meta == nullptr) return false; + + if (ngram) { + return _bf_index_meta->algorithm() == BloomFilterAlgorithmPB::NGRAM_BLOOM_FILTER; + } else { + return _bf_index_meta->algorithm() != BloomFilterAlgorithmPB::NGRAM_BLOOM_FILTER; + } + } // Check if this column could match `cond' using segment zone map. // Since segment zone map is stored in metadata, this function is fast without I/O. diff --git a/regression-test/data/index_p0/test_ngram_bloomfilter_index.out b/regression-test/data/index_p0/test_ngram_bloomfilter_index.out new file mode 100644 index 0000000000..7849739f42 --- /dev/null +++ b/regression-test/data/index_p0/test_ngram_bloomfilter_index.out @@ -0,0 +1,33 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_all_1 -- +1 dt_bjn001 p9-webcast-sign.douyinpic.com test /%/7212503657802320699% /test 100 false +1 dt_bjn001 p9-webcast-sign.douyinpic.com test /%/7212503657802320699%xxx /test 100 false + +-- !select_eq_1 -- +1 dt_bjn001 p9-webcast-sign.douyinpic.com test /%/7212503657802320699% /test 100 false + +-- !select_in_1 -- +1 dt_bjn001 p9-webcast-sign.douyinpic.com test /%/7212503657802320699% /test 100 false + +-- !select_like_1 -- +1 dt_bjn001 p9-webcast-sign.douyinpic.com test /%/7212503657802320699% /test 100 false +1 dt_bjn001 p9-webcast-sign.douyinpic.com test /%/7212503657802320699%xxx /test 100 false + +-- !select_all_2 -- +1 dt_bjn001 p9-webcast-sign.douyinpic.com test /%/7212503657802320699%xxx /test 100 false + +-- !select_eq_2 -- + +-- !select_in_2 -- + +-- !select_like_2 -- +1 dt_bjn001 p9-webcast-sign.douyinpic.com test /%/7212503657802320699%xxx /test 100 false + +-- !select_all_3 -- + +-- !select_eq_3 -- + +-- !select_in_3 -- + +-- !select_like_3 -- + diff --git a/regression-test/suites/index_p0/test_ngram_bloomfilter_index.groovy 
b/regression-test/suites/index_p0/test_ngram_bloomfilter_index.groovy new file mode 100644 index 0000000000..7619adedc3 --- /dev/null +++ b/regression-test/suites/index_p0/test_ngram_bloomfilter_index.groovy @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+suite("test_ngram_bloomfilter_index") { + // todo: test bitmap index, such as create, drop, alter table index + def tableName = 'test_ngram_bloomfilter_index' + sql "DROP TABLE IF EXISTS ${tableName}" + sql """ + CREATE TABLE IF NOT EXISTS ${tableName} ( + `key_id` bigint(20) NULL COMMENT '', + `category` varchar(200) NULL COMMENT '', + `https_url` varchar(300) NULL COMMENT '', + `hostname` varchar(300) NULL, + `http_url` text NULL COMMENT '', + `url_path` varchar(2000) NULL COMMENT '', + `cnt` bigint(20) NULL COMMENT '', + `host_flag` boolean NULL COMMENT '', + INDEX idx_ngrambf (`http_url`) USING NGRAM_BF PROPERTIES("gram_size" = "2", "bf_size" = "512") + ) ENGINE=OLAP + DUPLICATE KEY(`key_id`, `category`) + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`key_id`) BUCKETS 3 + PROPERTIES("replication_num" = "1"); + """ + + sql "INSERT INTO ${tableName} values (1, 'dt_bjn001', 'p9-webcast-sign.douyinpic.com', 'test', '/%/7212503657802320699%', '/test', 100, false);" + sql "INSERT INTO ${tableName} values (1, 'dt_bjn001', 'p9-webcast-sign.douyinpic.com', 'test', '/%/7212503657802320699%xxx', '/test', 100, false);" + + + sql "SET enable_function_pushdown = true" + + qt_select_all_1 "SELECT * FROM ${tableName}" + qt_select_eq_1 "SELECT * FROM ${tableName} WHERE http_url = '/%/7212503657802320699%'" + qt_select_in_1 "SELECT * FROM ${tableName} WHERE http_url IN ('/%/7212503657802320699%')" + qt_select_like_1 "SELECT * FROM ${tableName} WHERE http_url like '/%/7212503657802320699%'" + + // delete and then select + sql "DELETE FROM ${tableName} WHERE http_url IN ('/%/7212503657802320699%')" + qt_select_all_2 "SELECT * FROM ${tableName}" + qt_select_eq_2 "SELECT * FROM ${tableName} WHERE http_url = '/%/7212503657802320699%'" + qt_select_in_2 "SELECT * FROM ${tableName} WHERE http_url IN ('/%/7212503657802320699%')" + qt_select_like_2 "SELECT * FROM ${tableName} WHERE http_url like '/%/7212503657802320699%'" + + sql "DELETE FROM ${tableName} WHERE http_url = 
'/%/7212503657802320699%xxx'" + qt_select_all_3 "SELECT * FROM ${tableName}" + qt_select_eq_3 "SELECT * FROM ${tableName} WHERE http_url = '/%/7212503657802320699%'" + qt_select_in_3 "SELECT * FROM ${tableName} WHERE http_url IN ('/%/7212503657802320699%')" + qt_select_like_3 "SELECT * FROM ${tableName} WHERE http_url like '/%/7212503657802320699%'" +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org