This is an automated email from the ASF dual-hosted git repository. wangbo pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push: new 48222f1fb0 [fix](storage)bloom filter support ColumnDict (#9167) 48222f1fb0 is described below commit 48222f1fb0381dc3e7182f3ea6a97a4ff80a02a4 Author: wangbo <wan...@apache.org> AuthorDate: Thu Apr 28 20:03:26 2022 +0800 [fix](storage)bloom filter support ColumnDict (#9167) bloom filter support ColumnDict(#9167) --- be/src/exprs/bloomfilter_predicate.h | 28 ++++++++++++++++------- be/src/olap/bloom_filter_predicate.h | 28 +++++++++++++++++++---- be/src/vec/columns/column_dictionary.h | 42 ++++++++++++++++++++++++++++++++-- 3 files changed, 84 insertions(+), 14 deletions(-) diff --git a/be/src/exprs/bloomfilter_predicate.h b/be/src/exprs/bloomfilter_predicate.h index d9ce266c91..ae1a8945e8 100644 --- a/be/src/exprs/bloomfilter_predicate.h +++ b/be/src/exprs/bloomfilter_predicate.h @@ -61,8 +61,9 @@ public: size_t size() { return _bloom_filter->directory().size; } - bool test_bytes(const char* data, size_t len) const { - return _bloom_filter->find(Slice(data, len)); + template <typename T> + bool test(T data) const { + return _bloom_filter->find(data); } void add_bytes(const char* data, size_t len) { _bloom_filter->insert(Slice(data, len)); } @@ -83,6 +84,7 @@ public: virtual void insert(const void* data) = 0; virtual bool find(const void* data) const = 0; virtual bool find_olap_engine(const void* data) const = 0; + virtual bool find_uint32_t(uint32_t data) const = 0; virtual Status merge(IBloomFilterFuncBase* bloomfilter_func) = 0; virtual Status assign(const char* data, int len) = 0; @@ -171,12 +173,15 @@ struct CommonFindOp { bloom_filter.add_bytes((char*)data, sizeof(T)); } ALWAYS_INLINE bool find(const BloomFilterAdaptor& bloom_filter, const void* data) const { - return bloom_filter.test_bytes((char*)data, sizeof(T)); + return bloom_filter.test(Slice((char*)data, sizeof(T))); } ALWAYS_INLINE bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const { return this->find(bloom_filter, data); } + ALWAYS_INLINE bool find(const BloomFilterAdaptor& bloom_filter, uint32_t data) const { + return bloom_filter.test(data); + } }; template <class BloomFilterAdaptor> @@ -192,12 +197,15 @@ struct StringFindOp { if (value == nullptr) { return false; } - return bloom_filter.test_bytes(value->ptr, value->len); + return bloom_filter.test(Slice(value->ptr, value->len)); } ALWAYS_INLINE bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const { return StringFindOp::find(bloom_filter, data); } + ALWAYS_INLINE bool find(const BloomFilterAdaptor& bloom_filter, uint32_t data) const { + return bloom_filter.test(data); + } }; // We do not need to judge whether data is empty, because null will not appear @@ -210,7 +218,7 @@ struct FixedStringFindOp : public StringFindOp<BloomFilterAdaptor> { int64_t size = value->len; char* data = value->ptr; while (size > 0 && data[size - 1] == '\0') size--; - return bloom_filter.test_bytes(value->ptr, size); + return bloom_filter.test(Slice(value->ptr, size)); } }; @@ -219,7 +227,7 @@ struct DateTimeFindOp : public CommonFindOp<DateTimeValue, BloomFilterAdaptor> { bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const { DateTimeValue value; value.from_olap_datetime(*reinterpret_cast<const uint64_t*>(data)); - return bloom_filter.test_bytes((char*)&value, sizeof(DateTimeValue)); + return bloom_filter.test(Slice((char*)&value, sizeof(DateTimeValue))); } }; @@ -238,7 +246,7 @@ struct DateFindOp : public CommonFindOp<DateTimeValue, BloomFilterAdaptor> { char data_bytes[sizeof(date_value)]; memcpy(&data_bytes, &date_value, sizeof(date_value)); - return bloom_filter.test_bytes(data_bytes, sizeof(DateTimeValue)); + return bloom_filter.test(Slice(data_bytes, sizeof(DateTimeValue))); } }; @@ -254,7 +262,7 @@ struct DecimalV2FindOp : public CommonFindOp<DecimalV2Value, BloomFilterAdaptor> constexpr int decimal_value_sz = sizeof(DecimalV2Value); char data_bytes[decimal_value_sz]; memcpy(&data_bytes, &value, decimal_value_sz); - return bloom_filter.test_bytes(data_bytes, decimal_value_sz); + return bloom_filter.test(Slice(data_bytes, decimal_value_sz)); } }; @@ -314,6 +322,10 @@ public: bool find_olap_engine(const void* data) const override { return dummy.find_olap_engine(*this->_bloom_filter, data); } + + bool find_uint32_t(uint32_t data) const override { + return dummy.find(*this->_bloom_filter, data); + } private: typename BloomFilterTypeTraits<type, BloomFilterAdaptor>::FindOp dummy; diff --git a/be/src/olap/bloom_filter_predicate.h b/be/src/olap/bloom_filter_predicate.h index 8094b7b9b0..1605248229 100644 --- a/be/src/olap/bloom_filter_predicate.h +++ b/be/src/olap/bloom_filter_predicate.h @@ -30,6 +30,7 @@ #include "vec/columns/column_vector.h" #include "vec/columns/predicate_column.h" #include "vec/utils/util.hpp" +#include "vec/columns/column_dictionary.h" namespace doris { @@ -115,14 +116,33 @@ void BloomFilterColumnPredicate<T>::evaluate(vectorized::IColumn& column, uint16 if (column.is_nullable()) { auto* nullable_col = vectorized::check_and_get_column<vectorized::ColumnNullable>(column); auto& null_map_data = nullable_col->get_null_map_column().get_data(); - auto* pred_col = vectorized::check_and_get_column<vectorized::PredicateColumnType<FT>>( + // deal ColumnDict + if (nullable_col->get_nested_column().is_column_dictionary()) { + auto* dict_col = vectorized::check_and_get_column<vectorized::ColumnDictI32>(nullable_col->get_nested_column()); + const_cast<vectorized::ColumnDictI32*>(dict_col)->generate_hash_values(); + for (uint16_t i = 0; i < *size; i++) { + uint16_t idx = sel[i]; + sel[new_size] = idx; + new_size += (!null_map_data[idx]) && _specific_filter->find_uint32_t(dict_col->get_hash_value(idx)); + } + } else { + auto* pred_col = vectorized::check_and_get_column<vectorized::PredicateColumnType<FT>>( nullable_col->get_nested_column()); - auto& pred_col_data = pred_col->get_data(); + auto& pred_col_data = pred_col->get_data(); + for (uint16_t i = 0; i < *size; i++) { + uint16_t idx = sel[i]; + sel[new_size] = idx; + const auto* cell_value = reinterpret_cast<const void*>(&(pred_col_data[idx])); + new_size += (!null_map_data[idx]) && _specific_filter->find_olap_engine(cell_value); + } + } + } else if (column.is_column_dictionary()) { + auto* dict_col = vectorized::check_and_get_column<vectorized::ColumnDictI32>(column); + const_cast<vectorized::ColumnDictI32*>(dict_col)->generate_hash_values(); for (uint16_t i = 0; i < *size; i++) { uint16_t idx = sel[i]; sel[new_size] = idx; - const auto* cell_value = reinterpret_cast<const void*>(&(pred_col_data[idx])); - new_size += (!null_map_data[idx]) && _specific_filter->find_olap_engine(cell_value); + new_size += _specific_filter->find_uint32_t(dict_col->get_hash_value(idx)); } } else { auto* pred_col = diff --git a/be/src/vec/columns/column_dictionary.h b/be/src/vec/columns/column_dictionary.h index dab8d81ee3..50f764a447 100644 --- a/be/src/vec/columns/column_dictionary.h +++ b/be/src/vec/columns/column_dictionary.h @@ -67,6 +67,7 @@ public: using value_type = T; using Container = PaddedPODArray<value_type>; using DictContainer = PaddedPODArray<StringValue>; + using HashValueContainer = PaddedPODArray<uint32_t>; // used for bloom filter bool is_column_dictionary() const override { return true; } @@ -106,6 +107,7 @@ public: void clear() override { _codes.clear(); _dict_code_converted = false; + _dict.clear_hash_values(); } // TODO: Make dict memory usage more precise @@ -251,6 +253,14 @@ public: return _dict.find_code_by_bound(value, greater, eq); } + void generate_hash_values() { + _dict.generate_hash_values(); + } + + uint32_t get_hash_value(uint32_t idx) const { + return _dict.get_hash_value(_codes[idx]); + } + phmap::flat_hash_set<int32_t> find_codes( const phmap::flat_hash_set<StringValue>& values) const { return _dict.find_codes(values); @@ -297,6 +307,23 @@ public: return -1; } + inline StringValue& get_value(T code) { return _dict_data[code]; } + + inline void generate_hash_values() { + if (_hash_values.size() == 0) { + _hash_values.resize(_dict_data.size()); + for (size_t i = 0; i < _dict_data.size(); i++) { + auto& sv = _dict_data[i]; + uint32_t hash_val = HashUtil::murmur_hash3_32(sv.ptr, sv.len, 0); + _hash_values[i] = hash_val; + } + } + } + + inline uint32_t get_hash_value(T code) const { + return _hash_values[code]; + } + // For > , code takes upper_bound - 1; For >= , code takes upper_bound // For < , code takes upper_bound; For <=, code takes upper_bound - 1 // For example a sorted dict: <'b',0> <'c',1> <'d',2> @@ -336,12 +363,15 @@ public: return code_set; } - StringValue& get_value(T code) { return _dict_data[code]; } - void clear() { _dict_data.clear(); _inverted_index.clear(); _code_convert_map.clear(); + _hash_values.clear(); + } + + void clear_hash_values() { + _hash_values.clear(); } void sort() { @@ -365,6 +395,12 @@ public: phmap::flat_hash_map<StringValue, T, StringValue::HashOfStringValue> _inverted_index; // data page code -> sorted dict code, only used for range comparison predicate phmap::flat_hash_map<T, T> _code_convert_map; + // hash value of origin string , used for bloom filter + // It's a trade-off of space for performance + // But in TPC-DS 1GB q60,we see no significant improvement. + // This may because the magnitude of the data is not large enough(in q60, only about 80k rows data is filtered for largest table) + // So we may need more test here. + HashValueContainer _hash_values; }; private: @@ -381,4 +417,6 @@ template class ColumnDictionary<uint16_t>; template class ColumnDictionary<uint32_t>; template class ColumnDictionary<int32_t>; +using ColumnDictI32 = vectorized::ColumnDictionary<doris::vectorized::Int32>; + } // namespace doris::vectorized --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org