This is an automated email from the ASF dual-hosted git repository.

wangbo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 48222f1fb0 [fix](storage)bloom filter support ColumnDict (#9167)
48222f1fb0 is described below

commit 48222f1fb0381dc3e7182f3ea6a97a4ff80a02a4
Author: wangbo <wan...@apache.org>
AuthorDate: Thu Apr 28 20:03:26 2022 +0800

    [fix](storage)bloom filter support ColumnDict (#9167)
    
    bloom filter support ColumnDict(#9167)
---
 be/src/exprs/bloomfilter_predicate.h   | 28 ++++++++++++++++-------
 be/src/olap/bloom_filter_predicate.h   | 28 +++++++++++++++++++----
 be/src/vec/columns/column_dictionary.h | 42 ++++++++++++++++++++++++++++++++--
 3 files changed, 84 insertions(+), 14 deletions(-)

diff --git a/be/src/exprs/bloomfilter_predicate.h 
b/be/src/exprs/bloomfilter_predicate.h
index d9ce266c91..ae1a8945e8 100644
--- a/be/src/exprs/bloomfilter_predicate.h
+++ b/be/src/exprs/bloomfilter_predicate.h
@@ -61,8 +61,9 @@ public:
 
     size_t size() { return _bloom_filter->directory().size; }
 
-    bool test_bytes(const char* data, size_t len) const {
-        return _bloom_filter->find(Slice(data, len));
+    template <typename T>
+    bool test(T data) const {
+        return _bloom_filter->find(data);
     }
 
     void add_bytes(const char* data, size_t len) { 
_bloom_filter->insert(Slice(data, len)); }
@@ -83,6 +84,7 @@ public:
     virtual void insert(const void* data) = 0;
     virtual bool find(const void* data) const = 0;
     virtual bool find_olap_engine(const void* data) const = 0;
+    virtual bool find_uint32_t(uint32_t data) const = 0;
 
     virtual Status merge(IBloomFilterFuncBase* bloomfilter_func) = 0;
     virtual Status assign(const char* data, int len) = 0;
@@ -171,12 +173,15 @@ struct CommonFindOp {
         bloom_filter.add_bytes((char*)data, sizeof(T));
     }
     ALWAYS_INLINE bool find(const BloomFilterAdaptor& bloom_filter, const 
void* data) const {
-        return bloom_filter.test_bytes((char*)data, sizeof(T));
+        return bloom_filter.test(Slice((char*)data, sizeof(T)));
     }
     ALWAYS_INLINE bool find_olap_engine(const BloomFilterAdaptor& bloom_filter,
                                         const void* data) const {
         return this->find(bloom_filter, data);
     }
+    ALWAYS_INLINE bool find(const BloomFilterAdaptor& bloom_filter, uint32_t 
data) const {
+        return bloom_filter.test(data);
+    }
 };
 
 template <class BloomFilterAdaptor>
@@ -192,12 +197,15 @@ struct StringFindOp {
         if (value == nullptr) {
             return false;
         }
-        return bloom_filter.test_bytes(value->ptr, value->len);
+        return bloom_filter.test(Slice(value->ptr, value->len));
     }
     ALWAYS_INLINE bool find_olap_engine(const BloomFilterAdaptor& bloom_filter,
                                         const void* data) const {
         return StringFindOp::find(bloom_filter, data);
     }
+    ALWAYS_INLINE bool find(const BloomFilterAdaptor& bloom_filter, uint32_t 
data) const {
+        return bloom_filter.test(data);
+    }
 };
 
 // We do not need to judge whether data is empty, because null will not appear
@@ -210,7 +218,7 @@ struct FixedStringFindOp : public 
StringFindOp<BloomFilterAdaptor> {
         int64_t size = value->len;
         char* data = value->ptr;
         while (size > 0 && data[size - 1] == '\0') size--;
-        return bloom_filter.test_bytes(value->ptr, size);
+        return bloom_filter.test(Slice(value->ptr, size));
     }
 };
 
@@ -219,7 +227,7 @@ struct DateTimeFindOp : public CommonFindOp<DateTimeValue, 
BloomFilterAdaptor> {
     bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* 
data) const {
         DateTimeValue value;
         value.from_olap_datetime(*reinterpret_cast<const uint64_t*>(data));
-        return bloom_filter.test_bytes((char*)&value, sizeof(DateTimeValue));
+        return bloom_filter.test(Slice((char*)&value, sizeof(DateTimeValue)));
     }
 };
 
@@ -238,7 +246,7 @@ struct DateFindOp : public CommonFindOp<DateTimeValue, 
BloomFilterAdaptor> {
 
         char data_bytes[sizeof(date_value)];
         memcpy(&data_bytes, &date_value, sizeof(date_value));
-        return bloom_filter.test_bytes(data_bytes, sizeof(DateTimeValue));
+        return bloom_filter.test(Slice(data_bytes, sizeof(DateTimeValue)));
     }
 };
 
@@ -254,7 +262,7 @@ struct DecimalV2FindOp : public 
CommonFindOp<DecimalV2Value, BloomFilterAdaptor>
         constexpr int decimal_value_sz = sizeof(DecimalV2Value);
         char data_bytes[decimal_value_sz];
         memcpy(&data_bytes, &value, decimal_value_sz);
-        return bloom_filter.test_bytes(data_bytes, decimal_value_sz);
+        return bloom_filter.test(Slice(data_bytes, decimal_value_sz));
     }
 };
 
@@ -314,6 +322,10 @@ public:
     bool find_olap_engine(const void* data) const override {
         return dummy.find_olap_engine(*this->_bloom_filter, data);
     }
+    
+    bool find_uint32_t(uint32_t data) const override {
+        return dummy.find(*this->_bloom_filter, data);
+    }
 
 private:
     typename BloomFilterTypeTraits<type, BloomFilterAdaptor>::FindOp dummy;
diff --git a/be/src/olap/bloom_filter_predicate.h 
b/be/src/olap/bloom_filter_predicate.h
index 8094b7b9b0..1605248229 100644
--- a/be/src/olap/bloom_filter_predicate.h
+++ b/be/src/olap/bloom_filter_predicate.h
@@ -30,6 +30,7 @@
 #include "vec/columns/column_vector.h"
 #include "vec/columns/predicate_column.h"
 #include "vec/utils/util.hpp"
+#include "vec/columns/column_dictionary.h"
 
 namespace doris {
 
@@ -115,14 +116,33 @@ void 
BloomFilterColumnPredicate<T>::evaluate(vectorized::IColumn& column, uint16
     if (column.is_nullable()) {
         auto* nullable_col = 
vectorized::check_and_get_column<vectorized::ColumnNullable>(column);
         auto& null_map_data = nullable_col->get_null_map_column().get_data();
-        auto* pred_col = 
vectorized::check_and_get_column<vectorized::PredicateColumnType<FT>>(
+        // deal ColumnDict
+        if (nullable_col->get_nested_column().is_column_dictionary()) {
+            auto* dict_col = 
vectorized::check_and_get_column<vectorized::ColumnDictI32>(nullable_col->get_nested_column());
+            
const_cast<vectorized::ColumnDictI32*>(dict_col)->generate_hash_values();
+            for (uint16_t i = 0; i < *size; i++) {
+                uint16_t idx = sel[i];
+                sel[new_size] = idx;
+                new_size += (!null_map_data[idx]) && 
_specific_filter->find_uint32_t(dict_col->get_hash_value(idx));
+            }
+        } else {
+            auto* pred_col = 
vectorized::check_and_get_column<vectorized::PredicateColumnType<FT>>(
                 nullable_col->get_nested_column());
-        auto& pred_col_data = pred_col->get_data();
+            auto& pred_col_data = pred_col->get_data();
+            for (uint16_t i = 0; i < *size; i++) {
+                uint16_t idx = sel[i];
+                sel[new_size] = idx;
+                const auto* cell_value = reinterpret_cast<const 
void*>(&(pred_col_data[idx]));
+                new_size += (!null_map_data[idx]) && 
_specific_filter->find_olap_engine(cell_value);
+            }
+        }
+    } else if (column.is_column_dictionary()) {
+        auto* dict_col = 
vectorized::check_and_get_column<vectorized::ColumnDictI32>(column);
+        
const_cast<vectorized::ColumnDictI32*>(dict_col)->generate_hash_values();
         for (uint16_t i = 0; i < *size; i++) {
             uint16_t idx = sel[i];
             sel[new_size] = idx;
-            const auto* cell_value = reinterpret_cast<const 
void*>(&(pred_col_data[idx]));
-            new_size += (!null_map_data[idx]) && 
_specific_filter->find_olap_engine(cell_value);
+            new_size += 
_specific_filter->find_uint32_t(dict_col->get_hash_value(idx));
         }
     } else {
         auto* pred_col =
diff --git a/be/src/vec/columns/column_dictionary.h 
b/be/src/vec/columns/column_dictionary.h
index dab8d81ee3..50f764a447 100644
--- a/be/src/vec/columns/column_dictionary.h
+++ b/be/src/vec/columns/column_dictionary.h
@@ -67,6 +67,7 @@ public:
     using value_type = T;
     using Container = PaddedPODArray<value_type>;
     using DictContainer = PaddedPODArray<StringValue>;
+    using HashValueContainer = PaddedPODArray<uint32_t>; // used for bloom 
filter
 
     bool is_column_dictionary() const override { return true; }
 
@@ -106,6 +107,7 @@ public:
     void clear() override {
         _codes.clear();
         _dict_code_converted = false;
+        _dict.clear_hash_values();
     }
 
     // TODO: Make dict memory usage more precise
@@ -251,6 +253,14 @@ public:
         return _dict.find_code_by_bound(value, greater, eq);
     }
 
+    void generate_hash_values() {
+        _dict.generate_hash_values();
+    }
+
+    uint32_t get_hash_value(uint32_t idx) const {
+        return _dict.get_hash_value(_codes[idx]);
+    }
+
     phmap::flat_hash_set<int32_t> find_codes(
             const phmap::flat_hash_set<StringValue>& values) const {
         return _dict.find_codes(values);
@@ -297,6 +307,23 @@ public:
             return -1;
         }
 
+        inline StringValue& get_value(T code) { return _dict_data[code]; }
+        
+        inline void generate_hash_values() {
+            if (_hash_values.size() == 0) {
+                _hash_values.resize(_dict_data.size());
+                for (size_t i = 0; i < _dict_data.size(); i++) {
+                    auto& sv = _dict_data[i];
+                    uint32_t hash_val = HashUtil::murmur_hash3_32(sv.ptr, 
sv.len, 0);
+                    _hash_values[i] = hash_val;
+                }
+            }
+        }
+
+        inline uint32_t get_hash_value(T code) const {
+            return _hash_values[code];
+        }
+
         // For > , code takes upper_bound - 1; For >= , code takes upper_bound
         // For < , code takes upper_bound; For <=, code takes upper_bound - 1
         // For example a sorted dict: <'b',0> <'c',1> <'d',2>
@@ -336,12 +363,15 @@ public:
             return code_set;
         }
 
-        StringValue& get_value(T code) { return _dict_data[code]; }
-
         void clear() {
             _dict_data.clear();
             _inverted_index.clear();
             _code_convert_map.clear();
+            _hash_values.clear();
+        }
+
+        void clear_hash_values() {
+            _hash_values.clear();
         }
 
         void sort() {
@@ -365,6 +395,12 @@ public:
         phmap::flat_hash_map<StringValue, T, StringValue::HashOfStringValue> 
_inverted_index;
         // data page code -> sorted dict code, only used for range comparison 
predicate
         phmap::flat_hash_map<T, T> _code_convert_map;
+        // hash value of origin string , used for bloom filter
+        // It's a trade-off of space for performance
+        // But in TPC-DS 1GB q60,we see no significant improvement. 
+        // This may because the magnitude of the data is not large enough(in 
q60, only about 80k rows data is filtered for largest table)
+        // So we may need more test here.
+        HashValueContainer _hash_values;
     };
 
 private:
@@ -381,4 +417,6 @@ template class ColumnDictionary<uint16_t>;
 template class ColumnDictionary<uint32_t>;
 template class ColumnDictionary<int32_t>;
 
+using ColumnDictI32 = vectorized::ColumnDictionary<doris::vectorized::Int32>;
+
 } // namespace doris::vectorized


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to