This is an automated email from the ASF dual-hosted git repository.

wangbo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 802fcbb  (#8162)refactor binary dict
802fcbb is described below

commit 802fcbbb056a0c62be2160461ebb5c5e6e11f576
Author: zuochunwei <zchw...@qq.com>
AuthorDate: Tue Feb 22 11:23:54 2022 +0800

    (#8162)refactor binary dict
    
    Co-authored-by: zuochunwei <zuochun...@meituan.com>
---
 be/src/olap/rowset/segment_v2/binary_dict_page.cpp | 14 +++++------
 be/src/olap/rowset/segment_v2/binary_dict_page.h   |  5 ++--
 be/src/olap/rowset/segment_v2/binary_plain_page.h  | 16 ++++++++++++
 be/src/olap/rowset/segment_v2/column_reader.cpp    | 16 +++---------
 be/src/olap/rowset/segment_v2/column_reader.h      |  3 +--
 be/src/vec/columns/column.h                        |  3 +--
 be/src/vec/columns/column_nullable.h               |  5 ++--
 be/src/vec/columns/column_string.h                 |  9 +++----
 be/src/vec/columns/predicate_column.h              | 29 +++++++++-------------
 .../rowset/segment_v2/binary_dict_page_test.cpp    | 24 +++++-------------
 be/test/tools/benchmark_tool.cpp                   | 13 +++-------
 11 files changed, 56 insertions(+), 81 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp 
b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
index 413b082..974679b 100644
--- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
+++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
@@ -209,7 +209,7 @@ Status BinaryDictPageDecoder::init() {
         TypeInfo* type_info = get_scalar_type_info(OLAP_FIELD_TYPE_INT);
 
         RETURN_IF_ERROR(ColumnVectorBatch::create(0, false, type_info, 
nullptr, &_batch));
-        _data_page_decoder.reset(new 
BitShufflePageDecoder<OLAP_FIELD_TYPE_INT>(_data, _options));
+        _data_page_decoder.reset(_bit_shuffle_ptr = new 
BitShufflePageDecoder<OLAP_FIELD_TYPE_INT>(_data, _options));
     } else if (_encoding_type == PLAIN_ENCODING) {
         DCHECK_EQ(_encoding_type, PLAIN_ENCODING);
         _data_page_decoder.reset(new BinaryPlainPageDecoder(_data, _options));
@@ -233,11 +233,9 @@ bool BinaryDictPageDecoder::is_dict_encoding() const {
     return _encoding_type == DICT_ENCODING;
 }
 
-void BinaryDictPageDecoder::set_dict_decoder(PageDecoder* dict_decoder, 
uint32_t* start_offset_array, uint32_t* len_array) {
+void BinaryDictPageDecoder::set_dict_decoder(PageDecoder* dict_decoder, 
StringRef* dict_word_info) {
     _dict_decoder = (BinaryPlainPageDecoder*)dict_decoder;
-    _bit_shuffle_ptr = 
reinterpret_cast<BitShufflePageDecoder<OLAP_FIELD_TYPE_INT>*>(_data_page_decoder.get());
-    _start_offset_array = start_offset_array;
-    _len_array = len_array;
+    _dict_word_info = dict_word_info;
 };
 
 Status BinaryDictPageDecoder::next_batch(size_t* n, 
vectorized::MutableColumnPtr &dst) {
@@ -259,8 +257,8 @@ Status BinaryDictPageDecoder::next_batch(size_t* n, 
vectorized::MutableColumnPtr
     const int32_t* data_array = reinterpret_cast<const 
int32_t*>(_bit_shuffle_ptr->_chunk.data);
     size_t start_index = _bit_shuffle_ptr->_cur_index;
 
-    dst->insert_many_dict_data(data_array, start_index, _start_offset_array, 
_len_array, 
-        _dict_decoder->_data.mutable_data(), max_fetch);
+    dst->insert_many_dict_data(data_array, start_index, _dict_word_info, 
max_fetch);
+
     _bit_shuffle_ptr->_cur_index += max_fetch;
  
     return Status::OK();
@@ -291,7 +289,7 @@ Status BinaryDictPageDecoder::next_batch(size_t* n, 
ColumnBlockView* dst) {
     for (int i = 0; i < len; ++i) {
         int32_t codeword = *reinterpret_cast<const 
int32_t*>(column_block.cell_ptr(i));
         // get the string from the dict decoder
-        *out = Slice(&_dict_decoder->_data[_start_offset_array[codeword]], 
_len_array[codeword]);
+        *out = Slice(_dict_word_info[codeword].data, 
_dict_word_info[codeword].size);
         mem_len[i] = out->size;
         out++;
     }
diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.h 
b/be/src/olap/rowset/segment_v2/binary_dict_page.h
index 15f11aa..54754be 100644
--- a/be/src/olap/rowset/segment_v2/binary_dict_page.h
+++ b/be/src/olap/rowset/segment_v2/binary_dict_page.h
@@ -115,7 +115,7 @@ public:
 
     bool is_dict_encoding() const;
 
-    void set_dict_decoder(PageDecoder* dict_decoder, uint32_t* 
start_offset_array = nullptr, uint32_t* len_array = nullptr);
+    void set_dict_decoder(PageDecoder* dict_decoder, StringRef* 
dict_word_info);
 
     ~BinaryDictPageDecoder();
 
@@ -130,8 +130,7 @@ private:
     // use as data buf.
     std::unique_ptr<ColumnVectorBatch> _batch;
 
-    uint32_t* _start_offset_array = nullptr;
-    uint32_t* _len_array = nullptr;
+    StringRef* _dict_word_info = nullptr;
 };
 
 } // namespace segment_v2
diff --git a/be/src/olap/rowset/segment_v2/binary_plain_page.h 
b/be/src/olap/rowset/segment_v2/binary_plain_page.h
index 20e0c98..2060bd5 100644
--- a/be/src/olap/rowset/segment_v2/binary_plain_page.h
+++ b/be/src/olap/rowset/segment_v2/binary_plain_page.h
@@ -268,6 +268,22 @@ public:
         return Slice(&_data[start_offset], len);
     }
 
+    void get_dict_word_info(StringRef* dict_word_info) {
+        char* data_begin = (char*)&_data[0];
+        char* offset_ptr = (char*)&_data[_offsets_pos];
+
+        for (uint32_t i = 0; i < _num_elems; ++i) {
+            dict_word_info[i].data = data_begin + 
decode_fixed32_le((uint8_t*)offset_ptr);
+            offset_ptr += sizeof(uint32_t);
+        }
+
+        for (int i = 0; i < (int)_num_elems - 1; ++i) {
+            dict_word_info[i].size = (char*)dict_word_info[i+1].data - 
(char*)dict_word_info[i].data;
+        }
+
+        dict_word_info[_num_elems-1].size = (data_begin + _offsets_pos) - 
(char*)dict_word_info[_num_elems-1].data;
+    }
+
 private:
     // Return the offset within '_data' where the string value with index 
'idx' can be found.
     uint32_t offset(size_t idx) const {
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp 
b/be/src/olap/rowset/segment_v2/column_reader.cpp
index 9b14ff4..93a9151 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -672,23 +672,15 @@ Status FileColumnIterator::_read_data_page(const 
OrdinalPageIndexIterator& iter)
                                                    &_dict_page_handle, 
&dict_data, &dict_footer));
                 // ignore dict_footer.dict_page_footer().encoding() due to only
                 // PLAIN_ENCODING is supported for dict page right now
-                _dict_decoder.reset(new BinaryPlainPageDecoder(dict_data));
+                _dict_decoder = 
std::make_unique<BinaryPlainPageDecoder>(dict_data);
                 RETURN_IF_ERROR(_dict_decoder->init());
 
                 auto* pd_decoder = 
(BinaryPlainPageDecoder*)_dict_decoder.get();
-                _dict_start_offset_array.reset(new 
uint32_t[pd_decoder->_num_elems]);
-                _dict_len_array.reset(new uint32_t[pd_decoder->_num_elems]);
-
-                // todo(wb) padding dict value for SIMD comparison
-                for (int i = 0; i < pd_decoder->_num_elems; i++) {
-                    const uint32_t start_offset = pd_decoder->offset(i);
-                    uint32_t len = pd_decoder->offset(i + 1) - start_offset;
-                    _dict_start_offset_array[i] = start_offset;
-                    _dict_len_array[i] = len;
-                }
+                _dict_word_info.reset(new StringRef[pd_decoder->_num_elems]);
+                pd_decoder->get_dict_word_info(_dict_word_info.get());
             }
 
-            dict_page_decoder->set_dict_decoder(_dict_decoder.get(), 
_dict_start_offset_array.get(), _dict_len_array.get());
+            dict_page_decoder->set_dict_decoder(_dict_decoder.get(), 
_dict_word_info.get());
         }
     }
     return Status::OK();
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h 
b/be/src/olap/rowset/segment_v2/column_reader.h
index db77577..e1cb2aa 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -304,8 +304,7 @@ private:
     // current value ordinal
     ordinal_t _current_ordinal = 0;
 
-    std::unique_ptr<uint32_t[]> _dict_start_offset_array;
-    std::unique_ptr<uint32_t[]> _dict_len_array;
+    std::unique_ptr<StringRef[]> _dict_word_info;
 };
 
 class ArrayFileColumnIterator final : public ColumnIterator {
diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h
index 10b1e00..cb04a1c 100644
--- a/be/src/vec/columns/column.h
+++ b/be/src/vec/columns/column.h
@@ -174,8 +174,7 @@ public:
       LOG(FATAL) << "Method insert_many_fix_len_data is not supported for " << 
get_name();
     }
  
-    virtual void insert_many_dict_data(const int32_t* data_array, size_t 
start_index, const uint32_t* start_offset_array, 
-        const uint32_t* len_array, char* dict_data, size_t num) {
+    virtual void insert_many_dict_data(const int32_t* data_array, size_t 
start_index, const StringRef* dict, size_t num) {
       LOG(FATAL) << "Method insert_many_dict_data is not supported for " << 
get_name();
     }
  
diff --git a/be/src/vec/columns/column_nullable.h 
b/be/src/vec/columns/column_nullable.h
index 0de1ea9..21c67a5 100644
--- a/be/src/vec/columns/column_nullable.h
+++ b/be/src/vec/columns/column_nullable.h
@@ -96,10 +96,9 @@ public:
         get_nested_column().insert_many_fix_len_data(pos, num);
     }
  
-    void insert_many_dict_data(const int32_t* data_array, size_t start_index, 
const uint32_t* start_offset_array, 
-        const uint32_t* len_array, char* dict_data, size_t num) override {
+    void insert_many_dict_data(const int32_t* data_array, size_t start_index, 
const StringRef* dict, size_t num) override {
         get_null_map_column().fill(0, num);
-        get_nested_column().insert_many_dict_data(data_array, start_index, 
start_offset_array, len_array, dict_data, num);
+        get_nested_column().insert_many_dict_data(data_array, start_index, 
dict, num);
     }
  
     void insert_many_binary_data(char* data_array, uint32_t* len_array, 
uint32_t* start_offset_array, size_t num) override {
diff --git a/be/src/vec/columns/column_string.h 
b/be/src/vec/columns/column_string.h
index 591fbe2..236216c 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -165,13 +165,10 @@ public:
         }
     };
  
-    void insert_many_dict_data (const int32_t* data_array, size_t start_index, 
const uint32_t* start_offset_array, 
-        const uint32_t* len_array, char* dict_data, size_t num) override {
-        for (int i = 0; i < num; i++, start_index++) {
+    void insert_many_dict_data(const int32_t* data_array, size_t start_index, 
const StringRef* dict, size_t num) override {
+        for (size_t end_index = start_index+num; start_index < end_index; 
++start_index) {
             int32_t codeword = data_array[start_index];
-            uint32_t start_offset = start_offset_array[codeword];
-            uint32_t str_len = len_array[codeword];
-            insert_data(dict_data + start_offset, str_len);
+            insert_data(dict[codeword].data, dict[codeword].size);
         }
     }
 
diff --git a/be/src/vec/columns/predicate_column.h 
b/be/src/vec/columns/predicate_column.h
index 890d9a2..69a89fd 100644
--- a/be/src/vec/columns/predicate_column.h
+++ b/be/src/vec/columns/predicate_column.h
@@ -169,12 +169,12 @@ public:
         LOG(FATAL) << "update_hash_with_value not supported in 
PredicateColumnType";
     }
 
-    void insert_string_value(char* data_ptr, size_t length) {
-        StringValue sv(data_ptr, length);
+    void insert_string_value(const char* data_ptr, size_t length) {
+        StringValue sv((char*)data_ptr, length);
         data.push_back_without_reserve(sv);
     }
 
-    void insert_decimal_value(char* data_ptr, size_t length) {
+    void insert_decimal_value(const char* data_ptr, size_t length) {
         decimal12_t dc12_value;
         dc12_value.integer = *(int64_t*)(data_ptr);
         dc12_value.fraction = *(int32_t*)(data_ptr + sizeof(int64_t));
@@ -182,27 +182,26 @@ public:
     }
 
     // used for int128
-    void insert_in_copy_way(char* data_ptr, size_t length) {
+    void insert_in_copy_way(const char* data_ptr, size_t length) {
         T val {};
         memcpy(&val, data_ptr, sizeof(val));
         data.push_back_without_reserve(val);
     }
 
-    void insert_default_type(char* data_ptr, size_t length) {
+    void insert_default_type(const char* data_ptr, size_t length) {
         T* val = (T*)data_ptr;
         data.push_back_without_reserve(*val);
     }
 
     void insert_data(const char* data_ptr, size_t length) override {
-        char* ch = const_cast<char*>(data_ptr);
         if constexpr (std::is_same_v<T, StringValue>) {
-            insert_string_value(ch, length);
+            insert_string_value(data_ptr, length);
         } else if constexpr (std::is_same_v<T, decimal12_t>) {
-            insert_decimal_value(ch, length);
+            insert_decimal_value(data_ptr, length);
         } else if constexpr (std::is_same_v<T, doris::vectorized::Int128>) {
-            insert_in_copy_way(ch, length);
+            insert_in_copy_way(data_ptr, length);
         } else {
-            insert_default_type(ch, length);
+            insert_default_type(data_ptr, length);
         }
     }
 
@@ -218,15 +217,11 @@ public:
         }
     }
 
-    void insert_many_dict_data(const int32_t* data_array, size_t start_index,
-                               const uint32_t* start_offset_array, const 
uint32_t* len_array,
-                               char* dict_data, size_t num) override {
+    void insert_many_dict_data(const int32_t* data_array, size_t start_index, 
const StringRef* dict, size_t num) override {
         if constexpr (std::is_same_v<T, StringValue>) {
-            for (int i = 0; i < num; i++, start_index++) {
+            for (size_t end_index = start_index+num; start_index < end_index; 
++start_index) {
                 int32_t codeword = data_array[start_index];
-                uint32_t start_offset = start_offset_array[codeword];
-                uint32_t str_len = len_array[codeword];
-                insert_string_value(dict_data + start_offset, str_len);
+                insert_string_value(dict[codeword].data, dict[codeword].size);
             }
         }
     }
diff --git a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp 
b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp
index a65b404..de48637 100644
--- a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp
+++ b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp
@@ -73,20 +73,14 @@ public:
         // because every slice is unique
         ASSERT_EQ(slices.size(), dict_page_decoder->count());
 
-        uint32_t dict_start_offset_array[dict_page_decoder->_num_elems];
-        uint32_t dict_len_array[dict_page_decoder->_num_elems];
-        for (int i = 0; i < dict_page_decoder->_num_elems; i++) {
-            const uint32_t start_offset = dict_page_decoder->offset(i);
-            uint32_t len = dict_page_decoder->offset(i + 1) - start_offset;
-            dict_start_offset_array[i] = start_offset;
-            dict_len_array[i] = len;
-        }
+        StringRef dict_word_info[dict_page_decoder->_num_elems];
+        dict_page_decoder->get_dict_word_info(dict_word_info);
 
         // decode
         PageDecoderOptions decoder_options;
         BinaryDictPageDecoder page_decoder(s.slice(), decoder_options);
 
-        page_decoder.set_dict_decoder(dict_page_decoder.get(), 
dict_start_offset_array, dict_len_array);
+        page_decoder.set_dict_decoder(dict_page_decoder.get(), dict_word_info);
 
         status = page_decoder.init();
         ASSERT_TRUE(status.ok());
@@ -177,21 +171,15 @@ public:
             status = dict_page_decoder->init();
             ASSERT_TRUE(status.ok());
 
-            uint32_t dict_start_offset_array[dict_page_decoder->_num_elems];
-            uint32_t dict_len_array[dict_page_decoder->_num_elems];
-            for (int i = 0; i < dict_page_decoder->_num_elems; i++) {
-                const uint32_t start_offset = dict_page_decoder->offset(i);
-                uint32_t len = dict_page_decoder->offset(i + 1) - start_offset;
-                dict_start_offset_array[i] = start_offset;
-                dict_len_array[i] = len;
-            }
+            StringRef dict_word_info[dict_page_decoder->_num_elems];
+            dict_page_decoder->get_dict_word_info(dict_word_info);
 
             // decode
             PageDecoderOptions decoder_options;
             BinaryDictPageDecoder page_decoder(results[slice_index].slice(), 
decoder_options);
             status = page_decoder.init();
 
-            page_decoder.set_dict_decoder(dict_page_decoder.get(), 
dict_start_offset_array, dict_len_array);
+            page_decoder.set_dict_decoder(dict_page_decoder.get(), 
dict_word_info);
             ASSERT_TRUE(status.ok());
 
             //check values
diff --git a/be/test/tools/benchmark_tool.cpp b/be/test/tools/benchmark_tool.cpp
index 7a1f708..e06cc3d 100644
--- a/be/test/tools/benchmark_tool.cpp
+++ b/be/test/tools/benchmark_tool.cpp
@@ -174,22 +174,15 @@ public:
                     new BinaryPlainPageDecoder(dict_slice.slice(), 
dict_decoder_options));
             dict_page_decoder->init();
 
-            uint32_t dict_start_offset_array[dict_page_decoder->_num_elems];
-            uint32_t dict_len_array[dict_page_decoder->_num_elems];
-            for (int i = 0; i < dict_page_decoder->_num_elems; i++) {
-                const uint32_t start_offset = dict_page_decoder->offset(i);
-                uint32_t len = dict_page_decoder->offset(i + 1) - start_offset;
-                dict_start_offset_array[i] = start_offset;
-                dict_len_array[i] = len;
-            }
+            StringRef dict_word_info[dict_page_decoder->_num_elems];
+            dict_page_decoder->get_dict_word_info(dict_word_info);
 
             // decode
             PageDecoderOptions decoder_options;
             BinaryDictPageDecoder page_decoder(src.slice(), decoder_options);
             page_decoder.init();
 
-            page_decoder.set_dict_decoder(dict_page_decoder.get(), 
dict_start_offset_array,
-                                          dict_len_array);
+            page_decoder.set_dict_decoder(dict_page_decoder.get(), 
dict_word_info);
 
             //check values
             size_t num = page_start_ids[slice_index + 1] - 
page_start_ids[slice_index];

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to