This is an automated email from the ASF dual-hosted git repository. wangbo pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push: new 802fcbb (#8162)refactor binary dict 802fcbb is described below commit 802fcbbb056a0c62be2160461ebb5c5e6e11f576 Author: zuochunwei <zchw...@qq.com> AuthorDate: Tue Feb 22 11:23:54 2022 +0800 (#8162)refactor binary dict Co-authored-by: zuochunwei <zuochun...@meituan.com> --- be/src/olap/rowset/segment_v2/binary_dict_page.cpp | 14 +++++------ be/src/olap/rowset/segment_v2/binary_dict_page.h | 5 ++-- be/src/olap/rowset/segment_v2/binary_plain_page.h | 16 ++++++++++++ be/src/olap/rowset/segment_v2/column_reader.cpp | 16 +++--------- be/src/olap/rowset/segment_v2/column_reader.h | 3 +-- be/src/vec/columns/column.h | 3 +-- be/src/vec/columns/column_nullable.h | 5 ++-- be/src/vec/columns/column_string.h | 9 +++---- be/src/vec/columns/predicate_column.h | 29 +++++++++------------- .../rowset/segment_v2/binary_dict_page_test.cpp | 24 +++++------------- be/test/tools/benchmark_tool.cpp | 13 +++------- 11 files changed, 56 insertions(+), 81 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp index 413b082..974679b 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp @@ -209,7 +209,7 @@ Status BinaryDictPageDecoder::init() { TypeInfo* type_info = get_scalar_type_info(OLAP_FIELD_TYPE_INT); RETURN_IF_ERROR(ColumnVectorBatch::create(0, false, type_info, nullptr, &_batch)); - _data_page_decoder.reset(new BitShufflePageDecoder<OLAP_FIELD_TYPE_INT>(_data, _options)); + _data_page_decoder.reset(_bit_shuffle_ptr = new BitShufflePageDecoder<OLAP_FIELD_TYPE_INT>(_data, _options)); } else if (_encoding_type == PLAIN_ENCODING) { DCHECK_EQ(_encoding_type, PLAIN_ENCODING); _data_page_decoder.reset(new BinaryPlainPageDecoder(_data, _options)); @@ -233,11 +233,9 @@ bool BinaryDictPageDecoder::is_dict_encoding() const { return _encoding_type == DICT_ENCODING; } -void BinaryDictPageDecoder::set_dict_decoder(PageDecoder* dict_decoder, uint32_t* start_offset_array, uint32_t* len_array) { +void BinaryDictPageDecoder::set_dict_decoder(PageDecoder* dict_decoder, StringRef* dict_word_info) { _dict_decoder = (BinaryPlainPageDecoder*)dict_decoder; - _bit_shuffle_ptr = reinterpret_cast<BitShufflePageDecoder<OLAP_FIELD_TYPE_INT>*>(_data_page_decoder.get()); - _start_offset_array = start_offset_array; - _len_array = len_array; + _dict_word_info = dict_word_info; }; Status BinaryDictPageDecoder::next_batch(size_t* n, vectorized::MutableColumnPtr &dst) { @@ -259,8 +257,8 @@ Status BinaryDictPageDecoder::next_batch(size_t* n, vectorized::MutableColumnPtr const int32_t* data_array = reinterpret_cast<const int32_t*>(_bit_shuffle_ptr->_chunk.data); size_t start_index = _bit_shuffle_ptr->_cur_index; - dst->insert_many_dict_data(data_array, start_index, _start_offset_array, _len_array, - _dict_decoder->_data.mutable_data(), max_fetch); + dst->insert_many_dict_data(data_array, start_index, _dict_word_info, max_fetch); + _bit_shuffle_ptr->_cur_index += max_fetch; return Status::OK(); @@ -291,7 +289,7 @@ Status BinaryDictPageDecoder::next_batch(size_t* n, ColumnBlockView* dst) { for (int i = 0; i < len; ++i) { int32_t codeword = *reinterpret_cast<const int32_t*>(column_block.cell_ptr(i)); // get the string from the dict decoder - *out = Slice(&_dict_decoder->_data[_start_offset_array[codeword]], _len_array[codeword]); + *out = Slice(_dict_word_info[codeword].data, _dict_word_info[codeword].size); mem_len[i] = out->size; out++; } diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.h b/be/src/olap/rowset/segment_v2/binary_dict_page.h index 15f11aa..54754be 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.h +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.h @@ -115,7 +115,7 @@ public: bool is_dict_encoding() const; - void set_dict_decoder(PageDecoder* dict_decoder, uint32_t* start_offset_array = nullptr, uint32_t* len_array = nullptr); + void set_dict_decoder(PageDecoder* dict_decoder, StringRef* dict_word_info); ~BinaryDictPageDecoder(); @@ -130,8 +130,7 @@ private: // use as data buf. std::unique_ptr<ColumnVectorBatch> _batch; - uint32_t* _start_offset_array = nullptr; - uint32_t* _len_array = nullptr; + StringRef* _dict_word_info = nullptr; }; } // namespace segment_v2 diff --git a/be/src/olap/rowset/segment_v2/binary_plain_page.h b/be/src/olap/rowset/segment_v2/binary_plain_page.h index 20e0c98..2060bd5 100644 --- a/be/src/olap/rowset/segment_v2/binary_plain_page.h +++ b/be/src/olap/rowset/segment_v2/binary_plain_page.h @@ -268,6 +268,22 @@ public: return Slice(&_data[start_offset], len); } + void get_dict_word_info(StringRef* dict_word_info) { + char* data_begin = (char*)&_data[0]; + char* offset_ptr = (char*)&_data[_offsets_pos]; + + for (uint32_t i = 0; i < _num_elems; ++i) { + dict_word_info[i].data = data_begin + decode_fixed32_le((uint8_t*)offset_ptr); + offset_ptr += sizeof(uint32_t); + } + + for (int i = 0; i < (int)_num_elems - 1; ++i) { + dict_word_info[i].size = (char*)dict_word_info[i+1].data - (char*)dict_word_info[i].data; + } + + dict_word_info[_num_elems-1].size = (data_begin + _offsets_pos) - (char*)dict_word_info[_num_elems-1].data; + } + private: // Return the offset within '_data' where the string value with index 'idx' can be found. uint32_t offset(size_t idx) const { diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index 9b14ff4..93a9151 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -672,23 +672,15 @@ Status FileColumnIterator::_read_data_page(const OrdinalPageIndexIterator& iter) &_dict_page_handle, &dict_data, &dict_footer)); // ignore dict_footer.dict_page_footer().encoding() due to only // PLAIN_ENCODING is supported for dict page right now - _dict_decoder.reset(new BinaryPlainPageDecoder(dict_data)); + _dict_decoder = std::make_unique<BinaryPlainPageDecoder>(dict_data); RETURN_IF_ERROR(_dict_decoder->init()); auto* pd_decoder = (BinaryPlainPageDecoder*)_dict_decoder.get(); - _dict_start_offset_array.reset(new uint32_t[pd_decoder->_num_elems]); - _dict_len_array.reset(new uint32_t[pd_decoder->_num_elems]); - - // todo(wb) padding dict value for SIMD comparison - for (int i = 0; i < pd_decoder->_num_elems; i++) { - const uint32_t start_offset = pd_decoder->offset(i); - uint32_t len = pd_decoder->offset(i + 1) - start_offset; - _dict_start_offset_array[i] = start_offset; - _dict_len_array[i] = len; - } + _dict_word_info.reset(new StringRef[pd_decoder->_num_elems]); + pd_decoder->get_dict_word_info(_dict_word_info.get()); } - dict_page_decoder->set_dict_decoder(_dict_decoder.get(), _dict_start_offset_array.get(), _dict_len_array.get()); + dict_page_decoder->set_dict_decoder(_dict_decoder.get(), _dict_word_info.get()); } } return Status::OK(); diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h index db77577..e1cb2aa 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.h +++ b/be/src/olap/rowset/segment_v2/column_reader.h @@ -304,8 +304,7 @@ private: // current value ordinal ordinal_t _current_ordinal = 0; - std::unique_ptr<uint32_t[]> _dict_start_offset_array; - std::unique_ptr<uint32_t[]> _dict_len_array; + std::unique_ptr<StringRef[]> _dict_word_info; }; class ArrayFileColumnIterator final : public ColumnIterator { diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h index 10b1e00..cb04a1c 100644 --- a/be/src/vec/columns/column.h +++ b/be/src/vec/columns/column.h @@ -174,8 +174,7 @@ public: LOG(FATAL) << "Method insert_many_fix_len_data is not supported for " << get_name(); } - virtual void insert_many_dict_data(const int32_t* data_array, size_t start_index, const uint32_t* start_offset_array, - const uint32_t* len_array, char* dict_data, size_t num) { + virtual void insert_many_dict_data(const int32_t* data_array, size_t start_index, const StringRef* dict, size_t num) { LOG(FATAL) << "Method insert_many_dict_data is not supported for " << get_name(); } diff --git a/be/src/vec/columns/column_nullable.h b/be/src/vec/columns/column_nullable.h index 0de1ea9..21c67a5 100644 --- a/be/src/vec/columns/column_nullable.h +++ b/be/src/vec/columns/column_nullable.h @@ -96,10 +96,9 @@ public: get_nested_column().insert_many_fix_len_data(pos, num); } - void insert_many_dict_data(const int32_t* data_array, size_t start_index, const uint32_t* start_offset_array, - const uint32_t* len_array, char* dict_data, size_t num) override { + void insert_many_dict_data(const int32_t* data_array, size_t start_index, const StringRef* dict, size_t num) override { get_null_map_column().fill(0, num); - get_nested_column().insert_many_dict_data(data_array, start_index, start_offset_array, len_array, dict_data, num); + get_nested_column().insert_many_dict_data(data_array, start_index, dict, num); } void insert_many_binary_data(char* data_array, uint32_t* len_array, uint32_t* start_offset_array, size_t num) override { diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index 591fbe2..236216c 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -165,13 +165,10 @@ public: } }; - void insert_many_dict_data (const int32_t* data_array, size_t start_index, const uint32_t* start_offset_array, - const uint32_t* len_array, char* dict_data, size_t num) override { - for (int i = 0; i < num; i++, start_index++) { + void insert_many_dict_data(const int32_t* data_array, size_t start_index, const StringRef* dict, size_t num) override { + for (size_t end_index = start_index+num; start_index < end_index; ++start_index) { int32_t codeword = data_array[start_index]; - uint32_t start_offset = start_offset_array[codeword]; - uint32_t str_len = len_array[codeword]; - insert_data(dict_data + start_offset, str_len); + insert_data(dict[codeword].data, dict[codeword].size); } } diff --git a/be/src/vec/columns/predicate_column.h b/be/src/vec/columns/predicate_column.h index 890d9a2..69a89fd 100644 --- a/be/src/vec/columns/predicate_column.h +++ b/be/src/vec/columns/predicate_column.h @@ -169,12 +169,12 @@ public: LOG(FATAL) << "update_hash_with_value not supported in PredicateColumnType"; } - void insert_string_value(char* data_ptr, size_t length) { - StringValue sv(data_ptr, length); + void insert_string_value(const char* data_ptr, size_t length) { + StringValue sv((char*)data_ptr, length); data.push_back_without_reserve(sv); } - void insert_decimal_value(char* data_ptr, size_t length) { + void insert_decimal_value(const char* data_ptr, size_t length) { decimal12_t dc12_value; dc12_value.integer = *(int64_t*)(data_ptr); dc12_value.fraction = *(int32_t*)(data_ptr + sizeof(int64_t)); @@ -182,27 +182,26 @@ public: } // used for int128 - void insert_in_copy_way(char* data_ptr, size_t length) { + void insert_in_copy_way(const char* data_ptr, size_t length) { T val {}; memcpy(&val, data_ptr, sizeof(val)); data.push_back_without_reserve(val); } - void insert_default_type(char* data_ptr, size_t length) { + void insert_default_type(const char* data_ptr, size_t length) { T* val = (T*)data_ptr; data.push_back_without_reserve(*val); } void insert_data(const char* data_ptr, size_t length) override { - char* ch = const_cast<char*>(data_ptr); if constexpr (std::is_same_v<T, StringValue>) { - insert_string_value(ch, length); + insert_string_value(data_ptr, length); } else if constexpr (std::is_same_v<T, decimal12_t>) { - insert_decimal_value(ch, length); + insert_decimal_value(data_ptr, length); } else if constexpr (std::is_same_v<T, doris::vectorized::Int128>) { - insert_in_copy_way(ch, length); + insert_in_copy_way(data_ptr, length); } else { - insert_default_type(ch, length); + insert_default_type(data_ptr, length); } } @@ -218,15 +217,11 @@ public: } } - void insert_many_dict_data(const int32_t* data_array, size_t start_index, - const uint32_t* start_offset_array, const uint32_t* len_array, - char* dict_data, size_t num) override { + void insert_many_dict_data(const int32_t* data_array, size_t start_index, const StringRef* dict, size_t num) override { if constexpr (std::is_same_v<T, StringValue>) { - for (int i = 0; i < num; i++, start_index++) { + for (size_t end_index = start_index+num; start_index < end_index; ++start_index) { int32_t codeword = data_array[start_index]; - uint32_t start_offset = start_offset_array[codeword]; - uint32_t str_len = len_array[codeword]; - insert_string_value(dict_data + start_offset, str_len); + insert_string_value(dict[codeword].data, dict[codeword].size); } } } diff --git a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp index a65b404..de48637 100644 --- a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp +++ b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp @@ -73,20 +73,14 @@ public: // because every slice is unique ASSERT_EQ(slices.size(), dict_page_decoder->count()); - uint32_t dict_start_offset_array[dict_page_decoder->_num_elems]; - uint32_t dict_len_array[dict_page_decoder->_num_elems]; - for (int i = 0; i < dict_page_decoder->_num_elems; i++) { - const uint32_t start_offset = dict_page_decoder->offset(i); - uint32_t len = dict_page_decoder->offset(i + 1) - start_offset; - dict_start_offset_array[i] = start_offset; - dict_len_array[i] = len; - } + StringRef dict_word_info[dict_page_decoder->_num_elems]; + dict_page_decoder->get_dict_word_info(dict_word_info); // decode PageDecoderOptions decoder_options; BinaryDictPageDecoder page_decoder(s.slice(), decoder_options); - page_decoder.set_dict_decoder(dict_page_decoder.get(), dict_start_offset_array, dict_len_array); + page_decoder.set_dict_decoder(dict_page_decoder.get(), dict_word_info); status = page_decoder.init(); ASSERT_TRUE(status.ok()); @@ -177,21 +171,15 @@ public: status = dict_page_decoder->init(); ASSERT_TRUE(status.ok()); - uint32_t dict_start_offset_array[dict_page_decoder->_num_elems]; - uint32_t dict_len_array[dict_page_decoder->_num_elems]; - for (int i = 0; i < dict_page_decoder->_num_elems; i++) { - const uint32_t start_offset = dict_page_decoder->offset(i); - uint32_t len = dict_page_decoder->offset(i + 1) - start_offset; - dict_start_offset_array[i] = start_offset; - dict_len_array[i] = len; - } + StringRef dict_word_info[dict_page_decoder->_num_elems]; + dict_page_decoder->get_dict_word_info(dict_word_info); // decode PageDecoderOptions decoder_options; BinaryDictPageDecoder page_decoder(results[slice_index].slice(), decoder_options); status = page_decoder.init(); - page_decoder.set_dict_decoder(dict_page_decoder.get(), dict_start_offset_array, dict_len_array); + page_decoder.set_dict_decoder(dict_page_decoder.get(), dict_word_info); ASSERT_TRUE(status.ok()); //check values diff --git a/be/test/tools/benchmark_tool.cpp b/be/test/tools/benchmark_tool.cpp index 7a1f708..e06cc3d 100644 --- a/be/test/tools/benchmark_tool.cpp +++ b/be/test/tools/benchmark_tool.cpp @@ -174,22 +174,15 @@ public: new BinaryPlainPageDecoder(dict_slice.slice(), dict_decoder_options)); dict_page_decoder->init(); - uint32_t dict_start_offset_array[dict_page_decoder->_num_elems]; - uint32_t dict_len_array[dict_page_decoder->_num_elems]; - for (int i = 0; i < dict_page_decoder->_num_elems; i++) { - const uint32_t start_offset = dict_page_decoder->offset(i); - uint32_t len = dict_page_decoder->offset(i + 1) - start_offset; - dict_start_offset_array[i] = start_offset; - dict_len_array[i] = len; - } + StringRef dict_word_info[dict_page_decoder->_num_elems]; + dict_page_decoder->get_dict_word_info(dict_word_info); // decode PageDecoderOptions decoder_options; BinaryDictPageDecoder page_decoder(src.slice(), decoder_options); page_decoder.init(); - page_decoder.set_dict_decoder(dict_page_decoder.get(), dict_start_offset_array, - dict_len_array); + page_decoder.set_dict_decoder(dict_page_decoder.get(), dict_word_info); //check values size_t num = page_start_ids[slice_index + 1] - page_start_ids[slice_index]; --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org