This is an automated email from the ASF dual-hosted git repository. gabriellee pushed a commit to branch opt_perf in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/opt_perf by this push: new ca59a30738 [Improvement](string) Optimize scanning for string (#12945) ca59a30738 is described below commit ca59a30738bac95f8fc29d430e119d3293f93482 Author: Gabriel <gabrielleeb...@gmail.com> AuthorDate: Sat Sep 24 21:47:22 2022 +0800 [Improvement](string) Optimize scanning for string (#12945) --- be/src/olap/rowset/segment_v2/binary_plain_page.h | 27 +++++++++--- be/src/olap/rowset/segment_v2/segment_iterator.cpp | 13 ++++-- be/src/vec/columns/column_string.h | 49 ++++++++++++++++++++++ be/src/vec/columns/predicate_column.h | 2 +- 4 files changed, 81 insertions(+), 10 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/binary_plain_page.h b/be/src/olap/rowset/segment_v2/binary_plain_page.h index 1242fd9b75..659df55fee 100644 --- a/be/src/olap/rowset/segment_v2/binary_plain_page.h +++ b/be/src/olap/rowset/segment_v2/binary_plain_page.h @@ -248,12 +248,14 @@ public: return Status::OK(); } const size_t max_fetch = std::min(*n, static_cast<size_t>(_num_elems - _cur_idx)); - uint32_t len_array[max_fetch]; uint32_t start_offset_array[max_fetch]; - for (int i = 0; i < max_fetch; i++, _cur_idx++) { - const uint32_t start_offset = offset(_cur_idx); - uint32_t len = offset(_cur_idx + 1) - start_offset; + + uint32_t last_offset = guarded_offset(_cur_idx); + for (int i = 0; i < max_fetch - 1; i++, _cur_idx++) { + const uint32_t start_offset = last_offset; + last_offset = guarded_offset(_cur_idx + 1); + uint32_t len = last_offset - start_offset; len_array[i] = len; start_offset_array[i] = start_offset; if constexpr (Type == OLAP_FIELD_TYPE_OBJECT) { @@ -262,6 +264,14 @@ public: } } } + _cur_idx++; + len_array[max_fetch - 1] = offset(_cur_idx) - last_offset; + start_offset_array[max_fetch - 1] = last_offset; + if constexpr (Type == OLAP_FIELD_TYPE_OBJECT) { + if (_options.need_check_bitmap) { + RETURN_IF_ERROR(BitmapTypeCode::validate(*(_data.data + last_offset))); + } + } dst->insert_many_binary_data(_data.mutable_data(), len_array, start_offset_array, max_fetch); @@ -340,13 +350,20 @@ public: } private: + static constexpr size_t SIZE_OF_INT32 = sizeof(uint32_t); // Return the offset within '_data' where the string value with index 'idx' can be found. uint32_t offset(size_t idx) const { if (idx >= _num_elems) { return _offsets_pos; } const uint8_t* p = - reinterpret_cast<const uint8_t*>(&_data[_offsets_pos + idx * sizeof(uint32_t)]); + reinterpret_cast<const uint8_t*>(&_data[_offsets_pos + idx * SIZE_OF_INT32]); + return decode_fixed32_le(p); + } + + uint32_t guarded_offset(size_t idx) const { + const uint8_t* p = + reinterpret_cast<const uint8_t*>(&_data[_offsets_pos + idx * SIZE_OF_INT32]); return decode_fixed32_le(p); } diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 98f2cfae27..e6435e8be1 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -1150,8 +1150,11 @@ Status SegmentIterator::next_batch(vectorized::Block* block) { } if (!_lazy_materialization_read) { - Status ret = _output_column_by_sel_idx(block, _first_read_column_ids, sel_rowid_idx, - selected_size); + Status ret = Status::OK(); + if (selected_size > 0) { + ret = _output_column_by_sel_idx(block, _first_read_column_ids, sel_rowid_idx, + selected_size); + } if (!ret.ok()) { return ret; } @@ -1176,8 +1179,10 @@ Status SegmentIterator::next_batch(vectorized::Block* block) { // when lazy materialization enables, _first_read_column_ids = distinct(_short_cir_pred_column_ids + _vec_pred_column_ids) // see _vec_init_lazy_materialization // todo(wb) need to tell input columnids from output columnids - RETURN_IF_ERROR(_output_column_by_sel_idx(block, _first_read_column_ids, sel_rowid_idx, - selected_size)); + if (selected_size > 0) { + RETURN_IF_ERROR(_output_column_by_sel_idx(block, _first_read_column_ids, sel_rowid_idx, + selected_size)); + } } // shrink char_type suffix zero data diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index 469bbdc6df..4dbaab3db4 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -207,6 +207,55 @@ public: } } + void insert_many_continuous_strings(const StringRef* strings, size_t num) { + DCHECK_NE(num, 0); + offsets.reserve(offsets.size() + num); + std::vector<const char*> start_points(1); + auto& head = strings[0]; + start_points[0] = head.data; + size_t new_size = head.size; + const char* cursor = head.data + new_size; + std::vector<const char*> end_points; + + const size_t old_size = chars.size(); + size_t offset = old_size; + offset += new_size; + offsets.push_back(offset); + if (num == 1) { + end_points.push_back(cursor); + } else { + for (size_t i = 1; i < num; i++) { + auto& str = strings[i]; + if (cursor != str.data) { + end_points.push_back(cursor); + start_points.push_back(str.data); + cursor = str.data; + } + size_t sz = str.size; + offset += sz; + new_size += sz; + cursor += sz; + offsets.push_back_without_reserve(offset); + } + end_points.push_back(cursor); + } + DCHECK_EQ(end_points.size(), start_points.size()); + + chars.resize(old_size + new_size); + + size_t num_range = start_points.size(); + Char* data = chars.data(); + + offset = old_size; + for (size_t i = 0; i < num_range; i++) { + uint32_t len = end_points[i] - start_points[i]; + if (len) { + memcpy(data + offset, start_points[i], len); + offset += len; + } + } + } + void insert_many_dict_data(const int32_t* data_array, size_t start_index, const StringRef* dict, size_t num, uint32_t /*dict_num*/) override { size_t offset_size = offsets.size(); diff --git a/be/src/vec/columns/predicate_column.h b/be/src/vec/columns/predicate_column.h index d5ad52b6ac..01a90c9eb9 100644 --- a/be/src/vec/columns/predicate_column.h +++ b/be/src/vec/columns/predicate_column.h @@ -97,7 +97,7 @@ private: refs[i].data = sv.ptr; refs[i].size = sv.len; } - res_ptr->insert_many_strings(refs, sel_size); + res_ptr->insert_many_continuous_strings(refs, sel_size); } void insert_decimal_to_res_column(const uint16_t* sel, size_t sel_size, --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org