This is an automated email from the ASF dual-hosted git repository. lihaopeng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 62f765b7f5 [improvement](scan) speed up inserting strings into ColumnString (#13397) 62f765b7f5 is described below commit 62f765b7f530c5af2bb292dab820e0b8077b64d1 Author: Jerry Hu <mrh...@gmail.com> AuthorDate: Wed Nov 2 22:19:02 2022 +0800 [improvement](scan) speed up inserting strings into ColumnString (#13397) --- be/src/olap/rowset/segment_v2/binary_plain_page.h | 14 ++- be/src/vec/columns/column.h | 12 ++- be/src/vec/columns/column_complex.h | 11 +++ be/src/vec/columns/column_dictionary.h | 17 ++-- be/src/vec/columns/column_jsonb.h | 26 ++++++ be/src/vec/columns/column_nullable.h | 9 ++ be/src/vec/columns/column_string.h | 101 +++++++++++----------- be/src/vec/columns/predicate_column.h | 30 ++++++- 8 files changed, 150 insertions(+), 70 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/binary_plain_page.h b/be/src/olap/rowset/segment_v2/binary_plain_page.h index 659df55fee..96cfc392a5 100644 --- a/be/src/olap/rowset/segment_v2/binary_plain_page.h +++ b/be/src/olap/rowset/segment_v2/binary_plain_page.h @@ -248,16 +248,14 @@ public: return Status::OK(); } const size_t max_fetch = std::min(*n, static_cast<size_t>(_num_elems - _cur_idx)); - uint32_t len_array[max_fetch]; - uint32_t start_offset_array[max_fetch]; uint32_t last_offset = guarded_offset(_cur_idx); + uint32_t offsets[max_fetch + 1]; + offsets[0] = last_offset; for (int i = 0; i < max_fetch - 1; i++, _cur_idx++) { const uint32_t start_offset = last_offset; last_offset = guarded_offset(_cur_idx + 1); - uint32_t len = last_offset - start_offset; - len_array[i] = len; - start_offset_array[i] = start_offset; + offsets[i + 1] = last_offset; if constexpr (Type == OLAP_FIELD_TYPE_OBJECT) { if (_options.need_check_bitmap) { RETURN_IF_ERROR(BitmapTypeCode::validate(*(_data.data + start_offset))); @@ -265,15 +263,13 @@ public: } } _cur_idx++; - len_array[max_fetch - 1] = offset(_cur_idx) - last_offset; - start_offset_array[max_fetch - 1] = last_offset; + offsets[max_fetch] = offset(_cur_idx); if constexpr (Type == OLAP_FIELD_TYPE_OBJECT) { if (_options.need_check_bitmap) { RETURN_IF_ERROR(BitmapTypeCode::validate(*(_data.data + last_offset))); } } - dst->insert_many_binary_data(_data.mutable_data(), len_array, start_offset_array, - max_fetch); + dst->insert_many_continuous_binary_data(_data.data, offsets, max_fetch); *n = max_fetch; return Status::OK(); diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h index ec794bf747..4b89a002af 100644 --- a/be/src/vec/columns/column.h +++ b/be/src/vec/columns/column.h @@ -243,6 +243,14 @@ public: LOG(FATAL) << "Method insert_many_binary_data is not supported for " << get_name(); } + /// Insert binary data into column from a continuous buffer, the implementation maybe copy all binary data + /// in one single time. + virtual void insert_many_continuous_binary_data(const char* data, const uint32_t* offsets, + const size_t num) { + LOG(FATAL) << "Method insert_many_continuous_binary_data is not supported for " + << get_name(); + } + virtual void insert_many_strings(const StringRef* strings, size_t num) { LOG(FATAL) << "Method insert_many_binary_data is not supported for " << get_name(); } @@ -271,10 +279,6 @@ public: } } - virtual void insert_elements(void* elements, size_t num) { - LOG(FATAL) << "Method insert_elements is not supported for " << get_name(); - } - /** Removes last n elements. * Is used to support exception-safety of several operations. * For example, sometimes insertion should be reverted if we catch an exception during operation processing. diff --git a/be/src/vec/columns/column_complex.h b/be/src/vec/columns/column_complex.h index 260729f736..ec15c65df7 100644 --- a/be/src/vec/columns/column_complex.h +++ b/be/src/vec/columns/column_complex.h @@ -79,6 +79,17 @@ public: } } + void insert_many_continuous_binary_data(const char* data, const uint32_t* offsets, + const size_t num) override { + if (UNLIKELY(num == 0)) { + return; + } + + for (size_t i = 0; i != num; ++i) { + insert_binary_data(data + offsets[i], offsets[i + 1] - offsets[i]); + } + } + void insert_many_binary_data(char* data_array, uint32_t* len_array, uint32_t* start_offset_array, size_t num) override { for (size_t i = 0; i < num; i++) { diff --git a/be/src/vec/columns/column_dictionary.h b/be/src/vec/columns/column_dictionary.h index e226976f0e..b8976e77c5 100644 --- a/be/src/vec/columns/column_dictionary.h +++ b/be/src/vec/columns/column_dictionary.h @@ -192,14 +192,17 @@ public: Status filter_by_selector(const uint16_t* sel, size_t sel_size, IColumn* col_ptr) override { auto* res_col = reinterpret_cast<vectorized::ColumnString*>(col_ptr); - res_col->get_offsets().reserve(sel_size); - res_col->get_chars().reserve(_dict.avg_str_len() * sel_size); - for (size_t i = 0; i < sel_size; i++) { - uint16_t n = sel[i]; - auto& code = reinterpret_cast<T&>(_codes[n]); - auto value = _dict.get_value(code); - res_col->insert_data_without_reserve(value.ptr, value.len); + StringRef strings[sel_size]; + size_t length = 0; + for (size_t i = 0; i != sel_size; ++i) { + auto& value = _dict.get_value(_codes[sel[i]]); + strings[i].data = value.ptr; + strings[i].size = value.len; + length += value.len; } + res_col->get_offsets().reserve(sel_size + res_col->get_offsets().size()); + res_col->get_chars().reserve(length + res_col->get_chars().size()); + res_col->insert_many_strings_without_reserve(strings, sel_size); return Status::OK(); } diff --git a/be/src/vec/columns/column_jsonb.h b/be/src/vec/columns/column_jsonb.h index 58789d0783..66e17d0e68 100644 --- a/be/src/vec/columns/column_jsonb.h +++ b/be/src/vec/columns/column_jsonb.h @@ -145,6 +145,32 @@ public: offsets.push_back(new_size); } + void insert_many_continuous_binary_data(const char* data, const uint32_t* offsets_, + const size_t num) override { + if (UNLIKELY(num == 0)) { + return; + } + + size_t new_size = offsets_[num] - offsets_[0] + num * sizeof(char); + const size_t old_size = chars.size(); + chars.resize(new_size + old_size); + + auto* data_ptr = chars.data(); + size_t offset = old_size; + + for (size_t i = 0; i != num; ++i) { + uint32_t len = offsets_[i + 1] - offsets_[i]; + if (LIKELY(len)) { + memcpy(data_ptr + offset, data + offsets_[i], len); + offset += len; + } + data_ptr[offset] = 0; + offset += 1; + offsets.push_back(offset); + } + DCHECK(offset == chars.size()); + } + void insert_many_binary_data(char* data_array, uint32_t* len_array, uint32_t* start_offset_array, size_t num) override { size_t new_size = 0; diff --git a/be/src/vec/columns/column_nullable.h b/be/src/vec/columns/column_nullable.h index 95c11d447b..acc0ba611c 100644 --- a/be/src/vec/columns/column_nullable.h +++ b/be/src/vec/columns/column_nullable.h @@ -124,6 +124,15 @@ public: dict_num); } + void insert_many_continuous_binary_data(const char* data, const uint32_t* offsets, + const size_t num) override { + if (UNLIKELY(num == 0)) { + return; + } + get_null_map_column().fill(0, num); + get_nested_column().insert_many_continuous_binary_data(data, offsets, num); + } + void insert_many_binary_data(char* data_array, uint32_t* len_array, uint32_t* start_offset_array, size_t num) override { get_null_map_column().fill(0, num); diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index cd70e228b6..26a734fb08 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -164,6 +164,58 @@ public: offsets.push_back_without_reserve(new_size); } + /// Before insert strings, the caller should calculate the total size of strings, + /// and reserve the chars & the offsets. + void insert_many_strings_without_reserve(const StringRef* strings, size_t num) { + Char* data = chars.data(); + size_t offset = chars.size(); + size_t length = 0; + + const char* ptr = strings[0].data; + for (size_t i = 0; i != num; i++) { + uint32_t len = strings[i].size; + length += len; + offset += len; + offsets.push_back(offset); + + if (i != num - 1 && strings[i].data + len == strings[i + 1].data) { + continue; + } + memcpy(data, ptr, length); + data += length; + if (LIKELY(i != num - 1)) { + ptr = strings[i + 1].data; + length = 0; + } + } + chars.resize(offset); + } + + void insert_many_continuous_binary_data(const char* data, const uint32_t* offsets_, + const size_t num) override { + static_assert(sizeof(offsets_[0]) == sizeof(*offsets.data())); + if (UNLIKELY(num == 0)) { + return; + } + const auto old_size = chars.size(); + const auto begin_offset = offsets_[0]; + const auto total_mem_size = offsets_[num] - begin_offset; + if (LIKELY(total_mem_size > 0)) { + chars.resize(total_mem_size + old_size); + memcpy(chars.data() + old_size, data + begin_offset, total_mem_size); + } + const auto old_rows = offsets.size(); + auto tail_offset = offsets.back(); + DCHECK(tail_offset == old_size); + offsets.resize(old_rows + num); + auto* offsets_ptr = &offsets[old_rows]; + + for (size_t i = 0; i < num; ++i) { + offsets_ptr[i] = tail_offset + offsets_[i + 1] - begin_offset; + } + DCHECK(chars.size() == offsets.back()); + } + void insert_many_binary_data(char* data_array, uint32_t* len_array, uint32_t* start_offset_array, size_t num) override { size_t new_size = 0; @@ -207,55 +259,6 @@ public: } } - void insert_many_continuous_strings(const StringRef* strings, size_t num) { - DCHECK_NE(num, 0); - offsets.reserve(offsets.size() + num); - std::vector<const char*> start_points(1); - auto& head = strings[0]; - start_points[0] = head.data; - size_t new_size = head.size; - const char* cursor = head.data + new_size; - std::vector<const char*> end_points; - - const size_t old_size = chars.size(); - size_t offset = old_size; - offset += new_size; - offsets.push_back(offset); - if (num == 1) { - end_points.push_back(cursor); - } else { - for (size_t i = 1; i < num; i++) { - auto& str = strings[i]; - if (cursor != str.data) { - end_points.push_back(cursor); - start_points.push_back(str.data); - cursor = str.data; - } - size_t sz = str.size; - offset += sz; - new_size += sz; - cursor += sz; - offsets.push_back_without_reserve(offset); - } - end_points.push_back(cursor); - } - DCHECK_EQ(end_points.size(), start_points.size()); - - chars.resize(old_size + new_size); - - size_t num_range = start_points.size(); - Char* data = chars.data(); - - offset = old_size; - for (size_t i = 0; i < num_range; i++) { - uint32_t len = end_points[i] - start_points[i]; - if (len) { - memcpy(data + offset, start_points[i], len); - offset += len; - } - } - } - void insert_many_dict_data(const int32_t* data_array, size_t start_index, const StringRef* dict, size_t num, uint32_t /*dict_num*/) override { size_t offset_size = offsets.size(); diff --git a/be/src/vec/columns/predicate_column.h b/be/src/vec/columns/predicate_column.h index 128b33bf3e..23c165adcc 100644 --- a/be/src/vec/columns/predicate_column.h +++ b/be/src/vec/columns/predicate_column.h @@ -91,13 +91,17 @@ private: void insert_string_to_res_column(const uint16_t* sel, size_t sel_size, vectorized::ColumnString* res_ptr) { StringRef refs[sel_size]; + size_t length = 0; for (size_t i = 0; i < sel_size; i++) { uint16_t n = sel[i]; auto& sv = reinterpret_cast<StringValue&>(data[n]); refs[i].data = sv.ptr; refs[i].size = sv.len; + length += sv.len; } - res_ptr->insert_many_continuous_strings(refs, sel_size); + res_ptr->get_offsets().reserve(sel_size + res_ptr->get_offsets().size()); + res_ptr->get_chars().reserve(length + res_ptr->get_chars().size()); + res_ptr->insert_many_strings_without_reserve(refs, sel_size); } void insert_decimal_to_res_column(const uint16_t* sel, size_t sel_size, @@ -256,6 +260,30 @@ public: } } + void insert_many_continuous_binary_data(const char* data_, const uint32_t* offsets, + const size_t num) override { + if (UNLIKELY(num == 0)) { + return; + } + if constexpr (std::is_same_v<T, StringValue>) { + if (_pool == nullptr) { + _pool.reset(new MemPool()); + } + const auto total_mem_size = offsets[num] - offsets[0]; + char* destination = (char*)_pool->allocate(total_mem_size); + memcpy(destination, data_ + offsets[0], total_mem_size); + size_t org_elem_num = data.size(); + data.resize(org_elem_num + num); + + auto* data_ptr = &data[org_elem_num]; + for (size_t i = 0; i != num; ++i) { + data_ptr[i].ptr = destination + offsets[i] - offsets[0]; + data_ptr[i].len = offsets[i + 1] - offsets[i]; + } + DCHECK(data_ptr[num - 1].ptr + data_ptr[num - 1].len == destination + total_mem_size); + } + } + void insert_many_binary_data(char* data_array, uint32_t* len_array, uint32_t* start_offset_array, size_t num) override { if (num == 0) { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org