This is an automated email from the ASF dual-hosted git repository. gabriellee pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 3aa6d044df1 [refactor](column) Simplify column functions (#44272) 3aa6d044df1 is described below commit 3aa6d044df16bf93570a6586c91e3fb986406aee Author: Gabriel <liwenqi...@selectdb.com> AuthorDate: Thu Nov 21 10:18:58 2024 +0800 [refactor](column) Simplify column functions (#44272) `insert_many_binary_data` has the same logics as `insert_many_strings`. This PR unify those 2 functions. --- be/src/olap/rowset/segment_v2/binary_plain_page.h | 19 +++++++----------- be/src/olap/rowset/segment_v2/segment_iterator.cpp | 6 ++++-- be/src/vec/columns/column.h | 6 ------ be/src/vec/columns/column_complex.h | 5 ++--- be/src/vec/columns/column_fixed_length_object.h | 7 ++----- be/src/vec/columns/column_nullable.cpp | 11 ++++++++--- be/src/vec/columns/column_nullable.h | 6 ------ be/src/vec/columns/column_object.h | 6 ------ be/src/vec/columns/column_string.h | 23 ---------------------- be/src/vec/columns/predicate_column.h | 19 ++++++++---------- 10 files changed, 31 insertions(+), 77 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/binary_plain_page.h b/be/src/olap/rowset/segment_v2/binary_plain_page.h index e043164ef28..3b3c6ad3fea 100644 --- a/be/src/olap/rowset/segment_v2/binary_plain_page.h +++ b/be/src/olap/rowset/segment_v2/binary_plain_page.h @@ -266,8 +266,7 @@ public: auto total = *n; size_t read_count = 0; - _len_array.resize(total); - _start_offset_array.resize(total); + _binary_data.resize(total); for (size_t i = 0; i < total; ++i) { ordinal_t ord = rowids[i] - page_first_ordinal; if (UNLIKELY(ord >= _num_elems)) { @@ -275,14 +274,13 @@ public: } const uint32_t start_offset = offset(ord); - _start_offset_array[read_count] = start_offset; - _len_array[read_count] = offset(ord + 1) - start_offset; + _binary_data[read_count].data = _data.mutable_data() + start_offset; + _binary_data[read_count].size = offset(ord + 1) - start_offset; read_count++; } if (LIKELY(read_count > 0)) { - dst->insert_many_binary_data(_data.mutable_data(), _len_array.data(), - _start_offset_array.data(), read_count); + dst->insert_many_strings(_binary_data.data(), read_count); } *n = read_count; @@ -342,13 +340,11 @@ private: if (idx >= _num_elems) { return _offsets_pos; } - const uint8_t* p = - reinterpret_cast<const uint8_t*>(&_data[_offsets_pos + idx * SIZE_OF_INT32]); - return decode_fixed32_le(p); + return guarded_offset(idx); } uint32_t guarded_offset(size_t idx) const { - const uint8_t* p = + const auto* p = reinterpret_cast<const uint8_t*>(&_data[_offsets_pos + idx * SIZE_OF_INT32]); return decode_fixed32_le(p); } @@ -361,8 +357,7 @@ private: uint32_t _offsets_pos; std::vector<uint32_t> _offsets; - std::vector<uint32_t> _len_array; - std::vector<uint32_t> _start_offset_array; + std::vector<StringRef> _binary_data; // Index of the currently seeked element in the page. uint32_t _cur_idx; diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 2c7aa5a0ce1..96b0bea2ae8 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -2264,8 +2264,10 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { size_t rows = block->rows(); for (const auto& entry : *block) { if (entry.column->size() != rows) { - throw doris::Exception(ErrorCode::INTERNAL_ERROR, "unmatched size {}, expected {}", - entry.column->size(), rows); + throw doris::Exception(ErrorCode::INTERNAL_ERROR, + "unmatched size {}, expected {}, column: {}, type: {}", + entry.column->size(), rows, entry.column->get_name(), + entry.type->get_name()); } } #endif diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h index ce155aefad2..e92c246741f 100644 --- a/be/src/vec/columns/column.h +++ b/be/src/vec/columns/column.h @@ -249,12 +249,6 @@ public: "Method insert_many_dict_data is not supported for " + get_name()); } - virtual void insert_many_binary_data(char* data_array, uint32_t* len_array, - uint32_t* start_offset_array, size_t num) { - throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR, - "Method insert_many_binary_data is not supported for " + get_name()); - } - /// Insert binary data into column from a continuous buffer, the implementation maybe copy all binary data /// in one single time. virtual void insert_many_continuous_binary_data(const char* data, const uint32_t* offsets, diff --git a/be/src/vec/columns/column_complex.h b/be/src/vec/columns/column_complex.h index 1ff074b1129..24b6b7ddbd7 100644 --- a/be/src/vec/columns/column_complex.h +++ b/be/src/vec/columns/column_complex.h @@ -98,10 +98,9 @@ public: } } - void insert_many_binary_data(char* data_array, uint32_t* len_array, - uint32_t* start_offset_array, size_t num) override { + void insert_many_strings(const StringRef* strings, size_t num) override { for (size_t i = 0; i < num; i++) { - insert_binary_data(data_array + start_offset_array[i], len_array[i]); + insert_binary_data(strings[i].data, strings[i].size); } } diff --git a/be/src/vec/columns/column_fixed_length_object.h b/be/src/vec/columns/column_fixed_length_object.h index ea3437dbb01..947d859ce23 100644 --- a/be/src/vec/columns/column_fixed_length_object.h +++ b/be/src/vec/columns/column_fixed_length_object.h @@ -284,8 +284,7 @@ public: memcpy(_data.data() + old_size, data + begin_offset, total_mem_size); } - void insert_many_binary_data(char* data_array, uint32_t* len_array, - uint32_t* start_offset_array, size_t num) override { + void insert_many_strings(const StringRef* strings, size_t num) override { if (UNLIKELY(num == 0)) { return; } @@ -294,10 +293,8 @@ public: resize(old_count + num); auto* dst = _data.data() + old_count * _item_size; for (size_t i = 0; i < num; i++) { - auto* src = data_array + start_offset_array[i]; - uint32_t len = len_array[i]; dst += i * _item_size; - memcpy(dst, src, len); + memcpy(dst, strings[i].data, strings[i].size); } } diff --git a/be/src/vec/columns/column_nullable.cpp b/be/src/vec/columns/column_nullable.cpp index 493aabdae31..5e34ad4d8d4 100644 --- a/be/src/vec/columns/column_nullable.cpp +++ b/be/src/vec/columns/column_nullable.cpp @@ -185,16 +185,21 @@ void ColumnNullable::insert_data(const char* pos, size_t length) { } void ColumnNullable::insert_many_strings(const StringRef* strings, size_t num) { + auto not_null_count = 0; for (size_t i = 0; i != num; ++i) { if (strings[i].data == nullptr) { - nested_column->insert_default(); + _push_false_to_nullmap(not_null_count); + not_null_count = 0; get_null_map_data().push_back(1); _has_null = true; } else { - nested_column->insert_data(strings[i].data, strings[i].size); - _push_false_to_nullmap(1); + not_null_count++; } } + if (not_null_count) { + _push_false_to_nullmap(not_null_count); + } + nested_column->insert_many_strings(strings, num); } void ColumnNullable::insert_many_from(const IColumn& src, size_t position, size_t length) { diff --git a/be/src/vec/columns/column_nullable.h b/be/src/vec/columns/column_nullable.h index 0e5e104dce9..2b87aa982ca 100644 --- a/be/src/vec/columns/column_nullable.h +++ b/be/src/vec/columns/column_nullable.h @@ -245,12 +245,6 @@ public: get_nested_column().insert_many_continuous_binary_data(data, offsets, num); } - void insert_many_binary_data(char* data_array, uint32_t* len_array, - uint32_t* start_offset_array, size_t num) override { - _push_false_to_nullmap(num); - get_nested_column().insert_many_binary_data(data_array, len_array, start_offset_array, num); - } - void insert_default() override { get_nested_column().insert_default(); get_null_map_data().push_back(1); diff --git a/be/src/vec/columns/column_object.h b/be/src/vec/columns/column_object.h index 244692e9ddd..1c8f38056c9 100644 --- a/be/src/vec/columns/column_object.h +++ b/be/src/vec/columns/column_object.h @@ -465,12 +465,6 @@ public: "insert_many_dict_data" + get_name()); } - void insert_many_binary_data(char* data_array, uint32_t* len_array, - uint32_t* start_offset_array, size_t num) override { - throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR, - "insert_many_binary_data" + get_name()); - } - void insert_many_continuous_binary_data(const char* data, const uint32_t* offsets, const size_t num) override { throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR, diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index a72a3ec5cdc..f116d4ce1f1 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -269,29 +269,6 @@ public: DCHECK(chars.size() == offsets.back()); } - void insert_many_binary_data(char* data_array, uint32_t* len_array, - uint32_t* start_offset_array, size_t num) override { - size_t new_size = 0; - for (size_t i = 0; i < num; i++) { - new_size += len_array[i]; - } - - const size_t old_size = chars.size(); - check_chars_length(old_size + new_size, offsets.size() + num); - chars.resize(old_size + new_size); - - Char* data = chars.data(); - size_t offset = old_size; - for (size_t i = 0; i < num; i++) { - uint32_t len = len_array[i]; - uint32_t start_offset = start_offset_array[i]; - // memcpy will deal len == 0, not do it here - memcpy(data + offset, data_array + start_offset, len); - offset += len; - offsets.push_back(offset); - } - } - void insert_many_strings(const StringRef* strings, size_t num) override { size_t new_size = 0; for (size_t i = 0; i < num; i++) { diff --git a/be/src/vec/columns/predicate_column.h b/be/src/vec/columns/predicate_column.h index c9ceec230f7..c2c6456d862 100644 --- a/be/src/vec/columns/predicate_column.h +++ b/be/src/vec/columns/predicate_column.h @@ -269,8 +269,7 @@ public: } } - void insert_many_binary_data(char* data_array, uint32_t* len_array, - uint32_t* start_offset_array, size_t num) override { + void insert_many_strings(const StringRef* strings, size_t num) override { if (num == 0) { return; } @@ -281,26 +280,25 @@ public: size_t total_mem_size = 0; for (size_t i = 0; i < num; i++) { - total_mem_size += len_array[i]; + total_mem_size += strings[i].size; } char* destination = _arena->alloc(total_mem_size); char* org_dst = destination; size_t org_elem_num = data.size(); data.resize(org_elem_num + num); - uint32_t fragment_start_offset = start_offset_array[0]; + uint32_t fragment_start_offset = 0; size_t fragment_len = 0; for (size_t i = 0; i < num; i++) { data[org_elem_num + i].data = destination + fragment_len; - data[org_elem_num + i].size = len_array[i]; - fragment_len += len_array[i]; + data[org_elem_num + i].size = strings[i].size; + fragment_len += strings[i].size; // Compute the largest continuous memcpy block and copy them. // If this is the last element in data array, then should copy the current memory block. - if (i == num - 1 || - start_offset_array[i + 1] != start_offset_array[i] + len_array[i]) { - memcpy(destination, data_array + fragment_start_offset, fragment_len); + if (i == num - 1 || strings[i + 1].data != strings[i].data + strings[i].size) { + memcpy(destination, strings[fragment_start_offset].data, fragment_len); destination += fragment_len; - fragment_start_offset = (i == num - 1 ? 0 : start_offset_array[i + 1]); + fragment_start_offset = i == num - 1 ? 0 : i + 1; fragment_len = 0; } } @@ -309,7 +307,6 @@ public: } else { throw doris::Exception(ErrorCode::INTERNAL_ERROR, "Method insert_many_binary_data is not supported"); - __builtin_unreachable(); } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org