This is an automated email from the ASF dual-hosted git repository.

gabriellee pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 3aa6d044df1 [refactor](column) Simplify column functions (#44272)
3aa6d044df1 is described below

commit 3aa6d044df16bf93570a6586c91e3fb986406aee
Author: Gabriel <liwenqi...@selectdb.com>
AuthorDate: Thu Nov 21 10:18:58 2024 +0800

    [refactor](column) Simplify column functions (#44272)
    
    `insert_many_binary_data` has the same logics as `insert_many_strings`.
    This PR unify those 2 functions.
---
 be/src/olap/rowset/segment_v2/binary_plain_page.h  | 19 +++++++-----------
 be/src/olap/rowset/segment_v2/segment_iterator.cpp |  6 ++++--
 be/src/vec/columns/column.h                        |  6 ------
 be/src/vec/columns/column_complex.h                |  5 ++---
 be/src/vec/columns/column_fixed_length_object.h    |  7 ++-----
 be/src/vec/columns/column_nullable.cpp             | 11 ++++++++---
 be/src/vec/columns/column_nullable.h               |  6 ------
 be/src/vec/columns/column_object.h                 |  6 ------
 be/src/vec/columns/column_string.h                 | 23 ----------------------
 be/src/vec/columns/predicate_column.h              | 19 ++++++++----------
 10 files changed, 31 insertions(+), 77 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/binary_plain_page.h 
b/be/src/olap/rowset/segment_v2/binary_plain_page.h
index e043164ef28..3b3c6ad3fea 100644
--- a/be/src/olap/rowset/segment_v2/binary_plain_page.h
+++ b/be/src/olap/rowset/segment_v2/binary_plain_page.h
@@ -266,8 +266,7 @@ public:
 
         auto total = *n;
         size_t read_count = 0;
-        _len_array.resize(total);
-        _start_offset_array.resize(total);
+        _binary_data.resize(total);
         for (size_t i = 0; i < total; ++i) {
             ordinal_t ord = rowids[i] - page_first_ordinal;
             if (UNLIKELY(ord >= _num_elems)) {
@@ -275,14 +274,13 @@ public:
             }
 
             const uint32_t start_offset = offset(ord);
-            _start_offset_array[read_count] = start_offset;
-            _len_array[read_count] = offset(ord + 1) - start_offset;
+            _binary_data[read_count].data = _data.mutable_data() + 
start_offset;
+            _binary_data[read_count].size = offset(ord + 1) - start_offset;
             read_count++;
         }
 
         if (LIKELY(read_count > 0)) {
-            dst->insert_many_binary_data(_data.mutable_data(), 
_len_array.data(),
-                                         _start_offset_array.data(), 
read_count);
+            dst->insert_many_strings(_binary_data.data(), read_count);
         }
 
         *n = read_count;
@@ -342,13 +340,11 @@ private:
         if (idx >= _num_elems) {
             return _offsets_pos;
         }
-        const uint8_t* p =
-                reinterpret_cast<const uint8_t*>(&_data[_offsets_pos + idx * 
SIZE_OF_INT32]);
-        return decode_fixed32_le(p);
+        return guarded_offset(idx);
     }
 
     uint32_t guarded_offset(size_t idx) const {
-        const uint8_t* p =
+        const auto* p =
                 reinterpret_cast<const uint8_t*>(&_data[_offsets_pos + idx * 
SIZE_OF_INT32]);
         return decode_fixed32_le(p);
     }
@@ -361,8 +357,7 @@ private:
     uint32_t _offsets_pos;
 
     std::vector<uint32_t> _offsets;
-    std::vector<uint32_t> _len_array;
-    std::vector<uint32_t> _start_offset_array;
+    std::vector<StringRef> _binary_data;
 
     // Index of the currently seeked element in the page.
     uint32_t _cur_idx;
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp 
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 2c7aa5a0ce1..96b0bea2ae8 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -2264,8 +2264,10 @@ Status 
SegmentIterator::_next_batch_internal(vectorized::Block* block) {
     size_t rows = block->rows();
     for (const auto& entry : *block) {
         if (entry.column->size() != rows) {
-            throw doris::Exception(ErrorCode::INTERNAL_ERROR, "unmatched size 
{}, expected {}",
-                                   entry.column->size(), rows);
+            throw doris::Exception(ErrorCode::INTERNAL_ERROR,
+                                   "unmatched size {}, expected {}, column: 
{}, type: {}",
+                                   entry.column->size(), rows, 
entry.column->get_name(),
+                                   entry.type->get_name());
         }
     }
 #endif
diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h
index ce155aefad2..e92c246741f 100644
--- a/be/src/vec/columns/column.h
+++ b/be/src/vec/columns/column.h
@@ -249,12 +249,6 @@ public:
                                "Method insert_many_dict_data is not supported 
for " + get_name());
     }
 
-    virtual void insert_many_binary_data(char* data_array, uint32_t* len_array,
-                                         uint32_t* start_offset_array, size_t 
num) {
-        throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR,
-                               "Method insert_many_binary_data is not 
supported for " + get_name());
-    }
-
     /// Insert binary data into column from a continuous buffer, the 
implementation maybe copy all binary data
     /// in one single time.
     virtual void insert_many_continuous_binary_data(const char* data, const 
uint32_t* offsets,
diff --git a/be/src/vec/columns/column_complex.h 
b/be/src/vec/columns/column_complex.h
index 1ff074b1129..24b6b7ddbd7 100644
--- a/be/src/vec/columns/column_complex.h
+++ b/be/src/vec/columns/column_complex.h
@@ -98,10 +98,9 @@ public:
         }
     }
 
-    void insert_many_binary_data(char* data_array, uint32_t* len_array,
-                                 uint32_t* start_offset_array, size_t num) 
override {
+    void insert_many_strings(const StringRef* strings, size_t num) override {
         for (size_t i = 0; i < num; i++) {
-            insert_binary_data(data_array + start_offset_array[i], 
len_array[i]);
+            insert_binary_data(strings[i].data, strings[i].size);
         }
     }
 
diff --git a/be/src/vec/columns/column_fixed_length_object.h 
b/be/src/vec/columns/column_fixed_length_object.h
index ea3437dbb01..947d859ce23 100644
--- a/be/src/vec/columns/column_fixed_length_object.h
+++ b/be/src/vec/columns/column_fixed_length_object.h
@@ -284,8 +284,7 @@ public:
         memcpy(_data.data() + old_size, data + begin_offset, total_mem_size);
     }
 
-    void insert_many_binary_data(char* data_array, uint32_t* len_array,
-                                 uint32_t* start_offset_array, size_t num) 
override {
+    void insert_many_strings(const StringRef* strings, size_t num) override {
         if (UNLIKELY(num == 0)) {
             return;
         }
@@ -294,10 +293,8 @@ public:
         resize(old_count + num);
         auto* dst = _data.data() + old_count * _item_size;
         for (size_t i = 0; i < num; i++) {
-            auto* src = data_array + start_offset_array[i];
-            uint32_t len = len_array[i];
             dst += i * _item_size;
-            memcpy(dst, src, len);
+            memcpy(dst, strings[i].data, strings[i].size);
         }
     }
 
diff --git a/be/src/vec/columns/column_nullable.cpp 
b/be/src/vec/columns/column_nullable.cpp
index 493aabdae31..5e34ad4d8d4 100644
--- a/be/src/vec/columns/column_nullable.cpp
+++ b/be/src/vec/columns/column_nullable.cpp
@@ -185,16 +185,21 @@ void ColumnNullable::insert_data(const char* pos, size_t 
length) {
 }
 
 void ColumnNullable::insert_many_strings(const StringRef* strings, size_t num) 
{
+    auto not_null_count = 0;
     for (size_t i = 0; i != num; ++i) {
         if (strings[i].data == nullptr) {
-            nested_column->insert_default();
+            _push_false_to_nullmap(not_null_count);
+            not_null_count = 0;
             get_null_map_data().push_back(1);
             _has_null = true;
         } else {
-            nested_column->insert_data(strings[i].data, strings[i].size);
-            _push_false_to_nullmap(1);
+            not_null_count++;
         }
     }
+    if (not_null_count) {
+        _push_false_to_nullmap(not_null_count);
+    }
+    nested_column->insert_many_strings(strings, num);
 }
 
 void ColumnNullable::insert_many_from(const IColumn& src, size_t position, 
size_t length) {
diff --git a/be/src/vec/columns/column_nullable.h 
b/be/src/vec/columns/column_nullable.h
index 0e5e104dce9..2b87aa982ca 100644
--- a/be/src/vec/columns/column_nullable.h
+++ b/be/src/vec/columns/column_nullable.h
@@ -245,12 +245,6 @@ public:
         get_nested_column().insert_many_continuous_binary_data(data, offsets, 
num);
     }
 
-    void insert_many_binary_data(char* data_array, uint32_t* len_array,
-                                 uint32_t* start_offset_array, size_t num) 
override {
-        _push_false_to_nullmap(num);
-        get_nested_column().insert_many_binary_data(data_array, len_array, 
start_offset_array, num);
-    }
-
     void insert_default() override {
         get_nested_column().insert_default();
         get_null_map_data().push_back(1);
diff --git a/be/src/vec/columns/column_object.h 
b/be/src/vec/columns/column_object.h
index 244692e9ddd..1c8f38056c9 100644
--- a/be/src/vec/columns/column_object.h
+++ b/be/src/vec/columns/column_object.h
@@ -465,12 +465,6 @@ public:
                                "insert_many_dict_data" + get_name());
     }
 
-    void insert_many_binary_data(char* data_array, uint32_t* len_array,
-                                 uint32_t* start_offset_array, size_t num) 
override {
-        throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR,
-                               "insert_many_binary_data" + get_name());
-    }
-
     void insert_many_continuous_binary_data(const char* data, const uint32_t* 
offsets,
                                             const size_t num) override {
         throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR,
diff --git a/be/src/vec/columns/column_string.h 
b/be/src/vec/columns/column_string.h
index a72a3ec5cdc..f116d4ce1f1 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -269,29 +269,6 @@ public:
         DCHECK(chars.size() == offsets.back());
     }
 
-    void insert_many_binary_data(char* data_array, uint32_t* len_array,
-                                 uint32_t* start_offset_array, size_t num) 
override {
-        size_t new_size = 0;
-        for (size_t i = 0; i < num; i++) {
-            new_size += len_array[i];
-        }
-
-        const size_t old_size = chars.size();
-        check_chars_length(old_size + new_size, offsets.size() + num);
-        chars.resize(old_size + new_size);
-
-        Char* data = chars.data();
-        size_t offset = old_size;
-        for (size_t i = 0; i < num; i++) {
-            uint32_t len = len_array[i];
-            uint32_t start_offset = start_offset_array[i];
-            // memcpy will deal len == 0, not do it here
-            memcpy(data + offset, data_array + start_offset, len);
-            offset += len;
-            offsets.push_back(offset);
-        }
-    }
-
     void insert_many_strings(const StringRef* strings, size_t num) override {
         size_t new_size = 0;
         for (size_t i = 0; i < num; i++) {
diff --git a/be/src/vec/columns/predicate_column.h 
b/be/src/vec/columns/predicate_column.h
index c9ceec230f7..c2c6456d862 100644
--- a/be/src/vec/columns/predicate_column.h
+++ b/be/src/vec/columns/predicate_column.h
@@ -269,8 +269,7 @@ public:
         }
     }
 
-    void insert_many_binary_data(char* data_array, uint32_t* len_array,
-                                 uint32_t* start_offset_array, size_t num) 
override {
+    void insert_many_strings(const StringRef* strings, size_t num) override {
         if (num == 0) {
             return;
         }
@@ -281,26 +280,25 @@ public:
 
             size_t total_mem_size = 0;
             for (size_t i = 0; i < num; i++) {
-                total_mem_size += len_array[i];
+                total_mem_size += strings[i].size;
             }
 
             char* destination = _arena->alloc(total_mem_size);
             char* org_dst = destination;
             size_t org_elem_num = data.size();
             data.resize(org_elem_num + num);
-            uint32_t fragment_start_offset = start_offset_array[0];
+            uint32_t fragment_start_offset = 0;
             size_t fragment_len = 0;
             for (size_t i = 0; i < num; i++) {
                 data[org_elem_num + i].data = destination + fragment_len;
-                data[org_elem_num + i].size = len_array[i];
-                fragment_len += len_array[i];
+                data[org_elem_num + i].size = strings[i].size;
+                fragment_len += strings[i].size;
                 // Compute the largest continuous memcpy block and copy them.
                 // If this is the last element in data array, then should copy 
the current memory block.
-                if (i == num - 1 ||
-                    start_offset_array[i + 1] != start_offset_array[i] + 
len_array[i]) {
-                    memcpy(destination, data_array + fragment_start_offset, 
fragment_len);
+                if (i == num - 1 || strings[i + 1].data != strings[i].data + 
strings[i].size) {
+                    memcpy(destination, strings[fragment_start_offset].data, 
fragment_len);
                     destination += fragment_len;
-                    fragment_start_offset = (i == num - 1 ? 0 : 
start_offset_array[i + 1]);
+                    fragment_start_offset = i == num - 1 ? 0 : i + 1;
                     fragment_len = 0;
                 }
             }
@@ -309,7 +307,6 @@ public:
         } else {
             throw doris::Exception(ErrorCode::INTERNAL_ERROR,
                                    "Method insert_many_binary_data is not 
supported");
-            __builtin_unreachable();
         }
     }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to