This is an automated email from the ASF dual-hosted git repository.

lihaopeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 62f765b7f5 [improvement](scan) speed up inserting strings into 
ColumnString (#13397)
62f765b7f5 is described below

commit 62f765b7f530c5af2bb292dab820e0b8077b64d1
Author: Jerry Hu <mrh...@gmail.com>
AuthorDate: Wed Nov 2 22:19:02 2022 +0800

    [improvement](scan) speed up inserting strings into ColumnString (#13397)
---
 be/src/olap/rowset/segment_v2/binary_plain_page.h |  14 ++-
 be/src/vec/columns/column.h                       |  12 ++-
 be/src/vec/columns/column_complex.h               |  11 +++
 be/src/vec/columns/column_dictionary.h            |  17 ++--
 be/src/vec/columns/column_jsonb.h                 |  26 ++++++
 be/src/vec/columns/column_nullable.h              |   9 ++
 be/src/vec/columns/column_string.h                | 101 +++++++++++-----------
 be/src/vec/columns/predicate_column.h             |  30 ++++++-
 8 files changed, 150 insertions(+), 70 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/binary_plain_page.h 
b/be/src/olap/rowset/segment_v2/binary_plain_page.h
index 659df55fee..96cfc392a5 100644
--- a/be/src/olap/rowset/segment_v2/binary_plain_page.h
+++ b/be/src/olap/rowset/segment_v2/binary_plain_page.h
@@ -248,16 +248,14 @@ public:
             return Status::OK();
         }
         const size_t max_fetch = std::min(*n, static_cast<size_t>(_num_elems - 
_cur_idx));
-        uint32_t len_array[max_fetch];
-        uint32_t start_offset_array[max_fetch];
 
         uint32_t last_offset = guarded_offset(_cur_idx);
+        uint32_t offsets[max_fetch + 1];
+        offsets[0] = last_offset;
         for (int i = 0; i < max_fetch - 1; i++, _cur_idx++) {
             const uint32_t start_offset = last_offset;
             last_offset = guarded_offset(_cur_idx + 1);
-            uint32_t len = last_offset - start_offset;
-            len_array[i] = len;
-            start_offset_array[i] = start_offset;
+            offsets[i + 1] = last_offset;
             if constexpr (Type == OLAP_FIELD_TYPE_OBJECT) {
                 if (_options.need_check_bitmap) {
                     RETURN_IF_ERROR(BitmapTypeCode::validate(*(_data.data + 
start_offset)));
@@ -265,15 +263,13 @@ public:
             }
         }
         _cur_idx++;
-        len_array[max_fetch - 1] = offset(_cur_idx) - last_offset;
-        start_offset_array[max_fetch - 1] = last_offset;
+        offsets[max_fetch] = offset(_cur_idx);
         if constexpr (Type == OLAP_FIELD_TYPE_OBJECT) {
             if (_options.need_check_bitmap) {
                 RETURN_IF_ERROR(BitmapTypeCode::validate(*(_data.data + 
last_offset)));
             }
         }
-        dst->insert_many_binary_data(_data.mutable_data(), len_array, 
start_offset_array,
-                                     max_fetch);
+        dst->insert_many_continuous_binary_data(_data.data, offsets, 
max_fetch);
 
         *n = max_fetch;
         return Status::OK();
diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h
index ec794bf747..4b89a002af 100644
--- a/be/src/vec/columns/column.h
+++ b/be/src/vec/columns/column.h
@@ -243,6 +243,14 @@ public:
         LOG(FATAL) << "Method insert_many_binary_data is not supported for " 
<< get_name();
     }
 
+    /// Insert `num` binary values that are stored back to back in one contiguous buffer.
+    /// The overrides added alongside this method read `offsets[0] .. offsets[num]` and treat
+    /// value i as the bytes [offsets[i], offsets[i + 1]) of `data`; an implementation may
+    /// therefore copy all of the values with a single memcpy.
+    /// This base version does not support the operation and only reports a fatal error.
+    virtual void insert_many_continuous_binary_data(const char* data, const 
uint32_t* offsets,
+                                                    const size_t num) {
+        LOG(FATAL) << "Method insert_many_continuous_binary_data is not 
supported for "
+                   << get_name();
+    }
+
     /// Insert `num` strings into the column.
     /// This base version does not support the operation and only reports a fatal error;
     /// subclasses that can store strings are expected to override it.
     virtual void insert_many_strings(const StringRef* strings, size_t num) {
         // Name this method in the message (it used to say insert_many_binary_data).
         LOG(FATAL) << "Method insert_many_strings is not supported for " << get_name();
     }
@@ -271,10 +279,6 @@ public:
         }
     }
 
-    virtual void insert_elements(void* elements, size_t num) {
-        LOG(FATAL) << "Method insert_elements is not supported for " << 
get_name();
-    }
-
     /** Removes last n elements.
       * Is used to support exception-safety of several operations.
       *  For example, sometimes insertion should be reverted if we catch an 
exception during operation processing.
diff --git a/be/src/vec/columns/column_complex.h 
b/be/src/vec/columns/column_complex.h
index 260729f736..ec15c65df7 100644
--- a/be/src/vec/columns/column_complex.h
+++ b/be/src/vec/columns/column_complex.h
@@ -79,6 +79,17 @@ public:
         }
     }
 
+    /// Insert `num` values laid out back to back in `data`.
+    /// Value i occupies the bytes [offsets[i], offsets[i + 1]); each one is handed to
+    /// insert_binary_data() individually.
+    void insert_many_continuous_binary_data(const char* data, const uint32_t* offsets,
+                                            const size_t num) override {
+        if (UNLIKELY(num == 0)) {
+            return;
+        }
+
+        // Walk the offset pairs: bounds[0] is the start of a value, bounds[1] its end.
+        const uint32_t* bounds = offsets;
+        for (const uint32_t* const last = offsets + num; bounds != last; ++bounds) {
+            const uint32_t begin = bounds[0];
+            insert_binary_data(data + begin, bounds[1] - begin);
+        }
+    }
+
     void insert_many_binary_data(char* data_array, uint32_t* len_array,
                                  uint32_t* start_offset_array, size_t num) 
override {
         for (size_t i = 0; i < num; i++) {
diff --git a/be/src/vec/columns/column_dictionary.h 
b/be/src/vec/columns/column_dictionary.h
index e226976f0e..b8976e77c5 100644
--- a/be/src/vec/columns/column_dictionary.h
+++ b/be/src/vec/columns/column_dictionary.h
@@ -192,14 +192,17 @@ public:
 
     Status filter_by_selector(const uint16_t* sel, size_t sel_size, IColumn* 
col_ptr) override {
         auto* res_col = reinterpret_cast<vectorized::ColumnString*>(col_ptr);
-        res_col->get_offsets().reserve(sel_size);
-        res_col->get_chars().reserve(_dict.avg_str_len() * sel_size);
-        for (size_t i = 0; i < sel_size; i++) {
-            uint16_t n = sel[i];
-            auto& code = reinterpret_cast<T&>(_codes[n]);
-            auto value = _dict.get_value(code);
-            res_col->insert_data_without_reserve(value.ptr, value.len);
+        StringRef strings[sel_size];
+        size_t length = 0;
+        for (size_t i = 0; i != sel_size; ++i) {
+            auto& value = _dict.get_value(_codes[sel[i]]);
+            strings[i].data = value.ptr;
+            strings[i].size = value.len;
+            length += value.len;
         }
+        res_col->get_offsets().reserve(sel_size + 
res_col->get_offsets().size());
+        res_col->get_chars().reserve(length + res_col->get_chars().size());
+        res_col->insert_many_strings_without_reserve(strings, sel_size);
         return Status::OK();
     }
 
diff --git a/be/src/vec/columns/column_jsonb.h 
b/be/src/vec/columns/column_jsonb.h
index 58789d0783..66e17d0e68 100644
--- a/be/src/vec/columns/column_jsonb.h
+++ b/be/src/vec/columns/column_jsonb.h
@@ -145,6 +145,32 @@ public:
         offsets.push_back(new_size);
     }
 
+    /// Append `num` values laid out back to back in `data`.
+    /// Value i occupies the bytes [offsets_[i], offsets_[i + 1]). Every value is stored
+    /// followed by one zero byte, and its end position is pushed onto `offsets`.
+    void insert_many_continuous_binary_data(const char* data, const uint32_t* offsets_,
+                                            const size_t num) override {
+        if (UNLIKELY(num == 0)) {
+            return;
+        }
+
+        // Grow once: all payload bytes plus one terminating zero byte per value.
+        const size_t old_size = chars.size();
+        size_t new_size = offsets_[num] - offsets_[0] + num * sizeof(char);
+        chars.resize(new_size + old_size);
+
+        auto* dst = chars.data();
+        size_t write_pos = old_size;
+        for (size_t row = 0; row != num; ++row) {
+            const uint32_t begin = offsets_[row];
+            const uint32_t len = offsets_[row + 1] - begin;
+            if (LIKELY(len)) {
+                memcpy(dst + write_pos, data + begin, len);
+                write_pos += len;
+            }
+            // Terminating zero byte, then record where this value ends.
+            dst[write_pos] = 0;
+            ++write_pos;
+            offsets.push_back(write_pos);
+        }
+        DCHECK(write_pos == chars.size());
+    }
+
     void insert_many_binary_data(char* data_array, uint32_t* len_array,
                                  uint32_t* start_offset_array, size_t num) 
override {
         size_t new_size = 0;
diff --git a/be/src/vec/columns/column_nullable.h 
b/be/src/vec/columns/column_nullable.h
index 95c11d447b..acc0ba611c 100644
--- a/be/src/vec/columns/column_nullable.h
+++ b/be/src/vec/columns/column_nullable.h
@@ -124,6 +124,15 @@ public:
                                                   dict_num);
     }
 
+    /// Insert `num` values from a contiguous buffer into the nested column.
+    /// Does nothing when `num` is 0. Otherwise the null map gets `fill(0, num)`
+    /// (presumably marking the new rows as not null -- confirm against the null map
+    /// column) and the arguments are forwarded unchanged to the nested column.
+    void insert_many_continuous_binary_data(const char* data, const uint32_t* 
offsets,
+                                            const size_t num) override {
+        if (UNLIKELY(num == 0)) {
+            return;
+        }
+        get_null_map_column().fill(0, num);
+        get_nested_column().insert_many_continuous_binary_data(data, offsets, 
num);
+    }
+
     void insert_many_binary_data(char* data_array, uint32_t* len_array,
                                  uint32_t* start_offset_array, size_t num) 
override {
         get_null_map_column().fill(0, num);
diff --git a/be/src/vec/columns/column_string.h 
b/be/src/vec/columns/column_string.h
index cd70e228b6..26a734fb08 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -164,6 +164,58 @@ public:
         offsets.push_back_without_reserve(new_size);
     }
 
+    /// Append `num` strings without reserving memory here.
+    ///
+    /// Before calling, the caller must compute the total size of the strings and reserve
+    /// room for it in `chars`, and for `num` more entries in `offsets`: the bytes are written
+    /// past the current end of `chars` first and `chars` is resized afterwards.
+    /// Strings that are adjacent in memory are merged into one run and copied with a
+    /// single memcpy.
+    void insert_many_strings_without_reserve(const StringRef* strings, size_t num) {
+        // Nothing to append; also avoids reading strings[0] from an empty array.
+        if (UNLIKELY(num == 0)) {
+            return;
+        }
+
+        size_t offset = chars.size();
+        // Start writing after the bytes already stored. Starting at chars.data() would
+        // overwrite existing rows whenever the column is not empty.
+        Char* data = chars.data() + offset;
+        size_t length = 0;
+
+        const char* ptr = strings[0].data;
+        for (size_t i = 0; i != num; i++) {
+            const size_t len = strings[i].size;
+            length += len;
+            offset += len;
+            offsets.push_back(offset);
+
+            // Keep extending the pending run while the next string starts right where
+            // this one ends.
+            if (i != num - 1 && strings[i].data + len == strings[i + 1].data) {
+                continue;
+            }
+            // Skip empty runs: memcpy must not be given a null source, even for 0 bytes.
+            if (length != 0) {
+                memcpy(data, ptr, length);
+                data += length;
+            }
+            if (LIKELY(i != num - 1)) {
+                ptr = strings[i + 1].data;
+                length = 0;
+            }
+        }
+        chars.resize(offset);
+    }
+
+    /// Append `num` rows whose bytes are laid out back to back in `data`.
+    /// Row i occupies the bytes [offsets_[i], offsets_[i + 1]). The whole byte range is
+    /// copied with one memcpy, then the row end offsets are rebased onto this column.
+    void insert_many_continuous_binary_data(const char* data, const uint32_t* offsets_,
+                                            const size_t num) override {
+        static_assert(sizeof(offsets_[0]) == sizeof(*offsets.data()));
+        if (UNLIKELY(num == 0)) {
+            return;
+        }
+
+        // Copy the payload in one go.
+        const auto prev_chars_size = chars.size();
+        const auto first_offset = offsets_[0];
+        const auto bytes_to_copy = offsets_[num] - first_offset;
+        if (LIKELY(bytes_to_copy > 0)) {
+            chars.resize(bytes_to_copy + prev_chars_size);
+            memcpy(chars.data() + prev_chars_size, data + first_offset, bytes_to_copy);
+        }
+
+        // Append the row ends, shifted from the source buffer's numbering to this column's.
+        const auto prev_rows = offsets.size();
+        auto last_end = offsets.back();
+        DCHECK(last_end == prev_chars_size);
+        offsets.resize(prev_rows + num);
+        auto* new_ends = &offsets[prev_rows];
+        for (size_t row = 0; row != num; ++row) {
+            new_ends[row] = last_end + offsets_[row + 1] - first_offset;
+        }
+        DCHECK(chars.size() == offsets.back());
+    }
+
     void insert_many_binary_data(char* data_array, uint32_t* len_array,
                                  uint32_t* start_offset_array, size_t num) 
override {
         size_t new_size = 0;
@@ -207,55 +259,6 @@ public:
         }
     }
 
-    void insert_many_continuous_strings(const StringRef* strings, size_t num) {
-        DCHECK_NE(num, 0);
-        offsets.reserve(offsets.size() + num);
-        std::vector<const char*> start_points(1);
-        auto& head = strings[0];
-        start_points[0] = head.data;
-        size_t new_size = head.size;
-        const char* cursor = head.data + new_size;
-        std::vector<const char*> end_points;
-
-        const size_t old_size = chars.size();
-        size_t offset = old_size;
-        offset += new_size;
-        offsets.push_back(offset);
-        if (num == 1) {
-            end_points.push_back(cursor);
-        } else {
-            for (size_t i = 1; i < num; i++) {
-                auto& str = strings[i];
-                if (cursor != str.data) {
-                    end_points.push_back(cursor);
-                    start_points.push_back(str.data);
-                    cursor = str.data;
-                }
-                size_t sz = str.size;
-                offset += sz;
-                new_size += sz;
-                cursor += sz;
-                offsets.push_back_without_reserve(offset);
-            }
-            end_points.push_back(cursor);
-        }
-        DCHECK_EQ(end_points.size(), start_points.size());
-
-        chars.resize(old_size + new_size);
-
-        size_t num_range = start_points.size();
-        Char* data = chars.data();
-
-        offset = old_size;
-        for (size_t i = 0; i < num_range; i++) {
-            uint32_t len = end_points[i] - start_points[i];
-            if (len) {
-                memcpy(data + offset, start_points[i], len);
-                offset += len;
-            }
-        }
-    }
-
     void insert_many_dict_data(const int32_t* data_array, size_t start_index, 
const StringRef* dict,
                                size_t num, uint32_t /*dict_num*/) override {
         size_t offset_size = offsets.size();
diff --git a/be/src/vec/columns/predicate_column.h 
b/be/src/vec/columns/predicate_column.h
index 128b33bf3e..23c165adcc 100644
--- a/be/src/vec/columns/predicate_column.h
+++ b/be/src/vec/columns/predicate_column.h
@@ -91,13 +91,17 @@ private:
     void insert_string_to_res_column(const uint16_t* sel, size_t sel_size,
                                      vectorized::ColumnString* res_ptr) {
         StringRef refs[sel_size];
+        size_t length = 0;
         for (size_t i = 0; i < sel_size; i++) {
             uint16_t n = sel[i];
             auto& sv = reinterpret_cast<StringValue&>(data[n]);
             refs[i].data = sv.ptr;
             refs[i].size = sv.len;
+            length += sv.len;
         }
-        res_ptr->insert_many_continuous_strings(refs, sel_size);
+        res_ptr->get_offsets().reserve(sel_size + 
res_ptr->get_offsets().size());
+        res_ptr->get_chars().reserve(length + res_ptr->get_chars().size());
+        res_ptr->insert_many_strings_without_reserve(refs, sel_size);
     }
 
     void insert_decimal_to_res_column(const uint16_t* sel, size_t sel_size,
@@ -256,6 +260,30 @@ public:
         }
     }
 
+    /// Append `num` string values laid out back to back in `data_`.
+    /// Value i occupies the bytes [offsets[i], offsets[i + 1]). Only instantiations with
+    /// T == StringValue do any work: the byte range is copied into memory obtained from
+    /// `_pool` (created on first use) and one {ptr, len} entry per value is appended to
+    /// `data`, pointing into that copy.
+    void insert_many_continuous_binary_data(const char* data_, const uint32_t* offsets,
+                                            const size_t num) override {
+        if (UNLIKELY(num == 0)) {
+            return;
+        }
+        if constexpr (std::is_same_v<T, StringValue>) {
+            if (_pool == nullptr) {
+                _pool.reset(new MemPool());
+            }
+
+            // One allocation and one copy for the whole payload.
+            const auto base = offsets[0];
+            const auto payload_size = offsets[num] - base;
+            char* copy = (char*)_pool->allocate(payload_size);
+            memcpy(copy, data_ + base, payload_size);
+
+            const size_t first_new = data.size();
+            data.resize(first_new + num);
+            auto* slots = &data[first_new];
+            for (size_t row = 0; row != num; ++row) {
+                slots[row].ptr = copy + offsets[row] - base;
+                slots[row].len = offsets[row + 1] - offsets[row];
+            }
+            DCHECK(slots[num - 1].ptr + slots[num - 1].len == copy + payload_size);
+        }
+    }
+
     void insert_many_binary_data(char* data_array, uint32_t* len_array,
                                  uint32_t* start_offset_array, size_t num) 
override {
         if (num == 0) {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to