This is an automated email from the ASF dual-hosted git repository.

gabriellee pushed a commit to branch opt_perf
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/opt_perf by this push:
     new ca59a30738 [Improvement](string) Optimize scanning for string (#12945)
ca59a30738 is described below

commit ca59a30738bac95f8fc29d430e119d3293f93482
Author: Gabriel <gabrielleeb...@gmail.com>
AuthorDate: Sat Sep 24 21:47:22 2022 +0800

    [Improvement](string) Optimize scanning for string (#12945)
---
 be/src/olap/rowset/segment_v2/binary_plain_page.h  | 27 +++++++++---
 be/src/olap/rowset/segment_v2/segment_iterator.cpp | 13 ++++--
 be/src/vec/columns/column_string.h                 | 49 ++++++++++++++++++++++
 be/src/vec/columns/predicate_column.h              |  2 +-
 4 files changed, 81 insertions(+), 10 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/binary_plain_page.h b/be/src/olap/rowset/segment_v2/binary_plain_page.h
index 1242fd9b75..659df55fee 100644
--- a/be/src/olap/rowset/segment_v2/binary_plain_page.h
+++ b/be/src/olap/rowset/segment_v2/binary_plain_page.h
@@ -248,12 +248,14 @@ public:
             return Status::OK();
         }
         const size_t max_fetch = std::min(*n, static_cast<size_t>(_num_elems - _cur_idx));
-
         uint32_t len_array[max_fetch];
         uint32_t start_offset_array[max_fetch];
-        for (int i = 0; i < max_fetch; i++, _cur_idx++) {
-            const uint32_t start_offset = offset(_cur_idx);
-            uint32_t len = offset(_cur_idx + 1) - start_offset;
+
+        uint32_t last_offset = guarded_offset(_cur_idx);
+        for (int i = 0; i < max_fetch - 1; i++, _cur_idx++) {
+            const uint32_t start_offset = last_offset;
+            last_offset = guarded_offset(_cur_idx + 1);
+            uint32_t len = last_offset - start_offset;
             len_array[i] = len;
             start_offset_array[i] = start_offset;
             if constexpr (Type == OLAP_FIELD_TYPE_OBJECT) {
@@ -262,6 +264,14 @@ public:
                 }
             }
         }
+        _cur_idx++;
+        len_array[max_fetch - 1] = offset(_cur_idx) - last_offset;
+        start_offset_array[max_fetch - 1] = last_offset;
+        if constexpr (Type == OLAP_FIELD_TYPE_OBJECT) {
+            if (_options.need_check_bitmap) {
+                RETURN_IF_ERROR(BitmapTypeCode::validate(*(_data.data + last_offset)));
+            }
+        }
         dst->insert_many_binary_data(_data.mutable_data(), len_array, start_offset_array,
                                      max_fetch);
 
@@ -340,13 +350,20 @@ public:
     }
 
 private:
+    static constexpr size_t SIZE_OF_INT32 = sizeof(uint32_t);
     // Return the offset within '_data' where the string value with index 'idx' can be found.
     uint32_t offset(size_t idx) const {
         if (idx >= _num_elems) {
             return _offsets_pos;
         }
         const uint8_t* p =
-                reinterpret_cast<const uint8_t*>(&_data[_offsets_pos + idx * sizeof(uint32_t)]);
+                reinterpret_cast<const uint8_t*>(&_data[_offsets_pos + idx * SIZE_OF_INT32]);
+        return decode_fixed32_le(p);
+    }
+
+    uint32_t guarded_offset(size_t idx) const {
+        const uint8_t* p =
+                reinterpret_cast<const uint8_t*>(&_data[_offsets_pos + idx * SIZE_OF_INT32]);
         return decode_fixed32_le(p);
     }
 
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 98f2cfae27..e6435e8be1 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -1150,8 +1150,11 @@ Status SegmentIterator::next_batch(vectorized::Block* block) {
         }
 
         if (!_lazy_materialization_read) {
-            Status ret = _output_column_by_sel_idx(block, _first_read_column_ids, sel_rowid_idx,
-                                                   selected_size);
+            Status ret = Status::OK();
+            if (selected_size > 0) {
+                ret = _output_column_by_sel_idx(block, _first_read_column_ids, sel_rowid_idx,
+                                                selected_size);
+            }
             if (!ret.ok()) {
                 return ret;
             }
@@ -1176,8 +1179,10 @@ Status SegmentIterator::next_batch(vectorized::Block* block) {
         // when lazy materialization enables, _first_read_column_ids = distinct(_short_cir_pred_column_ids + _vec_pred_column_ids)
         // see _vec_init_lazy_materialization
         // todo(wb) need to tell input columnids from output columnids
-        RETURN_IF_ERROR(_output_column_by_sel_idx(block, _first_read_column_ids, sel_rowid_idx,
-                                                  selected_size));
+        if (selected_size > 0) {
+            RETURN_IF_ERROR(_output_column_by_sel_idx(block, _first_read_column_ids, sel_rowid_idx,
+                                                      selected_size));
+        }
     }
 
     // shrink char_type suffix zero data
diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h
index 469bbdc6df..4dbaab3db4 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -207,6 +207,55 @@ public:
         }
     }
 
+    void insert_many_continuous_strings(const StringRef* strings, size_t num) {
+        DCHECK_NE(num, 0);
+        offsets.reserve(offsets.size() + num);
+        std::vector<const char*> start_points(1);
+        auto& head = strings[0];
+        start_points[0] = head.data;
+        size_t new_size = head.size;
+        const char* cursor = head.data + new_size;
+        std::vector<const char*> end_points;
+
+        const size_t old_size = chars.size();
+        size_t offset = old_size;
+        offset += new_size;
+        offsets.push_back(offset);
+        if (num == 1) {
+            end_points.push_back(cursor);
+        } else {
+            for (size_t i = 1; i < num; i++) {
+                auto& str = strings[i];
+                if (cursor != str.data) {
+                    end_points.push_back(cursor);
+                    start_points.push_back(str.data);
+                    cursor = str.data;
+                }
+                size_t sz = str.size;
+                offset += sz;
+                new_size += sz;
+                cursor += sz;
+                offsets.push_back_without_reserve(offset);
+            }
+            end_points.push_back(cursor);
+        }
+        DCHECK_EQ(end_points.size(), start_points.size());
+
+        chars.resize(old_size + new_size);
+
+        size_t num_range = start_points.size();
+        Char* data = chars.data();
+
+        offset = old_size;
+        for (size_t i = 0; i < num_range; i++) {
+            uint32_t len = end_points[i] - start_points[i];
+            if (len) {
+                memcpy(data + offset, start_points[i], len);
+                offset += len;
+            }
+        }
+    }
+
     void insert_many_dict_data(const int32_t* data_array, size_t start_index, const StringRef* dict,
                                size_t num, uint32_t /*dict_num*/) override {
         size_t offset_size = offsets.size();
diff --git a/be/src/vec/columns/predicate_column.h b/be/src/vec/columns/predicate_column.h
index d5ad52b6ac..01a90c9eb9 100644
--- a/be/src/vec/columns/predicate_column.h
+++ b/be/src/vec/columns/predicate_column.h
@@ -97,7 +97,7 @@ private:
             refs[i].data = sv.ptr;
             refs[i].size = sv.len;
         }
-        res_ptr->insert_many_strings(refs, sel_size);
+        res_ptr->insert_many_continuous_strings(refs, sel_size);
     }
 
     void insert_decimal_to_res_column(const uint16_t* sel, size_t sel_size,


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to