This is an automated email from the ASF dual-hosted git repository.

panxiaolei pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 2b5b9577346 [Bug](join) fix columnstr64's offset overflow on 
serialize_value_into… #46461 (#46939)
2b5b9577346 is described below

commit 2b5b95773463c55d392fba7029f0ca2a1d47e561
Author: Pxl <x...@selectdb.com>
AuthorDate: Tue Jan 14 16:36:19 2025 +0800

    [Bug](join) fix columnstr64's offset overflow on serialize_value_into… 
#46461 (#46939)
    
    …_arena (#46461)
    
    ```sql
    mysql [test]>select /*+ LEADING(a,b) */ count(*) from d_table as a, 
d_table2 as b where a.k4=b.k4 and a.k1=b.k1;
    
    +----------+
    | count(*) |
    +----------+
    | 50000000 |
    +----------+
    1 row in set (4.87 sec)
    
    mysql [test]>select /*+ LEADING(b,a) */ count(*) from d_table as a, 
d_table2 as b where a.k4=b.k4 and a.k1=b.k1;
    
    +----------+
    | count(*) |
    +----------+
    | 42949673 |
    +----------+
    1 row in set (21.32 sec)
    ```
    
    None
    
    - Test <!-- At least one of them must be included. -->
        - [x] Regression test
        - [ ] Unit Test
        - [ ] Manual test (add detailed scripts or steps below)
        - [ ] No need to test or manual test. Explain why:
    - [ ] This is a refactor/code format and no logic has been changed.
    - [ ] Previous test can cover this change. - [ ] No code files have been
    changed. - [ ] Other reason <!-- Add your reason? -->
    
    - Behavior changed:
        - [x] No.
        - [ ] Yes. <!-- Explain the behavior change -->
    
    - Does this need documentation?
        - [x] No.
    - [ ] Yes. <!-- Add document PR link here. eg:
    https://github.com/apache/doris-website/pull/1214 -->
    
    - [ ] Confirm the release note
    - [ ] Confirm test cases
    - [ ] Confirm document
    - [ ] Add branch pick label <!-- Add branch pick label that this PR
    should merge into -->
    
    ### What problem does this PR solve?
    
    Issue Number: close #xxx
    
    Related PR: #xxx
    
    Problem Summary:
    
    ### Release note
    
    None
    
    ### Check List (For Author)
    
    - Test <!-- At least one of them must be included. -->
        - [ ] Regression test
        - [ ] Unit Test
        - [ ] Manual test (add detailed scripts or steps below)
        - [ ] No need to test or manual test. Explain why:
    - [ ] This is a refactor/code format and no logic has been changed.
            - [ ] Previous test can cover this change.
            - [ ] No code files have been changed.
            - [ ] Other reason <!-- Add your reason?  -->
    
    - Behavior changed:
        - [ ] No.
        - [ ] Yes. <!-- Explain the behavior change -->
    
    - Does this need documentation?
        - [ ] No.
    - [ ] Yes. <!-- Add document PR link here. eg:
    https://github.com/apache/doris-website/pull/1214 -->
    
    ### Check List (For Reviewer who merge this PR)
    
    - [ ] Confirm the release note
    - [ ] Confirm test cases
    - [ ] Confirm document
    - [ ] Add branch pick label <!-- Add branch pick label that this PR
    should merge into -->
---
 be/src/vec/columns/column_string.cpp               | 20 ++++----
 be/src/vec/columns/column_string.h                 |  9 ++--
 .../query_p1/str64_serialize/str64_serialize.out   |  7 +++
 .../str64_serialize/str64_serialize.groovy         | 57 ++++++++++++++++++++++
 4 files changed, 80 insertions(+), 13 deletions(-)

diff --git a/be/src/vec/columns/column_string.cpp 
b/be/src/vec/columns/column_string.cpp
index 6eb3e45b2e0..8bcc6565467 100644
--- a/be/src/vec/columns/column_string.cpp
+++ b/be/src/vec/columns/column_string.cpp
@@ -380,8 +380,8 @@ ColumnPtr ColumnStr<T>::permute(const IColumn::Permutation& 
perm, size_t limit)
 template <typename T>
 StringRef ColumnStr<T>::serialize_value_into_arena(size_t n, Arena& arena,
                                                    char const*& begin) const {
-    uint32_t string_size(size_at(n));
-    uint32_t offset(offset_at(n));
+    auto string_size(size_at(n));
+    auto offset(offset_at(n));
 
     StringRef res;
     res.size = sizeof(string_size) + string_size;
@@ -395,7 +395,7 @@ StringRef ColumnStr<T>::serialize_value_into_arena(size_t 
n, Arena& arena,
 
 template <typename T>
 const char* ColumnStr<T>::deserialize_and_insert_from_arena(const char* pos) {
-    const uint32_t string_size = unaligned_load<uint32_t>(pos);
+    const auto string_size = unaligned_load<uint32_t>(pos);
     pos += sizeof(string_size);
 
     const size_t old_size = chars.size();
@@ -413,7 +413,7 @@ size_t ColumnStr<T>::get_max_row_byte_size() const {
     size_t max_size = 0;
     size_t num_rows = offsets.size();
     for (size_t i = 0; i < num_rows; ++i) {
-        max_size = std::max(max_size, size_at(i));
+        max_size = std::max(max_size, size_t(size_at(i)));
     }
 
     return max_size + sizeof(uint32_t);
@@ -423,8 +423,8 @@ template <typename T>
 void ColumnStr<T>::serialize_vec(std::vector<StringRef>& keys, size_t num_rows,
                                  size_t max_row_byte_size) const {
     for (size_t i = 0; i < num_rows; ++i) {
-        uint32_t offset(offset_at(i));
-        uint32_t string_size(size_at(i));
+        auto offset(offset_at(i));
+        auto string_size(size_at(i));
 
         auto* ptr = const_cast<char*>(keys[i].data + keys[i].size);
         memcpy_fixed<uint32_t>(ptr, (char*)&string_size);
@@ -450,7 +450,7 @@ void 
ColumnStr<T>::serialize_vec_with_null_map(std::vector<StringRef>& keys, siz
                 UInt32 offset(offset_at(i));
                 UInt32 string_size(size_at(i));
 
-                memcpy_fixed<UInt32>(dest + 1, (char*)&string_size);
+                memcpy_fixed<uint32_t>(dest + 1, (char*)&string_size);
                 memcpy(dest + 1 + sizeof(string_size), &chars[offset], 
string_size);
                 keys[i].size += sizeof(string_size) + string_size + 
sizeof(UInt8);
             } else {
@@ -467,7 +467,7 @@ void 
ColumnStr<T>::serialize_vec_with_null_map(std::vector<StringRef>& keys, siz
             UInt32 offset(offset_at(i));
             UInt32 string_size(size_at(i));
 
-            memcpy_fixed<UInt32>(dest + 1, (char*)&string_size);
+            memcpy_fixed<uint32_t>(dest + 1, (char*)&string_size);
             memcpy(dest + 1 + sizeof(string_size), &chars[offset], 
string_size);
             keys[i].size += sizeof(string_size) + string_size + sizeof(UInt8);
         }
@@ -477,7 +477,7 @@ void 
ColumnStr<T>::serialize_vec_with_null_map(std::vector<StringRef>& keys, siz
 template <typename T>
 void ColumnStr<T>::deserialize_vec(std::vector<StringRef>& keys, const size_t 
num_rows) {
     for (size_t i = 0; i != num_rows; ++i) {
-        auto original_ptr = keys[i].data;
+        const auto* original_ptr = keys[i].data;
         keys[i].data = deserialize_and_insert_from_arena(original_ptr);
         keys[i].size -= keys[i].data - original_ptr;
     }
@@ -488,7 +488,7 @@ void 
ColumnStr<T>::deserialize_vec_with_null_map(std::vector<StringRef>& keys,
                                                  const size_t num_rows, const 
uint8_t* null_map) {
     for (size_t i = 0; i != num_rows; ++i) {
         if (null_map[i] == 0) {
-            auto original_ptr = keys[i].data;
+            const auto* original_ptr = keys[i].data;
             keys[i].data = deserialize_and_insert_from_arena(original_ptr);
             keys[i].size -= keys[i].data - original_ptr;
         } else {
diff --git a/be/src/vec/columns/column_string.h 
b/be/src/vec/columns/column_string.h
index 906f62b52aa..e696b6f0764 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -87,8 +87,11 @@ private:
 
     size_t ALWAYS_INLINE offset_at(ssize_t i) const { return offsets[i - 1]; }
 
-    /// Size of i-th element, including terminating zero.
-    size_t ALWAYS_INLINE size_at(ssize_t i) const { return offsets[i] - 
offsets[i - 1]; }
+    // Size of i-th element, including terminating zero.
+    // assume that the length of a single element is less than 32-bit
+    uint32_t ALWAYS_INLINE size_at(ssize_t i) const {
+        return uint32_t(offsets[i] - offsets[i - 1]);
+    }
 
     template <bool positive>
     struct less;
@@ -373,7 +376,7 @@ public:
 
         for (size_t i = start_index; i < start_index + num; i++) {
             int32_t codeword = data_array[i];
-            auto& src = dict[codeword];
+            const auto& src = dict[codeword];
             memcpy(chars.data() + old_size, src.data, src.size);
             old_size += src.size;
         }
diff --git a/regression-test/data/query_p1/str64_serialize/str64_serialize.out 
b/regression-test/data/query_p1/str64_serialize/str64_serialize.out
new file mode 100644
index 00000000000..99c168b99d4
--- /dev/null
+++ b/regression-test/data/query_p1/str64_serialize/str64_serialize.out
@@ -0,0 +1,7 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !test --
+50000000
+
+-- !test --
+50000000
+
diff --git 
a/regression-test/suites/query_p1/str64_serialize/str64_serialize.groovy 
b/regression-test/suites/query_p1/str64_serialize/str64_serialize.groovy
new file mode 100644
index 00000000000..b0e3ffa99e9
--- /dev/null
+++ b/regression-test/suites/query_p1/str64_serialize/str64_serialize.groovy
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("str64_serialize") {
+    
+    sql """ DROP TABLE IF EXISTS d_table; """
+    sql """ DROP TABLE IF EXISTS d_table2; """
+
+
+    sql """
+            create table d_table (
+                k1 int null,
+                k2 int not null,
+                k3 bigint null,
+                k4 varchar(100) null
+            )
+            duplicate key (k1,k2,k3)
+            distributed BY hash(k1) buckets 3
+            properties("replication_num" = "1");
+        """
+    sql """
+            create table d_table2 (
+                k1 int null,
+                k2 int not null,
+                k3 bigint null,
+                k4 varchar(100) null
+            )
+            duplicate key (k1,k2,k3)
+            distributed BY hash(k1) buckets 3
+            properties("replication_num" = "1");
+        """
+
+    sql """insert into d_table select 
1,1,1,'1234567890abcdefghigalsdhaluihdicandejionxaoxwdeuhwenudzmwoedxneiowdxiowedjxneiowdjixoneiiexdnuiexef'
 from (select 1 k1) as t lateral view explode_numbers(50000000) tmp1 as e1;
+"""
+
+    sql """insert into d_table2 select 
1,1,1,'1234567890abcdefghigalsdhaluihdicandejionxaoxwdeuhwenudzmwoedxneiowdxiowedjxneiowdjixoneiiexdnuiexef';
+"""
+    sql "set parallel_pipeline_task_num=1;"
+
+    qt_test "select /*+ LEADING(a,b) */ count(*) from d_table as a, d_table2 
as b where a.k4=b.k4 and a.k1=b.k1;"
+    qt_test "select /*+ LEADING(b,a) */ count(*) from d_table as a, d_table2 
as b where a.k4=b.k4 and a.k1=b.k1;"
+}
+


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to