This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new d4712aed1a9 branch-2.1: [fix](string64) fix coredump caused by 
ColumnArray<ColumnStr<uint64_t>>::insert_indices_from (#43862)
d4712aed1a9 is described below

commit d4712aed1a9c74494b6cf1319e4f60c590105232
Author: github-actions[bot] 
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Wed Nov 13 19:31:11 2024 +0800

    branch-2.1: [fix](string64) fix coredump caused by 
ColumnArray<ColumnStr<uint64_t>>::insert_indices_from (#43862)
    
    Cherry-picked from #43624
    
    Co-authored-by: TengJianPing <tengjianp...@selectdb.com>
---
 be/src/vec/columns/column_array.cpp     |   3 +-
 be/src/vec/columns/column_string.cpp    |  65 ++++++++------
 be/test/vec/core/column_array_test.cpp  |  47 ++++++++++
 be/test/vec/core/column_map_test.cpp    | 155 ++++++++++++++++++++++++++++++++
 be/test/vec/core/column_string_test.cpp |  38 ++++++++
 be/test/vec/core/column_struct_test.cpp |  79 ++++++++++++++++
 6 files changed, 361 insertions(+), 26 deletions(-)

diff --git a/be/src/vec/columns/column_array.cpp 
b/be/src/vec/columns/column_array.cpp
index 110c7f492b1..c735e4413ff 100644
--- a/be/src/vec/columns/column_array.cpp
+++ b/be/src/vec/columns/column_array.cpp
@@ -431,7 +431,8 @@ void ColumnArray::insert_from(const IColumn& src_, size_t 
n) {
 
     if (!get_data().is_nullable() && src.get_data().is_nullable()) {
         // Note: we can't process the case of 'Array(Nullable(nest))'
-        DCHECK(false);
+        throw Exception(ErrorCode::INTERNAL_ERROR, "insert '{}' into '{}'", 
src.get_name(),
+                        get_name());
     } else if (get_data().is_nullable() && !src.get_data().is_nullable()) {
         // Note: here we should process the case of 'Array(NotNullable(nest))'
         reinterpret_cast<ColumnNullable*>(&get_data())
diff --git a/be/src/vec/columns/column_string.cpp 
b/be/src/vec/columns/column_string.cpp
index 3799bab34f6..bf3e2b5c753 100644
--- a/be/src/vec/columns/column_string.cpp
+++ b/be/src/vec/columns/column_string.cpp
@@ -22,7 +22,6 @@
 
 #include <algorithm>
 #include <boost/iterator/iterator_facade.hpp>
-#include <ostream>
 
 #include "util/simd/bits.h"
 #include "vec/columns/columns_common.h"
@@ -93,6 +92,13 @@ MutableColumnPtr ColumnStr<T>::get_shrinked_column() {
     return shrinked_column;
 }
 
+// This method is only called by MutableBlock::merge_ignore_overflow
+// by hash join operator to collect build data to avoid
+// the total string length of a ColumnStr<uint32_t> column exceeds the 4G 
limit.
+//
+// After finishing collecting build data, a ColumnStr<uint32_t> column
+// will be converted to ColumnStr<uint64_t> if the total string length
+// exceeds the 4G limit by calling Block::replace_if_overflow.
 template <typename T>
 void ColumnStr<T>::insert_range_from_ignore_overflow(const 
doris::vectorized::IColumn& src,
                                                      size_t start, size_t 
length) {
@@ -122,6 +128,8 @@ void ColumnStr<T>::insert_range_from_ignore_overflow(const 
doris::vectorized::IC
         offsets.resize(old_size + length);
 
         for (size_t i = 0; i < length; ++i) {
+            // unsinged integer overflow is well defined in C++,
+            // so we don't need to check the overflow here.
             offsets[old_size + i] =
                     src_concrete.offsets[start + i] - nested_offset + 
prev_max_offset;
         }
@@ -133,34 +141,41 @@ void ColumnStr<T>::insert_range_from(const IColumn& src, 
size_t start, size_t le
     if (length == 0) {
         return;
     }
+    auto do_insert = [&](const auto& src_concrete) {
+        const auto& src_offsets = src_concrete.get_offsets();
+        const auto& src_chars = src_concrete.get_chars();
+        if (start + length > src_offsets.size()) {
+            throw doris::Exception(
+                    doris::ErrorCode::INTERNAL_ERROR,
+                    "Parameter out of bound in 
IColumnStr<T>::insert_range_from method.");
+        }
+        size_t nested_offset = src_offsets[static_cast<ssize_t>(start) - 1];
+        size_t nested_length = src_offsets[start + length - 1] - nested_offset;
 
-    const auto& src_concrete = assert_cast<const ColumnStr<T>&>(src);
-
-    if (start + length > src_concrete.offsets.size()) {
-        throw doris::Exception(
-                doris::ErrorCode::INTERNAL_ERROR,
-                "Parameter out of bound in IColumnStr<T>::insert_range_from 
method.");
-    }
-
-    size_t nested_offset = src_concrete.offset_at(start);
-    size_t nested_length = src_concrete.offsets[start + length - 1] - 
nested_offset;
-
-    size_t old_chars_size = chars.size();
-    check_chars_length(old_chars_size + nested_length, offsets.size() + 
length);
-    chars.resize(old_chars_size + nested_length);
-    memcpy(&chars[old_chars_size], &src_concrete.chars[nested_offset], 
nested_length);
+        size_t old_chars_size = chars.size();
+        check_chars_length(old_chars_size + nested_length, offsets.size() + 
length);
+        chars.resize(old_chars_size + nested_length);
+        memcpy(&chars[old_chars_size], &src_chars[nested_offset], 
nested_length);
 
-    if (start == 0 && offsets.empty()) {
-        offsets.assign(src_concrete.offsets.begin(), 
src_concrete.offsets.begin() + length);
-    } else {
-        size_t old_size = offsets.size();
-        size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see 
PaddedPODArray
-        offsets.resize(old_size + length);
+        using OffsetsType = std::decay_t<decltype(src_offsets)>;
+        if (std::is_same_v<T, typename OffsetsType::value_type> && start == 0 
&& offsets.empty()) {
+            offsets.assign(src_offsets.begin(), src_offsets.begin() + length);
+        } else {
+            size_t old_size = offsets.size();
+            size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see 
PaddedPODArray
+            offsets.resize(old_size + length);
 
-        for (size_t i = 0; i < length; ++i) {
-            offsets[old_size + i] =
-                    src_concrete.offsets[start + i] - nested_offset + 
prev_max_offset;
+            for (size_t i = 0; i < length; ++i) {
+                offsets[old_size + i] = src_offsets[start + i] - nested_offset 
+ prev_max_offset;
+            }
         }
+    };
+    // insert_range_from maybe called by 
ColumnArray::insert_indices_from(which is used by hash join operator),
+    // so we need to support both ColumnStr<uint32_t> and ColumnStr<uint64_t>
+    if (src.is_column_string64()) {
+        do_insert(assert_cast<const ColumnStr<uint64_t>&>(src));
+    } else {
+        do_insert(assert_cast<const ColumnStr<uint32_t>&>(src));
     }
 }
 
diff --git a/be/test/vec/core/column_array_test.cpp 
b/be/test/vec/core/column_array_test.cpp
index c371dda25e8..0c8a486b5bf 100644
--- a/be/test/vec/core/column_array_test.cpp
+++ b/be/test/vec/core/column_array_test.cpp
@@ -108,6 +108,53 @@ TEST(ColumnArrayTest, StringArrayTest) {
     }
 }
 
+TEST(ColumnArrayTest, String64ArrayTest) {
+    auto off_column = ColumnVector<ColumnArray::Offset64>::create();
+    auto str64_column = ColumnString64::create();
+    // init column array with [["abc","d"],["ef"],[], [""]];
+    std::vector<ColumnArray::Offset64> offs = {0, 2, 3, 3, 4};
+    std::vector<std::string> vals = {"abc", "d", "ef", ""};
+    for (size_t i = 1; i < offs.size(); ++i) {
+        off_column->insert_data((const char*)(&offs[i]), 0);
+    }
+    for (auto& v : vals) {
+        str64_column->insert_data(v.data(), v.size());
+    }
+
+    ColumnArray str64_array_column(std::move(str64_column), 
std::move(off_column));
+    EXPECT_EQ(str64_array_column.size(), offs.size() - 1);
+    for (size_t i = 0; i < str64_array_column.size(); ++i) {
+        auto v = get<Array>(str64_array_column[i]);
+        EXPECT_EQ(v.size(), offs[i + 1] - offs[i]);
+        for (size_t j = 0; j < v.size(); ++j) {
+            EXPECT_EQ(vals[offs[i] + j], get<std::string>(v[j]));
+        }
+    }
+    // test insert ColumnArray<ColumnStr<uint64_t>> into 
ColumnArray<ColumnStr<uint32_t>>
+    auto str32_column = ColumnString::create();
+    ColumnArray str32_array_column(std::move(str32_column));
+    std::vector<uint32_t> indices;
+    indices.push_back(0);
+    indices.push_back(1);
+    indices.push_back(3);
+    str32_array_column.insert_indices_from(str64_array_column, indices.data(),
+                                           indices.data() + indices.size());
+    EXPECT_EQ(str32_array_column.size(), 3);
+
+    auto v = get<Array>(str32_array_column[0]);
+    EXPECT_EQ(v.size(), 2);
+    EXPECT_EQ(get<std::string>(v[0]), vals[0]);
+    EXPECT_EQ(get<std::string>(v[1]), vals[1]);
+
+    v = get<Array>(str32_array_column[1]);
+    EXPECT_EQ(v.size(), 1);
+    EXPECT_EQ(get<std::string>(v[0]), vals[2]);
+
+    v = get<Array>(str32_array_column[2]);
+    EXPECT_EQ(v.size(), 1);
+    EXPECT_EQ(get<std::string>(v[0]), vals[3]);
+}
+
 TEST(ColumnArrayTest, IntArrayPermuteTest) {
     auto off_column = ColumnVector<ColumnArray::Offset64>::create();
     auto data_column = ColumnVector<int32_t>::create();
diff --git a/be/test/vec/core/column_map_test.cpp 
b/be/test/vec/core/column_map_test.cpp
new file mode 100644
index 00000000000..d59247dae33
--- /dev/null
+++ b/be/test/vec/core/column_map_test.cpp
@@ -0,0 +1,155 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vec/columns/column_map.h"
+
+#include <gtest/gtest-message.h>
+#include <gtest/gtest-test-part.h>
+
+#include "gtest/gtest_pred_impl.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_string.h"
+#include "vec/columns/column_vector.h"
+#include "vec/core/field.h"
+
+namespace doris::vectorized {
+TEST(ColumnMapTest, StringKeyTest) {
+    auto col_map_str64 = ColumnMap(ColumnString64::create(), 
ColumnInt64::create(),
+                                   ColumnArray::ColumnOffsets::create());
+    Array k1 = {"a", "b", "c"};
+    Array v1 = {1, 2, 3};
+    {
+        Map map;
+        map.push_back(k1);
+        map.push_back(v1);
+        col_map_str64.insert(map);
+    }
+    Array k2 = {"aa", "bb", "cc"};
+    Array v2 = {11, 22, 33};
+    {
+        Map map;
+        map.push_back(k2);
+        map.push_back(v2);
+        col_map_str64.insert(map);
+    }
+    Array k3 = {"aaa", "bbb", "ccc"};
+    Array v3 = {111, 222, 333};
+    {
+        Map map;
+        map.push_back(k3);
+        map.push_back(v3);
+        col_map_str64.insert(map);
+    }
+
+    // test insert ColumnMap<ColumnStr<uint64_t>, Column> into 
ColumnMap<ColumnStr<uint32_t>, Column>
+    auto col_map_str32 = ColumnMap(ColumnString::create(), 
ColumnInt64::create(),
+                                   ColumnArray::ColumnOffsets::create());
+    std::vector<uint32_t> indices;
+    indices.push_back(0);
+    indices.push_back(2);
+    col_map_str32.insert_indices_from(col_map_str64, indices.data(),
+                                      indices.data() + indices.size());
+    EXPECT_EQ(col_map_str32.size(), 2);
+
+    auto map = get<Map>(col_map_str32[0]);
+    auto k = get<Array>(map[0]);
+    auto v = get<Array>(map[1]);
+    EXPECT_EQ(k.size(), 3);
+    for (size_t i = 0; i < k.size(); ++i) {
+        EXPECT_EQ(k[i], k1[i]);
+    }
+    EXPECT_EQ(v.size(), 3);
+    for (size_t i = 0; i < v.size(); ++i) {
+        EXPECT_EQ(v[i], v1[i]);
+    }
+
+    map = get<Map>(col_map_str32[1]);
+    k = get<Array>(map[0]);
+    v = get<Array>(map[1]);
+    EXPECT_EQ(k.size(), 3);
+    for (size_t i = 0; i < k.size(); ++i) {
+        EXPECT_EQ(k[i], k3[i]);
+    }
+    EXPECT_EQ(v.size(), 3);
+    for (size_t i = 0; i < v.size(); ++i) {
+        EXPECT_EQ(v[i], v3[i]);
+    }
+};
+
+TEST(ColumnMapTest, StringValueTest) {
+    auto col_map_str64 = ColumnMap(ColumnInt64::create(), 
ColumnString64::create(),
+                                   ColumnArray::ColumnOffsets::create());
+    Array k1 = {1, 2, 3};
+    Array v1 = {"a", "b", "c"};
+    {
+        Map map;
+        map.push_back(k1);
+        map.push_back(v1);
+        col_map_str64.insert(map);
+    }
+    Array k2 = {11, 22, 33};
+    Array v2 = {"aa", "bb", "cc"};
+    {
+        Map map;
+        map.push_back(k2);
+        map.push_back(v2);
+        col_map_str64.insert(map);
+    }
+    Array k3 = {111, 222, 333};
+    Array v3 = {"aaa", "bbb", "ccc"};
+    {
+        Map map;
+        map.push_back(k3);
+        map.push_back(v3);
+        col_map_str64.insert(map);
+    }
+
+    // test insert ColumnMap<ColumnStr<uint64_t>, Column> into 
ColumnMap<ColumnStr<uint32_t>, Column>
+    auto col_map_str32 = ColumnMap(ColumnInt64::create(), 
ColumnString::create(),
+                                   ColumnArray::ColumnOffsets::create());
+    std::vector<uint32_t> indices;
+    indices.push_back(0);
+    indices.push_back(2);
+    col_map_str32.insert_indices_from(col_map_str64, indices.data(),
+                                      indices.data() + indices.size());
+    EXPECT_EQ(col_map_str32.size(), 2);
+
+    auto map = get<Map>(col_map_str32[0]);
+    auto k = get<Array>(map[0]);
+    auto v = get<Array>(map[1]);
+    EXPECT_EQ(k.size(), 3);
+    for (size_t i = 0; i < k.size(); ++i) {
+        EXPECT_EQ(k[i], k1[i]);
+    }
+    EXPECT_EQ(v.size(), 3);
+    for (size_t i = 0; i < v.size(); ++i) {
+        EXPECT_EQ(v[i], v1[i]);
+    }
+
+    map = get<Map>(col_map_str32[1]);
+    k = get<Array>(map[0]);
+    v = get<Array>(map[1]);
+    EXPECT_EQ(k.size(), 3);
+    for (size_t i = 0; i < k.size(); ++i) {
+        EXPECT_EQ(k[i], k3[i]);
+    }
+    EXPECT_EQ(v.size(), 3);
+    for (size_t i = 0; i < v.size(); ++i) {
+        EXPECT_EQ(v[i], v3[i]);
+    }
+};
+} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/test/vec/core/column_string_test.cpp 
b/be/test/vec/core/column_string_test.cpp
index 81f41bd11c4..aea6d87b323 100644
--- a/be/test/vec/core/column_string_test.cpp
+++ b/be/test/vec/core/column_string_test.cpp
@@ -19,6 +19,7 @@
 
 #include <gtest/gtest.h>
 
+#include "vec/columns/column.h"
 #include "vec/core/block.h"
 #include "vec/data_types/data_type_string.h"
 #include "vec/functions/function_string.h"
@@ -56,4 +57,41 @@ TEST(ColumnStringTest, TestConcat) {
     auto actual_res_col_str = assert_cast<const 
ColumnString*>(actual_res_col.get());
     actual_res_col_str->sanity_check();
 }
+
+TEST(ColumnStringTest, TestStringInsert) {
+    {
+        auto str32_column = ColumnString::create();
+        std::vector<std::string> vals_tmp = {"x", "yy", "zzz", ""};
+        auto str32_column_tmp = ColumnString::create();
+        for (auto& v : vals_tmp) {
+            str32_column_tmp->insert_data(v.data(), v.size());
+        }
+        str32_column->insert_range_from(*str32_column_tmp, 0, vals_tmp.size());
+        str32_column->insert_range_from(*str32_column_tmp, 0, vals_tmp.size());
+        auto row_count = str32_column->size();
+        EXPECT_EQ(row_count, vals_tmp.size() * 2);
+        for (size_t i = 0; i < row_count; ++i) {
+            auto row_data = str32_column->get_data_at(i);
+            EXPECT_EQ(row_data.to_string(), vals_tmp[i % vals_tmp.size()]);
+        }
+    }
+
+    {
+        // test insert ColumnString64 to ColumnString
+        auto str32_column = ColumnString::create();
+        std::vector<std::string> vals_tmp = {"x", "yy", "zzz", ""};
+        auto str64_column_tmp = ColumnString64::create();
+        for (auto& v : vals_tmp) {
+            str64_column_tmp->insert_data(v.data(), v.size());
+        }
+        str32_column->insert_range_from(*str64_column_tmp, 0, vals_tmp.size());
+        str32_column->insert_range_from(*str64_column_tmp, 0, vals_tmp.size());
+        auto row_count = str32_column->size();
+        EXPECT_EQ(row_count, vals_tmp.size() * 2);
+        for (size_t i = 0; i < row_count; ++i) {
+            auto row_data = str32_column->get_data_at(i);
+            EXPECT_EQ(row_data.to_string(), vals_tmp[i % vals_tmp.size()]);
+        }
+    }
+}
 } // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/test/vec/core/column_struct_test.cpp 
b/be/test/vec/core/column_struct_test.cpp
new file mode 100644
index 00000000000..1720b01638c
--- /dev/null
+++ b/be/test/vec/core/column_struct_test.cpp
@@ -0,0 +1,79 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vec/columns/column_struct.h"
+
+#include <gtest/gtest-message.h>
+#include <gtest/gtest-test-part.h>
+
+#include <cstddef>
+
+#include "gtest/gtest_pred_impl.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_string.h"
+#include "vec/columns/columns_number.h"
+#include "vec/core/field.h"
+
+namespace doris::vectorized {
+TEST(ColumnStructTest, StringTest) {
+    auto str64_column = ColumnString64::create();
+    auto i32_column = ColumnInt32::create();
+
+    std::vector<std::string> str_vals = {"aaa", "bbb", "ccc", "ddd"};
+    std::vector<int> int_vals = {111, 222, 333, 444};
+    for (size_t i = 0; i < str_vals.size(); ++i) {
+        str64_column->insert_data(str_vals[i].data(), str_vals[i].size());
+        i32_column->insert_data((const char*)(&int_vals[i]), 0);
+    }
+
+    std::vector<ColumnPtr> vector_columns;
+    vector_columns.emplace_back(str64_column->get_ptr());
+    vector_columns.emplace_back(i32_column->get_ptr());
+    auto str64_struct_column = ColumnStruct::create(vector_columns);
+
+    auto str32_column = ColumnString::create();
+    auto i32_column2 = ColumnInt32::create();
+    std::vector<ColumnPtr> vector_columns2;
+    vector_columns2.emplace_back(str32_column->get_ptr());
+    vector_columns2.emplace_back(i32_column2->get_ptr());
+    auto str32_struct_column = ColumnStruct::create(vector_columns2);
+
+    std::vector<uint32_t> indices;
+    indices.push_back(0);
+    indices.push_back(2);
+    indices.push_back(3);
+    std::move(*str32_struct_column)
+            .mutate()
+            ->insert_indices_from(*str64_struct_column, indices.data(),
+                                  indices.data() + indices.size());
+    EXPECT_EQ(str32_struct_column->size(), indices.size());
+    auto t = get<Tuple>(str32_struct_column->operator[](0));
+    EXPECT_EQ(t.size(), 2);
+    EXPECT_EQ(t[0], "aaa");
+    EXPECT_EQ(t[1], 111);
+
+    t = get<Tuple>(str32_struct_column->operator[](1));
+    EXPECT_EQ(t.size(), 2);
+    EXPECT_EQ(t[0], "ccc");
+    EXPECT_EQ(t[1], 333);
+
+    t = get<Tuple>(str32_struct_column->operator[](2));
+    EXPECT_EQ(t.size(), 2);
+    EXPECT_EQ(t[0], "ddd");
+    EXPECT_EQ(t[1], 444);
+};
+} // namespace doris::vectorized
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to