This is an automated email from the ASF dual-hosted git repository. panxiaolei pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push: new 2b5b9577346 [Bug](join) fix columnstr64's offset overflow on serialize_value_into… #46461 (#46939) 2b5b9577346 is described below commit 2b5b95773463c55d392fba7029f0ca2a1d47e561 Author: Pxl <x...@selectdb.com> AuthorDate: Tue Jan 14 16:36:19 2025 +0800 [Bug](join) fix columnstr64's offset overflow on serialize_value_into… #46461 (#46939) …_arena (#46461) ```sql mysql [test]>select /*+ LEADING(a,b) */ count(*) from d_table as a, d_table2 as b where a.k4=b.k4 and a.k1=b.k1; +----------+ | count(*) | +----------+ | 50000000 | +----------+ 1 row in set (4.87 sec) mysql [test]>select /*+ LEADING(b,a) */ count(*) from d_table as a, d_table2 as b where a.k4=b.k4 and a.k1=b.k1; +----------+ | count(*) | +----------+ | 42949673 | +----------+ 1 row in set (21.32 sec) ``` None - Test <!-- At least one of them must be included. --> - [x] Regression test - [ ] Unit Test - [ ] Manual test (add detailed scripts or steps below) - [ ] No need to test or manual test. Explain why: - [ ] This is a refactor/code format and no logic has been changed. - [ ] Previous test can cover this change. - [ ] No code files have been changed. - [ ] Other reason <!-- Add your reason? --> - Behavior changed: - [x] No. - [ ] Yes. <!-- Explain the behavior change --> - Does this need documentation? - [x] No. - [ ] Yes. <!-- Add document PR link here. eg: https://github.com/apache/doris-website/pull/1214 --> - [ ] Confirm the release note - [ ] Confirm test cases - [ ] Confirm document - [ ] Add branch pick label <!-- Add branch pick label that this PR should merge into --> ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: ### Release note None ### Check List (For Author) - Test <!-- At least one of them must be included. --> - [ ] Regression test - [ ] Unit Test - [ ] Manual test (add detailed scripts or steps below) - [ ] No need to test or manual test. Explain why: - [ ] This is a refactor/code format and no logic has been changed. - [ ] Previous test can cover this change. - [ ] No code files have been changed. - [ ] Other reason <!-- Add your reason? --> - Behavior changed: - [ ] No. - [ ] Yes. <!-- Explain the behavior change --> - Does this need documentation? - [ ] No. - [ ] Yes. <!-- Add document PR link here. eg: https://github.com/apache/doris-website/pull/1214 --> ### Check List (For Reviewer who merge this PR) - [ ] Confirm the release note - [ ] Confirm test cases - [ ] Confirm document - [ ] Add branch pick label <!-- Add branch pick label that this PR should merge into --> --- be/src/vec/columns/column_string.cpp | 20 ++++---- be/src/vec/columns/column_string.h | 9 ++-- .../query_p1/str64_serialize/str64_serialize.out | 7 +++ .../str64_serialize/str64_serialize.groovy | 57 ++++++++++++++++++++++ 4 files changed, 80 insertions(+), 13 deletions(-) diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp index 6eb3e45b2e0..8bcc6565467 100644 --- a/be/src/vec/columns/column_string.cpp +++ b/be/src/vec/columns/column_string.cpp @@ -380,8 +380,8 @@ ColumnPtr ColumnStr<T>::permute(const IColumn::Permutation& perm, size_t limit) template <typename T> StringRef ColumnStr<T>::serialize_value_into_arena(size_t n, Arena& arena, char const*& begin) const { - uint32_t string_size(size_at(n)); - uint32_t offset(offset_at(n)); + auto string_size(size_at(n)); + auto offset(offset_at(n)); StringRef res; res.size = sizeof(string_size) + string_size; @@ -395,7 +395,7 @@ StringRef ColumnStr<T>::serialize_value_into_arena(size_t n, Arena& arena, template <typename T> const char* ColumnStr<T>::deserialize_and_insert_from_arena(const char* pos) { - const uint32_t string_size = unaligned_load<uint32_t>(pos); + const auto string_size = unaligned_load<uint32_t>(pos); pos += sizeof(string_size); const size_t old_size = chars.size(); @@ -413,7 +413,7 @@ size_t ColumnStr<T>::get_max_row_byte_size() const { size_t max_size = 0; size_t num_rows = offsets.size(); for (size_t i = 0; i < num_rows; ++i) { - max_size = std::max(max_size, size_at(i)); + max_size = std::max(max_size, size_t(size_at(i))); } return max_size + sizeof(uint32_t); @@ -423,8 +423,8 @@ template <typename T> void ColumnStr<T>::serialize_vec(std::vector<StringRef>& keys, size_t num_rows, size_t max_row_byte_size) const { for (size_t i = 0; i < num_rows; ++i) { - uint32_t offset(offset_at(i)); - uint32_t string_size(size_at(i)); + auto offset(offset_at(i)); + auto string_size(size_at(i)); auto* ptr = const_cast<char*>(keys[i].data + keys[i].size); memcpy_fixed<uint32_t>(ptr, (char*)&string_size); @@ -450,7 +450,7 @@ void ColumnStr<T>::serialize_vec_with_null_map(std::vector<StringRef>& keys, siz UInt32 offset(offset_at(i)); UInt32 string_size(size_at(i)); - memcpy_fixed<UInt32>(dest + 1, (char*)&string_size); + memcpy_fixed<uint32_t>(dest + 1, (char*)&string_size); memcpy(dest + 1 + sizeof(string_size), &chars[offset], string_size); keys[i].size += sizeof(string_size) + string_size + sizeof(UInt8); } else { @@ -467,7 +467,7 @@ void ColumnStr<T>::serialize_vec_with_null_map(std::vector<StringRef>& keys, siz UInt32 offset(offset_at(i)); UInt32 string_size(size_at(i)); - memcpy_fixed<UInt32>(dest + 1, (char*)&string_size); + memcpy_fixed<uint32_t>(dest + 1, (char*)&string_size); memcpy(dest + 1 + sizeof(string_size), &chars[offset], string_size); keys[i].size += sizeof(string_size) + string_size + sizeof(UInt8); } @@ -477,7 +477,7 @@ void ColumnStr<T>::serialize_vec_with_null_map(std::vector<StringRef>& keys, siz template <typename T> void ColumnStr<T>::deserialize_vec(std::vector<StringRef>& keys, const size_t num_rows) { for (size_t i = 0; i != num_rows; ++i) { - auto original_ptr = keys[i].data; + const auto* original_ptr = keys[i].data; keys[i].data = deserialize_and_insert_from_arena(original_ptr); keys[i].size -= keys[i].data - original_ptr; } @@ -488,7 +488,7 @@ void ColumnStr<T>::deserialize_vec_with_null_map(std::vector<StringRef>& keys, const size_t num_rows, const uint8_t* null_map) { for (size_t i = 0; i != num_rows; ++i) { if (null_map[i] == 0) { - auto original_ptr = keys[i].data; + const auto* original_ptr = keys[i].data; keys[i].data = deserialize_and_insert_from_arena(original_ptr); keys[i].size -= keys[i].data - original_ptr; } else { diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index 906f62b52aa..e696b6f0764 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -87,8 +87,11 @@ private: size_t ALWAYS_INLINE offset_at(ssize_t i) const { return offsets[i - 1]; } - /// Size of i-th element, including terminating zero. - size_t ALWAYS_INLINE size_at(ssize_t i) const { return offsets[i] - offsets[i - 1]; } + // Size of i-th element, including terminating zero. + // assume that the length of a single element is less than 32-bit + uint32_t ALWAYS_INLINE size_at(ssize_t i) const { + return uint32_t(offsets[i] - offsets[i - 1]); + } template <bool positive> struct less; @@ -373,7 +376,7 @@ public: for (size_t i = start_index; i < start_index + num; i++) { int32_t codeword = data_array[i]; - auto& src = dict[codeword]; + const auto& src = dict[codeword]; memcpy(chars.data() + old_size, src.data, src.size); old_size += src.size; } diff --git a/regression-test/data/query_p1/str64_serialize/str64_serialize.out b/regression-test/data/query_p1/str64_serialize/str64_serialize.out new file mode 100644 index 00000000000..99c168b99d4 --- /dev/null +++ b/regression-test/data/query_p1/str64_serialize/str64_serialize.out @@ -0,0 +1,7 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !test -- +50000000 + +-- !test -- +50000000 + diff --git a/regression-test/suites/query_p1/str64_serialize/str64_serialize.groovy b/regression-test/suites/query_p1/str64_serialize/str64_serialize.groovy new file mode 100644 index 00000000000..b0e3ffa99e9 --- /dev/null +++ b/regression-test/suites/query_p1/str64_serialize/str64_serialize.groovy @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("str64_serialize") { + + sql """ DROP TABLE IF EXISTS d_table; """ + sql """ DROP TABLE IF EXISTS d_table2; """ + + + sql """ + create table d_table ( + k1 int null, + k2 int not null, + k3 bigint null, + k4 varchar(100) null + ) + duplicate key (k1,k2,k3) + distributed BY hash(k1) buckets 3 + properties("replication_num" = "1"); + """ + sql """ + create table d_table2 ( + k1 int null, + k2 int not null, + k3 bigint null, + k4 varchar(100) null + ) + duplicate key (k1,k2,k3) + distributed BY hash(k1) buckets 3 + properties("replication_num" = "1"); + """ + + sql """insert into d_table select 1,1,1,'1234567890abcdefghigalsdhaluihdicandejionxaoxwdeuhwenudzmwoedxneiowdxiowedjxneiowdjixoneiiexdnuiexef' from (select 1 k1) as t lateral view explode_numbers(50000000) tmp1 as e1; +""" + + sql """insert into d_table2 select 1,1,1,'1234567890abcdefghigalsdhaluihdicandejionxaoxwdeuhwenudzmwoedxneiowdxiowedjxneiowdjixoneiiexdnuiexef'; +""" + sql "set parallel_pipeline_task_num=1;" + + qt_test "select /*+ LEADING(a,b) */ count(*) from d_table as a, d_table2 as b where a.k4=b.k4 and a.k1=b.k1;" + qt_test "select /*+ LEADING(b,a) */ count(*) from d_table as a, d_table2 as b where a.k4=b.k4 and a.k1=b.k1;" +} + --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org