This is an automated email from the ASF dual-hosted git repository.
dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 2b3f8eef6ee [Improvement](column) add santy check and add some fix
for ColumnString #47964 (#48513)
2b3f8eef6ee is described below
commit 2b3f8eef6ee3050664d739cbb0071be177f9f0b8
Author: Pxl <[email protected]>
AuthorDate: Thu Mar 20 14:32:01 2025 +0800
[Improvement](column) add santy check and add some fix for ColumnString
#47964 (#48513)
pick part of https://github.com/apache/doris/pull/47964
---
be/src/vec/columns/column_string.cpp | 56 +++++++++++++++++++-----
be/src/vec/columns/column_string.h | 12 +++++
be/src/vec/functions/date_time_transforms.h | 2 +
be/src/vec/functions/function_decode_varchar.cpp | 2 +-
be/src/vec/functions/function_ip.h | 2 +
be/src/vec/functions/function_uuid.cpp | 1 +
6 files changed, 62 insertions(+), 13 deletions(-)
diff --git a/be/src/vec/columns/column_string.cpp
b/be/src/vec/columns/column_string.cpp
index 8bcc6565467..b61e28ce3f8 100644
--- a/be/src/vec/columns/column_string.cpp
+++ b/be/src/vec/columns/column_string.cpp
@@ -36,20 +36,30 @@ namespace doris::vectorized {
template <typename T>
void ColumnStr<T>::sanity_check() const {
- auto count = offsets.size();
+#ifndef NDEBUG
+ sanity_check_simple();
+ auto count = (int)offsets.size();
+ for (int i = 0; i < count; ++i) {
+ if (offsets[i] < offsets[i - 1]) {
+ throw Exception(Status::InternalError("row count: {}, offsets[{}]:
{}, offsets[{}]: {}",
+ count, i, offsets[i], i - 1,
offsets[i - 1]));
+ }
+ }
+#endif
+}
+
+template <typename T>
+void ColumnStr<T>::sanity_check_simple() const {
+#ifndef NDEBUG
+ auto count = (int)offsets.size();
if (chars.size() != offsets[count - 1]) {
- LOG(FATAL) << "row count: " << count << ", chars.size(): " <<
chars.size() << ", offset["
- << count - 1 << "]: " << offsets[count - 1];
+ throw Exception(Status::InternalError("row count: {}, chars.size():
{}, offset[{}]: {}",
+ count, chars.size(), count - 1,
offsets[count - 1]));
}
if (offsets[-1] != 0) {
- LOG(FATAL) << "wrong offsets[-1]: " << offsets[-1];
- }
- for (size_t i = 0; i < count; ++i) {
- if (offsets[i] < offsets[i - 1]) {
- LOG(FATAL) << "row count: " << count << ", offsets[" << i << "]: "
<< offsets[i]
- << ", offsets[" << i - 1 << "]: " << offsets[i - 1];
- }
+ throw Exception(Status::InternalError("wrong offsets[-1]: {}",
offsets[-1]));
}
+#endif
}
template <typename T>
@@ -77,6 +87,8 @@ MutableColumnPtr ColumnStr<T>::clone_resized(size_t to_size)
const {
res->offsets.resize_fill(to_size, chars.size());
}
+ res->sanity_check_simple();
+
return res;
}
@@ -135,6 +147,14 @@ void ColumnStr<T>::insert_range_from_ignore_overflow(const
doris::vectorized::IC
src_concrete.offsets[start + i] - nested_offset +
prev_max_offset;
}
}
+#ifndef NDEBUG
+ auto count = int64_t(offsets.size());
+ // offsets may overflow, so we make chars.size() as T to do same overflow
check
+ if (offsets.back() != T(chars.size())) {
+ throw Exception(Status::InternalError("row count: {}, chars.size():
{}, offset[{}]: {}",
+ count, chars.size(), count - 1,
offsets[count - 1]));
+ }
+#endif
}
template <typename T>
@@ -178,6 +198,7 @@ void ColumnStr<T>::insert_range_from(const IColumn& src,
size_t start, size_t le
} else {
do_insert(assert_cast<const ColumnStr<uint32_t>&>(src));
}
+ sanity_check_simple();
}
template <typename T>
@@ -244,6 +265,7 @@ void ColumnStr<T>::insert_indices_from(const IColumn& src,
const uint32_t* indic
} else {
do_insert(assert_cast<const ColumnStr<uint32_t>&>(src));
}
+ sanity_check_simple();
}
template <typename T>
@@ -297,7 +319,9 @@ size_t ColumnStr<T>::filter(const IColumn::Filter& filter) {
}
if constexpr (std::is_same_v<UInt32, T>) {
- return filter_arrays_impl<UInt8, IColumn::Offset>(chars, offsets,
filter);
+ auto res = filter_arrays_impl<UInt8, IColumn::Offset>(chars, offsets,
filter);
+ sanity_check_simple();
+ return res;
} else {
throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
"should not call filter in ColumnStr<UInt64>");
@@ -373,7 +397,7 @@ ColumnPtr ColumnStr<T>::permute(const IColumn::Permutation&
perm, size_t limit)
current_new_offset += string_size;
res_offsets[i] = current_new_offset;
}
-
+ sanity_check_simple();
return res;
}
@@ -405,6 +429,7 @@ const char*
ColumnStr<T>::deserialize_and_insert_from_arena(const char* pos) {
memcpy(chars.data() + old_size, pos, string_size);
offsets.push_back(new_size);
+ sanity_check_simple();
return pos + string_size;
}
@@ -495,6 +520,7 @@ void
ColumnStr<T>::deserialize_vec_with_null_map(std::vector<StringRef>& keys,
insert_default();
}
}
+ sanity_check_simple();
}
template <typename T>
@@ -558,6 +584,7 @@ ColumnPtr ColumnStr<T>::replicate(const IColumn::Offsets&
replicate_offsets) con
res_chars.resize(res_chars.size() + string_size);
memcpy_small_allow_read_write_overflow15(&res_chars[res_chars.size() -
string_size],
&chars[prev_string_offset], string_size);
+ check_chars_length(res_chars.size(), res_offsets.size());
}
prev_replicate_offset = replicate_offsets[i];
@@ -565,6 +592,7 @@ ColumnPtr ColumnStr<T>::replicate(const IColumn::Offsets&
replicate_offsets) con
}
check_chars_length(res_chars.size(), res_offsets.size());
+ sanity_check_simple();
return res;
}
@@ -572,6 +600,7 @@ template <typename T>
void ColumnStr<T>::reserve(size_t n) {
offsets.reserve(n);
chars.reserve(n);
+ sanity_check_simple();
}
template <typename T>
@@ -579,9 +608,11 @@ void ColumnStr<T>::resize(size_t n) {
auto origin_size = size();
if (origin_size > n) {
offsets.resize(n);
+ chars.resize(offsets[n - 1]);
} else if (origin_size < n) {
insert_many_defaults(n - origin_size);
}
+ sanity_check_simple();
}
template <typename T>
@@ -637,6 +668,7 @@ ColumnPtr ColumnStr<T>::convert_column_if_overflow() {
}
offsets.clear();
+ new_col->sanity_check_simple();
return new_col;
}
return this->get_ptr();
diff --git a/be/src/vec/columns/column_string.h
b/be/src/vec/columns/column_string.h
index e696b6f0764..33afabfc576 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -108,6 +108,7 @@ public:
bool is_variable_length() const override { return true; }
// used in string ut testd
void sanity_check() const;
+ void sanity_check_simple() const;
const char* get_family_name() const override { return "String"; }
size_t size() const override { return offsets.size(); }
@@ -162,6 +163,7 @@ public:
chars.resize(new_size);
memcpy(chars.data() + old_size, s.data, size_to_append);
offsets.push_back(new_size);
+ sanity_check_simple();
}
void insert_many_from(const IColumn& src, size_t position, size_t length)
override;
@@ -188,6 +190,7 @@ public:
size_to_append);
offsets.push_back(new_size);
}
+ sanity_check_simple();
}
void insert_data(const char* pos, size_t length) override {
@@ -200,6 +203,7 @@ public:
memcpy(chars.data() + old_size, pos, length);
}
offsets.push_back(new_size);
+ sanity_check_simple();
}
void insert_data_without_reserve(const char* pos, size_t length) {
@@ -212,6 +216,7 @@ public:
memcpy(chars.data() + old_size, pos, length);
}
offsets.push_back_without_reserve(new_size);
+ sanity_check_simple();
}
/// Before insert strings, the caller should calculate the total size of
strings,
@@ -245,6 +250,7 @@ public:
}
check_chars_length(offset, offsets.size());
chars.resize(offset);
+ sanity_check_simple();
}
void insert_many_continuous_binary_data(const char* data, const uint32_t*
offsets_,
@@ -270,6 +276,7 @@ public:
offsets_ptr[i] = tail_offset + offsets_[i + 1] - begin_offset;
}
DCHECK(chars.size() == offsets.back());
+ sanity_check_simple();
}
void insert_many_binary_data(char* data_array, uint32_t* len_array,
@@ -293,6 +300,7 @@ public:
offset += len;
offsets.push_back(offset);
}
+ sanity_check_simple();
}
void insert_many_strings(const StringRef* strings, size_t num) override {
@@ -315,6 +323,7 @@ public:
}
offsets.push_back(offset);
}
+ sanity_check_simple();
}
template <size_t copy_length>
@@ -339,6 +348,7 @@ public:
offsets.push_back(offset);
}
chars.resize(old_size + new_size);
+ sanity_check_simple();
}
void insert_many_strings_overflow(const StringRef* strings, size_t num,
@@ -380,12 +390,14 @@ public:
memcpy(chars.data() + old_size, src.data, src.size);
old_size += src.size;
}
+ sanity_check_simple();
}
void pop_back(size_t n) override {
size_t nested_n = offsets.back() - offset_at(offsets.size() - n);
chars.resize(chars.size() - nested_n);
offsets.resize_assume_reserved(offsets.size() - n);
+ sanity_check_simple();
}
StringRef serialize_value_into_arena(size_t n, Arena& arena, char const*&
begin) const override;
diff --git a/be/src/vec/functions/date_time_transforms.h
b/be/src/vec/functions/date_time_transforms.h
index 73155afae3a..33accd9f706 100644
--- a/be/src/vec/functions/date_time_transforms.h
+++ b/be/src/vec/functions/date_time_transforms.h
@@ -316,6 +316,7 @@ struct TransformerToStringOneArgument {
res_offsets[i] = Transform::execute(date_time_value, res_data,
offset);
null_map[i] = !date_time_value.is_valid_date();
}
+ res_data.resize(res_offsets[res_offsets.size() - 1]);
}
static void vector(FunctionContext* context,
@@ -334,6 +335,7 @@ struct TransformerToStringOneArgument {
res_offsets[i] = Transform::execute(date_time_value, res_data,
offset);
DCHECK(date_time_value.is_valid_date());
}
+ res_data.resize(res_offsets[res_offsets.size() - 1]);
}
};
diff --git a/be/src/vec/functions/function_decode_varchar.cpp
b/be/src/vec/functions/function_decode_varchar.cpp
index 59f7ecfac04..c4aabc92eb1 100644
--- a/be/src/vec/functions/function_decode_varchar.cpp
+++ b/be/src/vec/functions/function_decode_varchar.cpp
@@ -106,7 +106,7 @@ public:
simd::reverse_copy_bytes(col_res_data.data() + col_res_offset[i -
1], str_size,
ui8_ptr + sizeof(IntegerType) - str_size,
str_size);
}
-
+ col_res_data.resize(col_res_offset[col_res_offset.size() - 1]);
block.get_by_position(result).column = std::move(col_res);
return Status::OK();
diff --git a/be/src/vec/functions/function_ip.h
b/be/src/vec/functions/function_ip.h
index 00b43955372..9c4737c84a7 100644
--- a/be/src/vec/functions/function_ip.h
+++ b/be/src/vec/functions/function_ip.h
@@ -335,6 +335,7 @@ public:
process_ipv6_column<ColumnString>(column, input_rows_count,
vec_res, offsets_res,
null_map, ipv6_address_data);
}
+ vec_res.resize(offsets_res[offsets_res.size() - 1]);
block.replace_by_position(result,
ColumnNullable::create(std::move(col_res),
std::move(null_map)));
@@ -1388,6 +1389,7 @@ public:
cut_address(address, pos, bytes_to_cut_count);
offsets_res[i] = pos - begin;
}
+ chars_res.resize(offsets_res[offsets_res.size() - 1]);
block.replace_by_position(result, std::move(col_res));
return Status::OK();
diff --git a/be/src/vec/functions/function_uuid.cpp
b/be/src/vec/functions/function_uuid.cpp
index cee5fd7a363..bc4dec00705 100644
--- a/be/src/vec/functions/function_uuid.cpp
+++ b/be/src/vec/functions/function_uuid.cpp
@@ -180,6 +180,7 @@ public:
col_offset[row] = col_offset[row - 1] + str_length;
deserialize((char*)arg, col_data.data() + str_length * row);
}
+ col_data.resize(str_length * input_rows_count);
block.replace_by_position(result, std::move(result_column));
return Status::OK();
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]