This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new d59c0302524 [branch-4.0][fix](parquet) Fix wrong condition #63509
(#63739)
d59c0302524 is described below
commit d59c03025245dea81f888e88335614895b70be56
Author: Gabriel <[email protected]>
AuthorDate: Thu May 28 14:08:05 2026 +0800
[branch-4.0][fix](parquet) Fix wrong condition #63509 (#63739)
### What problem does this PR solve?
pick #63509
---
.../format/parquet/byte_array_dict_decoder.cpp | 29 +++++++++++----
.../format/parquet/byte_array_plain_decoder.cpp | 42 ++++++++++-----------
.../parquet/byte_array_dict_decoder_test.cpp | 17 +++++++++
.../parquet/byte_array_plain_decoder_test.cpp | 43 ++++++++++++++++++++++
.../tvf/test_hdfs_parquet_group6.groovy | 4 +-
5 files changed, 105 insertions(+), 30 deletions(-)
diff --git a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp
b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp
index 49ab5cd584b..efb0c262d6a 100644
--- a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp
+++ b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp
@@ -36,13 +36,24 @@ Status
ByteArrayDictDecoder::set_dict(DorisUniqueBufferPtr<uint8_t>& dict, int32
return Status::Corruption("Wrong dictionary data for byte array type,
dict is null.");
}
_dict_items.reserve(num_values);
- uint32_t offset_cursor = 0;
+ if (UNLIKELY(length < 0)) {
+ return Status::Corruption("Wrong data length in dictionary");
+ }
+ const size_t dict_length = cast_set<size_t>(length);
+ size_t offset_cursor = 0;
char* dict_item_address = reinterpret_cast<char*>(_dict.get());
size_t total_length = 0;
for (int i = 0; i < num_values; ++i) {
+ if (UNLIKELY(offset_cursor > dict_length ||
+ dict_length - offset_cursor < sizeof(uint32_t))) {
+ return Status::Corruption("Wrong data length in dictionary");
+ }
uint32_t l = decode_fixed32_le(_dict.get() + offset_cursor);
- offset_cursor += 4;
+ offset_cursor += sizeof(uint32_t);
+ if (UNLIKELY(l > dict_length - offset_cursor)) {
+ return Status::Corruption("Wrong data length in dictionary");
+ }
offset_cursor += l;
total_length += l;
}
@@ -53,20 +64,24 @@ Status
ByteArrayDictDecoder::set_dict(DorisUniqueBufferPtr<uint8_t>& dict, int32
size_t offset = 0;
offset_cursor = 0;
for (int i = 0; i < num_values; ++i) {
+ if (UNLIKELY(offset_cursor > dict_length ||
+ dict_length - offset_cursor < sizeof(uint32_t))) {
+ return Status::Corruption("Wrong data length in dictionary");
+ }
uint32_t l = decode_fixed32_le(_dict.get() + offset_cursor);
- offset_cursor += 4;
+ offset_cursor += sizeof(uint32_t);
+ if (UNLIKELY(l > dict_length - offset_cursor)) {
+ return Status::Corruption("Wrong data length in dictionary");
+ }
memcpy(&_dict_data[offset], dict_item_address + offset_cursor, l);
_dict_items.emplace_back(&_dict_data[offset], l);
offset_cursor += l;
offset += l;
- if (offset_cursor > length) {
- return Status::Corruption("Wrong data length in dictionary");
- }
if (l > _max_value_length) {
_max_value_length = l;
}
}
- if (offset_cursor != length) {
+ if (offset_cursor != dict_length) {
return Status::Corruption("Wrong dictionary data for byte array type");
}
return Status::OK();
diff --git a/be/src/vec/exec/format/parquet/byte_array_plain_decoder.cpp
b/be/src/vec/exec/format/parquet/byte_array_plain_decoder.cpp
index 7092a4fb292..9f3b0c6491d 100644
--- a/be/src/vec/exec/format/parquet/byte_array_plain_decoder.cpp
+++ b/be/src/vec/exec/format/parquet/byte_array_plain_decoder.cpp
@@ -26,15 +26,23 @@
namespace doris::vectorized {
#include "common/compile_check_begin.h"
+
+namespace {
+Status read_length(const Slice* data, uint32_t* offset, uint32_t* length) {
+ if (UNLIKELY(*offset > data->size || data->size - *offset <
sizeof(uint32_t))) {
+ return Status::IOError("Can't read byte array length from plain
decoder");
+ }
+ *length = decode_fixed32_le(reinterpret_cast<const uint8_t*>(data->data) +
*offset);
+ *offset += sizeof(uint32_t);
+ return Status::OK();
+}
+} // namespace
+
Status ByteArrayPlainDecoder::skip_values(size_t num_values) {
for (int i = 0; i < num_values; ++i) {
- if (UNLIKELY(_offset + 4 > _data->size)) {
- return Status::IOError("Can't read byte array length from plain
decoder");
- }
- uint32_t length =
- decode_fixed32_le(reinterpret_cast<const
uint8_t*>(_data->data) + _offset);
- _offset += 4;
- if (UNLIKELY(_offset + length) > _data->size) {
+ uint32_t length = 0;
+ RETURN_IF_ERROR(read_length(_data, &_offset, &length));
+ if (UNLIKELY(_offset > _data->size || length > _data->size - _offset))
{
return Status::IOError("Can't skip enough bytes in plain decoder");
}
_offset += length;
@@ -63,13 +71,9 @@ Status
ByteArrayPlainDecoder::_decode_values(MutableColumnPtr& doris_column, Dat
std::vector<StringRef> string_values;
string_values.reserve(run_length);
for (size_t i = 0; i < run_length; ++i) {
- if (UNLIKELY(_offset + 4 > _data->size)) {
- return Status::IOError("Can't read byte array length from
plain decoder");
- }
- uint32_t length =
- decode_fixed32_le(reinterpret_cast<const
uint8_t*>(_data->data) + _offset);
- _offset += 4;
- if (UNLIKELY(_offset + length) > _data->size) {
+ uint32_t length = 0;
+ RETURN_IF_ERROR(read_length(_data, &_offset, &length));
+ if (UNLIKELY(_offset > _data->size || length > _data->size -
_offset)) {
return Status::IOError("Can't read enough bytes in plain
decoder");
}
string_values.emplace_back(_data->data + _offset, length);
@@ -84,13 +88,9 @@ Status
ByteArrayPlainDecoder::_decode_values(MutableColumnPtr& doris_column, Dat
}
case ColumnSelectVector::FILTERED_CONTENT: {
for (int i = 0; i < run_length; ++i) {
- if (UNLIKELY(_offset + 4 > _data->size)) {
- return Status::IOError("Can't read byte array length from
plain decoder");
- }
- uint32_t length =
- decode_fixed32_le(reinterpret_cast<const
uint8_t*>(_data->data) + _offset);
- _offset += 4;
- if (UNLIKELY(_offset + length) > _data->size) {
+ uint32_t length = 0;
+ RETURN_IF_ERROR(read_length(_data, &_offset, &length));
+ if (UNLIKELY(_offset > _data->size || length > _data->size -
_offset)) {
return Status::IOError("Can't read enough bytes in plain
decoder");
}
_offset += length;
diff --git a/be/test/vec/exec/format/parquet/byte_array_dict_decoder_test.cpp
b/be/test/vec/exec/format/parquet/byte_array_dict_decoder_test.cpp
index c08d3c01df7..618e7e99193 100644
--- a/be/test/vec/exec/format/parquet/byte_array_dict_decoder_test.cpp
+++ b/be/test/vec/exec/format/parquet/byte_array_dict_decoder_test.cpp
@@ -511,4 +511,21 @@ TEST_F(ByteArrayDictDecoderTest, test_skip_value) {
}
}
+TEST_F(ByteArrayDictDecoderTest,
test_set_dict_rejects_truncated_length_prefix) {
+ auto dict_data = make_unique_buffer<uint8_t>(2);
+ dict_data[0] = 1;
+ dict_data[1] = 0;
+
+ ByteArrayDictDecoder decoder;
+ ASSERT_FALSE(decoder.set_dict(dict_data, 2, 1).ok());
+}
+
+TEST_F(ByteArrayDictDecoderTest,
test_set_dict_rejects_truncated_payload_after_prefix) {
+ auto dict_data = make_unique_buffer<uint8_t>(4);
+ encode_fixed32_le(dict_data.get(), 1);
+
+ ByteArrayDictDecoder decoder;
+ ASSERT_FALSE(decoder.set_dict(dict_data, 4, 1).ok());
+}
+
} // namespace doris::vectorized
diff --git a/be/test/vec/exec/format/parquet/byte_array_plain_decoder_test.cpp
b/be/test/vec/exec/format/parquet/byte_array_plain_decoder_test.cpp
index 2ae009ee362..38bcbec2568 100644
--- a/be/test/vec/exec/format/parquet/byte_array_plain_decoder_test.cpp
+++ b/be/test/vec/exec/format/parquet/byte_array_plain_decoder_test.cpp
@@ -244,4 +244,47 @@ TEST_F(ByteArrayPlainDecoderTest, test_skip_value) {
EXPECT_EQ(result_column->get_data_at(0).to_string(), "cherry");
}
+TEST_F(ByteArrayPlainDecoderTest, test_decode_truncated_length_prefix) {
+ uint8_t data[] = {0x01, 0x00};
+ _data_slice = Slice(data, sizeof(data));
+
+ ByteArrayPlainDecoder decoder;
+ ASSERT_TRUE(decoder.set_data(&_data_slice).ok());
+
+ MutableColumnPtr column = ColumnString::create();
+ DataTypePtr data_type = std::make_shared<DataTypeString>();
+ size_t num_values = 1;
+ std::vector<uint16_t> run_length_null_map(1, num_values);
+ std::vector<uint8_t> filter_data(num_values, 1);
+ FilterMap filter_map;
+ ASSERT_TRUE(filter_map.init(filter_data.data(), filter_data.size(),
false).ok());
+ ColumnSelectVector select_vector;
+ ASSERT_TRUE(select_vector.init(run_length_null_map, num_values, nullptr,
&filter_map, 0).ok());
+
+ ASSERT_FALSE(decoder.decode_values(column, data_type, select_vector,
false).ok());
+}
+
+TEST_F(ByteArrayPlainDecoderTest, test_decode_rejects_overflow_length) {
+ uint8_t data[8] = {};
+ encode_fixed32_le(data, 0);
+ encode_fixed32_le(data + 4, UINT32_MAX);
+ _data_slice = Slice(data, sizeof(data));
+
+ ByteArrayPlainDecoder decoder;
+ ASSERT_TRUE(decoder.set_data(&_data_slice).ok());
+ ASSERT_TRUE(decoder.skip_values(1).ok());
+
+ MutableColumnPtr column = ColumnString::create();
+ DataTypePtr data_type = std::make_shared<DataTypeString>();
+ size_t num_values = 1;
+ std::vector<uint16_t> run_length_null_map(1, num_values);
+ std::vector<uint8_t> filter_data(num_values, 1);
+ FilterMap filter_map;
+ ASSERT_TRUE(filter_map.init(filter_data.data(), filter_data.size(),
false).ok());
+ ColumnSelectVector select_vector;
+ ASSERT_TRUE(select_vector.init(run_length_null_map, num_values, nullptr,
&filter_map, 0).ok());
+
+ ASSERT_FALSE(decoder.decode_values(column, data_type, select_vector,
false).ok());
+}
+
} // namespace doris::vectorized
diff --git
a/regression-test/suites/external_table_p0/tvf/test_hdfs_parquet_group6.groovy
b/regression-test/suites/external_table_p0/tvf/test_hdfs_parquet_group6.groovy
index d9896ea68be..522ad648e5d 100644
---
a/regression-test/suites/external_table_p0/tvf/test_hdfs_parquet_group6.groovy
+++
b/regression-test/suites/external_table_p0/tvf/test_hdfs_parquet_group6.groovy
@@ -677,8 +677,8 @@
suite("test_hdfs_parquet_group6","external,hive,tvf,external_docker") {
sql """ select * from HDFS(
"uri" = "${uri}",
"hadoop.username" = "${hdfsUserName}",
- "format" = "parquet") limit 10; """
- exception "Can't read byte array length from plain decoder"
+ "format" = "parquet"); """
+ exception "Can't read enough bytes in plain decode"
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]