This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.1 by this push:
new 3ebad14ec8d branch-4.1: [fix](serde) Support large string arrow
builder for variant serde (#64048)
3ebad14ec8d is described below
commit 3ebad14ec8d92ec8592e35e2cf107b2322875c3d
Author: lihangyu <[email protected]>
AuthorDate: Wed Jun 3 14:33:13 2026 +0800
branch-4.1: [fix](serde) Support large string arrow builder for variant
serde (#64048)
cherry-pick #63718
---
.../data_type_serde/data_type_variant_serde.cpp | 53 +++++++++++++++-------
.../core/data_type_serde/data_type_serde_test.cpp | 25 ++++++++++
2 files changed, 62 insertions(+), 16 deletions(-)
diff --git a/be/src/core/data_type_serde/data_type_variant_serde.cpp b/be/src/core/data_type_serde/data_type_variant_serde.cpp
index 8692136cbca..efd35971dcb 100644
--- a/be/src/core/data_type_serde/data_type_variant_serde.cpp
+++ b/be/src/core/data_type_serde/data_type_variant_serde.cpp
@@ -17,6 +17,8 @@
#include "core/data_type_serde/data_type_variant_serde.h"
+#include <arrow/array/builder_binary.h>
+
#include <cstdint>
#include <string>
@@ -37,6 +39,32 @@
#include "util/jsonb_writer.h"
namespace doris {
+namespace {
+
+template <typename BuilderType>
+Status write_variant_column_to_arrow_impl(const IColumn& column, const ColumnVariant& var,
+ const NullMap* null_map, BuilderType& builder,
+ int64_t start, int64_t end, const cctz::time_zone& ctz) {
+ DataTypeSerDe::FormatOptions options;
+ options.timezone = &ctz;
+ for (int64_t i = start; i < end; ++i) {
+ if (null_map && (*null_map)[cast_set<size_t>(i)]) {
+ RETURN_IF_ERROR(checkArrowStatus(builder.AppendNull(), column.get_name(),
+ builder.type()->name()));
+ continue;
+ }
+
+ std::string serialized_value;
+ var.serialize_one_row_to_string(i, &serialized_value, options);
+ const auto serialized_size =
+ cast_set<typename BuilderType::offset_type>(serialized_value.size());
+ RETURN_IF_ERROR(checkArrowStatus(builder.Append(serialized_value.data(), serialized_size),
+ column.get_name(), builder.type()->name()));
+ }
+ return Status::OK();
+}
+
+} // namespace
#include "common/compile_check_begin.h"
@@ -130,23 +158,16 @@ Status DataTypeVariantSerDe::write_column_to_arrow(const IColumn& column, const
int64_t start, int64_t end,
const cctz::time_zone& ctz) const {
const auto* var = check_and_get_column<ColumnVariant>(column);
- auto& builder = assert_cast<arrow::StringBuilder&>(*array_builder);
- FormatOptions options;
- options.timezone = &ctz;
- for (size_t i = start; i < end; ++i) {
- if (null_map && (*null_map)[i]) {
- RETURN_IF_ERROR(checkArrowStatus(builder.AppendNull(), column.get_name(),
- array_builder->type()->name()));
- } else {
- std::string serialized_value;
- var->serialize_one_row_to_string(i, &serialized_value, options);
- RETURN_IF_ERROR(
- checkArrowStatus(builder.Append(serialized_value.data(),
- static_cast<int>(serialized_value.size())),
- column.get_name(), array_builder->type()->name()));
- }
+ if (array_builder->type()->id() == arrow::Type::LARGE_STRING) {
+ auto& builder = assert_cast<arrow::LargeStringBuilder&>(*array_builder);
+ return write_variant_column_to_arrow_impl(column, *var, null_map, builder, start, end, ctz);
+ } else if (array_builder->type()->id() == arrow::Type::STRING) {
+ auto& builder = assert_cast<arrow::StringBuilder&>(*array_builder);
+ return write_variant_column_to_arrow_impl(column, *var, null_map, builder, start, end, ctz);
+ } else {
+ return Status::InvalidArgument("Unsupported arrow type for variant column: {}",
+ array_builder->type()->name());
}
- return Status::OK();
}
void DataTypeVariantSerDe::to_string(const IColumn& column, size_t row_num, BufferWritable& bw,
diff --git a/be/test/core/data_type_serde/data_type_serde_test.cpp b/be/test/core/data_type_serde/data_type_serde_test.cpp
index 9e402179a63..6cfb0cb4d10 100644
--- a/be/test/core/data_type_serde/data_type_serde_test.cpp
+++ b/be/test/core/data_type_serde/data_type_serde_test.cpp
@@ -18,6 +18,7 @@
#include "core/data_type_serde/data_type_serde.h"
+#include <arrow/api.h>
#include <gen_cpp/types.pb.h>
#include <gtest/gtest-message.h>
#include <gtest/gtest-test-part.h>
@@ -48,6 +49,7 @@
#include "core/data_type/data_type_number.h"
#include "core/data_type/data_type_quantilestate.h"
#include "core/data_type/data_type_string.h"
+#include "core/data_type/data_type_variant.h"
#include "core/types.h"
#include "core/value/bitmap_value.h"
#include "core/value/hll.h"
@@ -600,4 +602,27 @@ TEST(DataTypeSerDeTest, DeserializeFromSparseColumnTest) {
EXPECT_EQ(subcolumn.get_least_common_base_type_id(), PrimitiveType::TYPE_JSONB);
}
}
+
+TEST(DataTypeSerDeTest, VariantWriteColumnToArrowSupportsLargeString) {
+ auto variant_column = ColumnVariant::create(0, false);
+ VariantMap root;
+ root.try_emplace(PathInData(), FieldWithDataType {.field = Field::create_field<TYPE_STRING>(
+ String("variant value", 13))});
+ variant_column->try_insert(Field::create_field<TYPE_VARIANT>(std::move(root)));
+
+ auto data_type = std::make_shared<DataTypeVariant>();
+ auto serde = data_type->get_serde(0);
+ arrow::LargeStringBuilder builder;
+ auto ctz = cctz::utc_time_zone();
+ auto st = serde->write_column_to_arrow(*variant_column, nullptr, &builder, 0,
+ variant_column->size(), ctz);
+ EXPECT_TRUE(st.ok()) << st.to_string();
+
+ std::shared_ptr<arrow::Array> array;
+ ASSERT_TRUE(builder.Finish(&array).ok());
+ auto* string_array = dynamic_cast<arrow::LargeStringArray*>(array.get());
+ ASSERT_NE(string_array, nullptr);
+ ASSERT_EQ(string_array->length(), 1);
+ EXPECT_EQ(string_array->GetString(0), "variant value");
+}
} // namespace doris
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]