This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.1 by this push:
     new 3ebad14ec8d branch-4.1: [fix](serde) Support large string arrow builder for variant serde (#64048)
3ebad14ec8d is described below

commit 3ebad14ec8d92ec8592e35e2cf107b2322875c3d
Author: lihangyu <[email protected]>
AuthorDate: Wed Jun 3 14:33:13 2026 +0800

    branch-4.1: [fix](serde) Support large string arrow builder for variant serde (#64048)
    
    cherry-pick #63718
---
 .../data_type_serde/data_type_variant_serde.cpp    | 53 +++++++++++++++-------
 .../core/data_type_serde/data_type_serde_test.cpp  | 25 ++++++++++
 2 files changed, 62 insertions(+), 16 deletions(-)

diff --git a/be/src/core/data_type_serde/data_type_variant_serde.cpp b/be/src/core/data_type_serde/data_type_variant_serde.cpp
index 8692136cbca..efd35971dcb 100644
--- a/be/src/core/data_type_serde/data_type_variant_serde.cpp
+++ b/be/src/core/data_type_serde/data_type_variant_serde.cpp
@@ -17,6 +17,8 @@
 
 #include "core/data_type_serde/data_type_variant_serde.h"
 
+#include <arrow/array/builder_binary.h>
+
 #include <cstdint>
 #include <string>
 
@@ -37,6 +39,32 @@
 #include "util/jsonb_writer.h"
 
 namespace doris {
+namespace {
+
+template <typename BuilderType>
+Status write_variant_column_to_arrow_impl(const IColumn& column, const 
ColumnVariant& var,
+                                          const NullMap* null_map, 
BuilderType& builder,
+                                          int64_t start, int64_t end, const 
cctz::time_zone& ctz) {
+    DataTypeSerDe::FormatOptions options;
+    options.timezone = &ctz;
+    for (int64_t i = start; i < end; ++i) {
+        if (null_map && (*null_map)[cast_set<size_t>(i)]) {
+            RETURN_IF_ERROR(checkArrowStatus(builder.AppendNull(), 
column.get_name(),
+                                             builder.type()->name()));
+            continue;
+        }
+
+        std::string serialized_value;
+        var.serialize_one_row_to_string(i, &serialized_value, options);
+        const auto serialized_size =
+                cast_set<typename 
BuilderType::offset_type>(serialized_value.size());
+        
RETURN_IF_ERROR(checkArrowStatus(builder.Append(serialized_value.data(), 
serialized_size),
+                                         column.get_name(), 
builder.type()->name()));
+    }
+    return Status::OK();
+}
+
+} // namespace
 
 #include "common/compile_check_begin.h"
 
@@ -130,23 +158,16 @@ Status DataTypeVariantSerDe::write_column_to_arrow(const 
IColumn& column, const
                                                    int64_t start, int64_t end,
                                                    const cctz::time_zone& ctz) 
const {
     const auto* var = check_and_get_column<ColumnVariant>(column);
-    auto& builder = assert_cast<arrow::StringBuilder&>(*array_builder);
-    FormatOptions options;
-    options.timezone = &ctz;
-    for (size_t i = start; i < end; ++i) {
-        if (null_map && (*null_map)[i]) {
-            RETURN_IF_ERROR(checkArrowStatus(builder.AppendNull(), 
column.get_name(),
-                                             array_builder->type()->name()));
-        } else {
-            std::string serialized_value;
-            var->serialize_one_row_to_string(i, &serialized_value, options);
-            RETURN_IF_ERROR(
-                    checkArrowStatus(builder.Append(serialized_value.data(),
-                                                    
static_cast<int>(serialized_value.size())),
-                                     column.get_name(), 
array_builder->type()->name()));
-        }
+    if (array_builder->type()->id() == arrow::Type::LARGE_STRING) {
+        auto& builder = 
assert_cast<arrow::LargeStringBuilder&>(*array_builder);
+        return write_variant_column_to_arrow_impl(column, *var, null_map, 
builder, start, end, ctz);
+    } else if (array_builder->type()->id() == arrow::Type::STRING) {
+        auto& builder = assert_cast<arrow::StringBuilder&>(*array_builder);
+        return write_variant_column_to_arrow_impl(column, *var, null_map, 
builder, start, end, ctz);
+    } else {
+        return Status::InvalidArgument("Unsupported arrow type for variant 
column: {}",
+                                       array_builder->type()->name());
     }
-    return Status::OK();
 }
 
 void DataTypeVariantSerDe::to_string(const IColumn& column, size_t row_num, 
BufferWritable& bw,
diff --git a/be/test/core/data_type_serde/data_type_serde_test.cpp b/be/test/core/data_type_serde/data_type_serde_test.cpp
index 9e402179a63..6cfb0cb4d10 100644
--- a/be/test/core/data_type_serde/data_type_serde_test.cpp
+++ b/be/test/core/data_type_serde/data_type_serde_test.cpp
@@ -18,6 +18,7 @@
 
 #include "core/data_type_serde/data_type_serde.h"
 
+#include <arrow/api.h>
 #include <gen_cpp/types.pb.h>
 #include <gtest/gtest-message.h>
 #include <gtest/gtest-test-part.h>
@@ -48,6 +49,7 @@
 #include "core/data_type/data_type_number.h"
 #include "core/data_type/data_type_quantilestate.h"
 #include "core/data_type/data_type_string.h"
+#include "core/data_type/data_type_variant.h"
 #include "core/types.h"
 #include "core/value/bitmap_value.h"
 #include "core/value/hll.h"
@@ -600,4 +602,27 @@ TEST(DataTypeSerDeTest, DeserializeFromSparseColumnTest) {
         EXPECT_EQ(subcolumn.get_least_common_base_type_id(), 
PrimitiveType::TYPE_JSONB);
     }
 }
+
+TEST(DataTypeSerDeTest, VariantWriteColumnToArrowSupportsLargeString) {
+    auto variant_column = ColumnVariant::create(0, false);
+    VariantMap root;
+    root.try_emplace(PathInData(), FieldWithDataType {.field = 
Field::create_field<TYPE_STRING>(
+                                                              String("variant 
value", 13))});
+    
variant_column->try_insert(Field::create_field<TYPE_VARIANT>(std::move(root)));
+
+    auto data_type = std::make_shared<DataTypeVariant>();
+    auto serde = data_type->get_serde(0);
+    arrow::LargeStringBuilder builder;
+    auto ctz = cctz::utc_time_zone();
+    auto st = serde->write_column_to_arrow(*variant_column, nullptr, &builder, 
0,
+                                           variant_column->size(), ctz);
+    EXPECT_TRUE(st.ok()) << st.to_string();
+
+    std::shared_ptr<arrow::Array> array;
+    ASSERT_TRUE(builder.Finish(&array).ok());
+    auto* string_array = dynamic_cast<arrow::LargeStringArray*>(array.get());
+    ASSERT_NE(string_array, nullptr);
+    ASSERT_EQ(string_array->length(), 1);
+    EXPECT_EQ(string_array->GetString(0), "variant value");
+}
 } // namespace doris


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to