csun5285 commented on code in PR #63970:
URL: https://github.com/apache/doris/pull/63970#discussion_r3347510258


##########
be/src/exec/common/variant_util.cpp:
##########
@@ -956,13 +956,17 @@ Status 
VariantCompactionUtil::aggregate_variant_extended_info(
 void VariantCompactionUtil::get_subpaths(int32_t max_subcolumns_count,
                                          const PathToNoneNullValues& stats,
                                          TabletSchema::PathsSetInfo& 
paths_set_info) {
-    // max_subcolumns_count is 0 means no limit
-    if (max_subcolumns_count > 0 && stats.size() > max_subcolumns_count) {
-        std::vector<std::pair<size_t, std::string_view>> paths_with_sizes;
-        paths_with_sizes.reserve(stats.size());
-        for (const auto& [path, size] : stats) {
-            paths_with_sizes.emplace_back(size, path);
+    std::vector<std::pair<size_t, std::string_view>> paths_with_sizes;
+    paths_with_sizes.reserve(stats.size());

Review Comment:
   这里为什么改动了?不影响这里,typed_path 也没有 size。



##########
be/src/storage/segment/variant/variant_column_writer_impl.h:
##########
@@ -194,6 +194,8 @@ class VariantColumnWriterImpl {
                                 size_t num_rows, int& column_id);
     Status _process_subcolumns(ColumnVariant* ptr, OlapBlockDataConvertor* 
converter,
                                size_t num_rows, int& column_id);
+    Status _process_regular_doc_value_staging(ColumnVariant* ptr, 
OlapBlockDataConvertor* converter,

Review Comment:
   这个函数是处理稀疏列写入和 doc 写入吗?上面的 _process_subcolumns 还有用吗?



##########
be/src/storage/segment/variant/variant_column_writer_impl.cpp:
##########
@@ -556,6 +623,220 @@ Status append_sparse_converted_column(const TabletColumn& 
tablet_column, ColumnW
     converter->clear_source_content(cid);
     return Status::OK();
 }
+
+bool has_doc_value_data(const ColumnVariant& variant) {
+    if (variant.size() == 0) {
+        return false;
+    }
+    const auto& offsets = variant.serialized_doc_value_column_offsets();
+    return !offsets.empty() && offsets[variant.size() - 1] > 0;
+}
+
+enum class VariantPayloadWritePath {

Review Comment:
   这四个分别代表什么?



##########
be/src/storage/segment/variant/variant_column_writer_impl.cpp:
##########
@@ -254,39 +277,76 @@ void build_sparse_subcolumns(const ColumnVariant& 
variant, const DocValuePathSta
     }
 }
 
-SubcolumnWritePlan build_subcolumn_write_plan(const ColumnVariant& variant, 
size_t num_rows,
-                                              int64_t 
variant_doc_materialization_min_rows) {
-    SubcolumnWritePlan plan;
-    // Below threshold: skip materialization and let finalize() compute stats 
on demand.
-    if (num_rows < static_cast<size_t>(variant_doc_materialization_min_rows)) {
-        return plan;
+void set_doc_value_stats(const ColumnVariant& variant, const 
DocValuePathStats* precomputed_stats,
+                         DocValuePathStats* stats) {
+    if (precomputed_stats != nullptr) {
+        *stats = *precomputed_stats;
+    } else {
+        build_doc_value_stats(variant, stats);
     }
+}
 
+DocValueMaterializationMode choose_doc_value_materialization_mode(
+        const DocValueMaterializationOptions& options) {
+    if (options.selected_paths != nullptr) {
+        return DocValueMaterializationMode::SparseSelectedPaths;
+    }
     if (config::enable_variant_doc_sparse_write_subcolumns) {
-        build_doc_value_stats(variant, &plan.stats);
-        build_sparse_subcolumns(variant, plan.stats, &plan.sparse_subcolumns);
-        plan.entries.reserve(plan.sparse_subcolumns.size());
-        for (auto& [path, sparse] : plan.sparse_subcolumns) {
-            SubcolumnWriteEntry entry;
-            // StringRef points to variant storage; valid for the plan's 
lifetime.
-            entry.path = std::string_view(path.data, path.size);
-            entry.subcolumn = &sparse.subcolumn;
-            entry.rowids = &sparse.rowids;
-            plan.entries.push_back(entry);
-        }
-        return plan;
+        return DocValueMaterializationMode::SparseAllPaths;

Review Comment:
   这个 Sparse,有的时候是稀疏列的稀疏,有的时候是稀疏写入的稀疏,要不换个名字?



##########
be/src/storage/segment/variant/variant_column_writer_impl.cpp:
##########
@@ -254,39 +277,76 @@ void build_sparse_subcolumns(const ColumnVariant& 
variant, const DocValuePathSta
     }
 }
 
-SubcolumnWritePlan build_subcolumn_write_plan(const ColumnVariant& variant, 
size_t num_rows,
-                                              int64_t 
variant_doc_materialization_min_rows) {
-    SubcolumnWritePlan plan;
-    // Below threshold: skip materialization and let finalize() compute stats 
on demand.
-    if (num_rows < static_cast<size_t>(variant_doc_materialization_min_rows)) {
-        return plan;
+void set_doc_value_stats(const ColumnVariant& variant, const 
DocValuePathStats* precomputed_stats,
+                         DocValuePathStats* stats) {
+    if (precomputed_stats != nullptr) {

Review Comment:
   什么时候会是 nullptr?



##########
be/src/storage/segment/variant/variant_column_writer_impl.cpp:
##########
@@ -1368,30 +1691,19 @@ Status VariantColumnWriterImpl::finalize() {
     auto olap_data_convertor = std::make_unique<OlapBlockDataConvertor>();
 
     DCHECK(ptr->is_finalized());
+    const bool has_extracted_columns = _has_extracted_variant_columns();
+    const VariantFinalizeContext finalize_context =
+            build_variant_finalize_context(*_tablet_column, *ptr, 
has_extracted_columns);
 
-    for (const auto& entry : 
variant_util::get_sorted_subcolumns(ptr->get_subcolumns())) {
-        if (entry->path.empty()) {
-            // already handled
-            continue;
-        }
-        // Not supported nested path to generate sub column info, currently
-        if (entry->path.has_nested_part()) {
-            continue;
-        }
-        TabletSchema::SubColumnInfo sub_column_info;
-        if 
(variant_util::generate_sub_column_info(*_opts.rowset_ctx->tablet_schema,
-                                                   _tablet_column->unique_id(),
-                                                   entry->path.get_path(), 
&sub_column_info)) {
-            _subcolumns_info.emplace(entry->path.get_path(), 
std::move(sub_column_info));
-        }
+    if (finalize_context.prepare_parse_time_subcolumns) {
+        RETURN_IF_ERROR(collect_typed_subcolumn_info_from_parse_tree(
+                *ptr, *_tablet_column, *_opts.rowset_ctx->tablet_schema, 
&_subcolumns_info));
+        
RETURN_IF_ERROR(ptr->convert_typed_path_to_storage_type(_subcolumns_info));

Review Comment:
   这个时候 variant 里面没有子列,convert_typed_path_to_storage_type 好像没用?



##########
be/src/storage/segment/variant/variant_column_writer_impl.cpp:
##########
@@ -556,6 +623,220 @@ Status append_sparse_converted_column(const TabletColumn& 
tablet_column, ColumnW
     converter->clear_source_content(cid);
     return Status::OK();
 }
+
+bool has_doc_value_data(const ColumnVariant& variant) {
+    if (variant.size() == 0) {
+        return false;
+    }
+    const auto& offsets = variant.serialized_doc_value_column_offsets();
+    return !offsets.empty() && offsets[variant.size() - 1] > 0;
+}
+
+enum class VariantPayloadWritePath {
+    None,
+    ParseTimeSubcolumns,
+    RegularDocValueStaging,
+    PersistentDocValueMode,
+};
+
+struct VariantFinalizeContext {
+    bool use_regular_doc_value_staging = false;

Review Comment:
   这三个开关分别控制什么?



##########
be/src/storage/segment/variant/variant_column_writer_impl.cpp:
##########
@@ -556,6 +623,220 @@ Status append_sparse_converted_column(const TabletColumn& 
tablet_column, ColumnW
     converter->clear_source_content(cid);
     return Status::OK();
 }
+
+bool has_doc_value_data(const ColumnVariant& variant) {
+    if (variant.size() == 0) {
+        return false;
+    }
+    const auto& offsets = variant.serialized_doc_value_column_offsets();
+    return !offsets.empty() && offsets[variant.size() - 1] > 0;
+}
+
+enum class VariantPayloadWritePath {
+    None,
+    ParseTimeSubcolumns,
+    RegularDocValueStaging,
+    PersistentDocValueMode,
+};
+
+struct VariantFinalizeContext {
+    bool use_regular_doc_value_staging = false;
+    bool prepare_parse_time_subcolumns = true;
+    bool prepare_sparse_payload_from_parse_tree = false;
+    VariantPayloadWritePath payload_write_path = 
VariantPayloadWritePath::ParseTimeSubcolumns;
+};
+
+VariantFinalizeContext build_variant_finalize_context(const TabletColumn& 
tablet_column,
+                                                      const ColumnVariant& 
variant,
+                                                      bool 
has_extracted_columns) {
+    VariantFinalizeContext context;
+
+    // Plain non-doc VARIANT may arrive as doc-value KV staging from storage 
parse. The staging data
+    // is internal to this root writer and is converted into materialized 
subcolumns plus sparse
+    // payload. When extracted columns own the payload, leave this writer in 
metadata-only mode.
+    context.use_regular_doc_value_staging =
+            !has_extracted_columns && !tablet_column.variant_enable_doc_mode() 
&&
+            !tablet_column.variant_enable_nested_group() && 
has_doc_value_data(variant);
+    context.prepare_parse_time_subcolumns = 
!context.use_regular_doc_value_staging;
+    context.prepare_sparse_payload_from_parse_tree =
+            context.prepare_parse_time_subcolumns &&
+            variant_util::should_write_variant_binary_columns(tablet_column);
+
+    if (has_extracted_columns) {
+        context.payload_write_path = VariantPayloadWritePath::None;
+    } else if (context.use_regular_doc_value_staging) {
+        context.payload_write_path = 
VariantPayloadWritePath::RegularDocValueStaging;
+    } else if (tablet_column.variant_enable_doc_mode()) {
+        context.payload_write_path = 
VariantPayloadWritePath::PersistentDocValueMode;
+    } else {
+        context.payload_write_path = 
VariantPayloadWritePath::ParseTimeSubcolumns;

Review Comment:
   ParseTimeSubcolumns 是 nested 的写入吗?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to