This is an automated email from the ASF dual-hosted git repository.

suxiaogang223 pushed a commit to branch codex/complex-column-predicate-stats-filtering
in repository https://gitbox.apache.org/repos/asf/doris.git

commit b6d913a83c0c8bf871649ef3341c8be28a983d39
Author: Socrates <[email protected]>
AuthorDate: Thu Jun 4 02:44:14 2026 +0800

    [improvement](be) Map nested parquet predicates through column mapper
    
    ### What problem does this PR solve?
    
    Issue Number: close #xxx
    
    Related PR: #xxx
    
    Problem Summary: Resolve nested struct filter projection and pruning
    targets through ColumnMapping before falling back to file schema names,
    so renamed mapped children can still produce file-local pruning paths.
    
    ### Release note
    
    None
    
    ### Check List (For Author)
    
    - Test: Unit Test
    
        - git diff --check
    
    - Behavior changed: No
    
    - Does this need documentation: Yes
---
 be/src/format/reader/column_mapper.cpp             | 117 ++++++++++++++++++++-
 be/src/format/reader/column_mapper.h               |   1 +
 be/test/format/new_parquet/parquet_reader_test.cpp |  55 ++++++++++
 ...complex-column-predicate-and-stats-filtering.md |   4 +-
 4 files changed, 174 insertions(+), 3 deletions(-)

diff --git a/be/src/format/reader/column_mapper.cpp 
b/be/src/format/reader/column_mapper.cpp
index 7e7e7894ddd..b338b60dce9 100644
--- a/be/src/format/reader/column_mapper.cpp
+++ b/be/src/format/reader/column_mapper.cpp
@@ -302,6 +302,20 @@ static const SchemaField* resolve_file_child(const 
std::vector<SchemaField>& chi
     return &children[selector.ordinal - 1];
 }
 
+static const ColumnMapping* resolve_mapped_child(const 
std::vector<ColumnMapping>& child_mappings,
+                                                 const StructChildSelector& 
selector) {
+    if (selector.by_name) {
+        const auto child_it = std::ranges::find_if(child_mappings, [&](const 
auto& child_mapping) {
+            return child_mapping.table_column_name == selector.name;
+        });
+        return child_it == child_mappings.end() ? nullptr : &*child_it;
+    }
+    if (selector.ordinal == 0 || selector.ordinal > child_mappings.size()) {
+        return nullptr;
+    }
+    return &child_mappings[selector.ordinal - 1];
+}
+
 static Status build_filter_projection_path(const std::vector<SchemaField>& 
children,
                                            std::span<const 
StructChildSelector> selectors,
                                            FieldProjection* projection) {
@@ -335,6 +349,47 @@ static Status build_filter_projection_path(const 
std::vector<SchemaField>& child
     return Status::OK();
 }
 
+// Prefer the table-to-file mapping tree for nested filter projection. This 
keeps renamed
+// children and field-id schema evolution in the mapper instead of leaking 
table names into the
+// file reader request. The file schema fallback below is only for filter-only 
children that do not
+// have an output child mapping yet.
+static Status build_filter_projection_path(const ColumnMapping& mapping,
+                                           std::span<const 
StructChildSelector> selectors,
+                                           FieldProjection* projection) {
+    DORIS_CHECK(projection != nullptr);
+    if (selectors.empty()) {
+        return Status::InvalidArgument("Nested struct selector path is empty");
+    }
+    const auto* child_mapping = resolve_mapped_child(mapping.child_mappings, 
selectors.front());
+    if (child_mapping == nullptr) {
+        return build_filter_projection_path(mapping.original_file_children, 
selectors, projection);
+    }
+    if (!child_mapping->field_id.has_value()) {
+        projection->field_id = -1;
+        return Status::OK();
+    }
+    projection->field_id = *child_mapping->field_id;
+    projection->project_all_children = selectors.size() == 1;
+    projection->children.clear();
+    if (selectors.size() == 1) {
+        return Status::OK();
+    }
+    FieldProjection child_projection;
+    if (child_mapping->child_mappings.empty()) {
+        
RETURN_IF_ERROR(build_filter_projection_path(child_mapping->original_file_children,
+                                                     selectors.subspan(1), 
&child_projection));
+    } else {
+        RETURN_IF_ERROR(build_filter_projection_path(*child_mapping, 
selectors.subspan(1),
+                                                     &child_projection));
+    }
+    if (child_projection.field_id < 0) {
+        projection->field_id = -1;
+        return Status::OK();
+    }
+    projection->children.push_back(std::move(child_projection));
+    return Status::OK();
+}
+
 static const SchemaField* resolve_filter_schema_path(const 
std::vector<SchemaField>& children,
                                                      std::span<const 
StructChildSelector> selectors,
                                                      std::vector<int32_t>* 
file_child_id_path) {
@@ -363,6 +418,49 @@ static const SchemaField* resolve_filter_schema_path(const 
std::vector<SchemaFie
     return leaf;
 }
 
+// Resolve a nested predicate through ColumnMapping when possible. The 
returned child-id path and
+// leaf type are file-local, so parquet pruning can stay independent from 
table/global schema.
+static bool resolve_mapped_filter_schema_path(const ColumnMapping& mapping,
+                                              std::span<const 
StructChildSelector> selectors,
+                                              std::vector<int32_t>* 
file_child_id_path,
+                                              std::string* leaf_name, 
DataTypePtr* leaf_type) {
+    DORIS_CHECK(file_child_id_path != nullptr);
+    DORIS_CHECK(leaf_name != nullptr);
+    DORIS_CHECK(leaf_type != nullptr);
+    if (selectors.empty()) {
+        return false;
+    }
+    const auto* child_mapping = resolve_mapped_child(mapping.child_mappings, 
selectors.front());
+    if (child_mapping == nullptr) {
+        return false;
+    }
+    if (!child_mapping->field_id.has_value()) {
+        file_child_id_path->clear();
+        return false;
+    }
+    file_child_id_path->push_back(*child_mapping->field_id);
+    if (selectors.size() == 1) {
+        if (child_mapping->file_type == nullptr ||
+            
is_complex_type(remove_nullable(child_mapping->file_type)->get_primitive_type()))
 {
+            file_child_id_path->clear();
+            return false;
+        }
+        *leaf_name = child_mapping->file_column_name;
+        *leaf_type = remove_nullable(child_mapping->file_type);
+        return true;
+    }
+    if (child_mapping->child_mappings.empty()) {
+        file_child_id_path->clear();
+        return false;
+    }
+    if (!resolve_mapped_filter_schema_path(*child_mapping, 
selectors.subspan(1), file_child_id_path,
+                                           leaf_name, leaf_type)) {
+        file_child_id_path->clear();
+        return false;
+    }
+    return true;
+}
+
 static bool resolve_nested_predicate_target(const NestedStructPath& path,
                                             const std::vector<ColumnMapping>& 
mappings,
                                             NestedPredicateTargetInfo* target) 
{
@@ -377,6 +475,17 @@ static bool resolve_nested_predicate_target(const 
NestedStructPath& path,
         return false;
     }
     std::vector<int32_t> file_child_id_path;
+    std::string leaf_name;
+    DataTypePtr file_leaf_type;
+    if (resolve_mapped_filter_schema_path(*mapping_it, path.selectors, 
&file_child_id_path,
+                                          &leaf_name, &file_leaf_type)) {
+        target->root_file_column_id = *mapping_it->field_id;
+        target->file_child_id_path = std::move(file_child_id_path);
+        target->leaf_name = std::move(leaf_name);
+        target->file_leaf_type = std::move(file_leaf_type);
+        return true;
+    }
+
     const auto* leaf = 
resolve_filter_schema_path(mapping_it->original_file_children,
                                                   path.selectors, 
&file_child_id_path);
     if (leaf == nullptr || leaf->type == nullptr ||
@@ -1088,8 +1197,8 @@ static Status build_filter_projection_map(const 
std::vector<TableFilter>& table_
             }
 
             FieldProjection child_projection;
-            
RETURN_IF_ERROR(build_filter_projection_path(mapping_it->original_file_children,
-                                                         path.selectors, 
&child_projection));
+            RETURN_IF_ERROR(
+                    build_filter_projection_path(*mapping_it, path.selectors, 
&child_projection));
             if (child_projection.field_id < 0) {
                 continue;
             }
@@ -1173,6 +1282,7 @@ Status TableColumnMapper::create_mapping(const 
std::vector<TableColumn>& project
     for (const auto& table_column : projected_columns) {
         ColumnMapping mapping;
         mapping.table_column_id = table_column.id;
+        mapping.table_column_name = table_column.name;
         mapping.table_type = table_column.type;
         if (table_column.is_partition_key && 
partition_values.contains(table_column.name)) {
             // 1. Partition column, use partition value as a constant mapping. 
Note that partition column may also have default expression, but partition 
value should take precedence if it exists.
@@ -1380,6 +1490,7 @@ Status TableColumnMapper::_create_direct_mapping(const 
TableColumn& table_column
         return Status::InvalidArgument("mapping is null");
     }
     mapping->field_id = file_field.id;
+    mapping->table_column_name = table_column.name;
     mapping->file_column_name = file_field.name;
     mapping->original_file_type = file_field.type;
     mapping->original_file_children = file_field.children;
@@ -1400,6 +1511,7 @@ Status TableColumnMapper::_create_direct_mapping(const 
TableColumn& table_column
                 }
                 ColumnMapping child_mapping;
                 child_mapping.table_column_id = table_child.id;
+                child_mapping.table_column_name = table_child.name;
                 child_mapping.file_column_name = table_child.name;
                 child_mapping.table_type = table_child.type;
                 child_mapping.file_type = table_child.type;
@@ -1410,6 +1522,7 @@ Status TableColumnMapper::_create_direct_mapping(const 
TableColumn& table_column
             }
             ColumnMapping child_mapping;
             child_mapping.table_column_id = table_child.id;
+            child_mapping.table_column_name = table_child.name;
             child_mapping.table_type = table_child.type;
             RETURN_IF_ERROR(_create_direct_mapping(table_child, *file_child, 
&child_mapping));
             mapping->child_mappings.push_back(std::move(child_mapping));
diff --git a/be/src/format/reader/column_mapper.h 
b/be/src/format/reader/column_mapper.h
index a70e246bcce..78ca20d6091 100644
--- a/be/src/format/reader/column_mapper.h
+++ b/be/src/format/reader/column_mapper.h
@@ -62,6 +62,7 @@ enum TableVirtualColumnType {
 // 这是 table 层和 file 层的核心边界对象。
 struct ColumnMapping {
     int32_t table_column_id = -1;
+    std::string table_column_name;
     // File-local field id for top-level columns, or child id for nested 
columns.
     std::optional<int32_t> field_id;
     std::string file_column_name;
diff --git a/be/test/format/new_parquet/parquet_reader_test.cpp 
b/be/test/format/new_parquet/parquet_reader_test.cpp
index b4815ab2d5a..260e65695db 100644
--- a/be/test/format/new_parquet/parquet_reader_test.cpp
+++ b/be/test/format/new_parquet/parquet_reader_test.cpp
@@ -740,6 +740,61 @@ TEST(TableColumnMapperTest, 
MergesStructFilterOnlyChildIntoPredicateProjection)
     EXPECT_EQ(read_type->get_element_name(1), "a");
 }
 
+TEST(TableColumnMapperTest, MapsRenamedNestedStructPredicateByFieldId) {
+    auto id_type = std::make_shared<DataTypeInt32>();
+    reader::SchemaField file_child;
+    file_child.id = 101;
+    file_child.name = "file_id";
+    file_child.type = id_type;
+    reader::SchemaField struct_field;
+    struct_field.id = 100;
+    struct_field.name = "s";
+    struct_field.type = std::make_shared<DataTypeStruct>(DataTypes {id_type}, 
Strings {"file_id"});
+    struct_field.children = {file_child};
+
+    reader::TableColumn table_child;
+    table_child.id = 101;
+    table_child.name = "table_id";
+    table_child.type = id_type;
+    reader::TableColumn table_column;
+    table_column.id = 100;
+    table_column.name = "s";
+    table_column.type = std::make_shared<DataTypeStruct>(DataTypes {id_type}, 
Strings {"table_id"});
+    table_column.children = {table_child};
+
+    auto filter_expr = std::make_shared<TestFunctionExpr>(
+            "gt", std::make_shared<DataTypeUInt8>(), 
TExprNodeType::BINARY_PRED, TExprOpcode::GT);
+    filter_expr->add_child(
+            struct_element_expr(TableSlotRef::create_shared(100, 100, -1, 
table_column.type, "s"),
+                                id_type, "table_id"));
+    filter_expr->add_child(TableLiteral::create_shared(id_type, 
Field::create_field<TYPE_INT>(5)));
+    reader::TableFilter table_filter {
+            .conjunct = VExprContext::create_shared(filter_expr),
+            .slot_ids = {100},
+    };
+
+    reader::TableColumnMapperOptions options;
+    options.mode = reader::TableColumnMappingMode::BY_FIELD_ID;
+    reader::TableColumnMapper mapper(options);
+    ASSERT_TRUE(mapper.create_mapping({table_column}, {}, 
{struct_field}).ok());
+
+    reader::FileScanRequest request;
+    ASSERT_TRUE(mapper.create_scan_request({table_filter}, {}, {table_column}, 
&request).ok());
+
+    ASSERT_EQ(request.predicate_columns.size(), 1);
+    const auto& projection = request.predicate_columns[0];
+    EXPECT_EQ(projection.field_id, 100);
+    ASSERT_FALSE(projection.project_all_children);
+    ASSERT_EQ(projection.children.size(), 1);
+    EXPECT_EQ(projection.children[0].field_id, 101);
+
+    ASSERT_EQ(request.column_predicate_filters.size(), 1);
+    EXPECT_EQ(request.column_predicate_filters[0].file_column_id, 100);
+    EXPECT_EQ(request.column_predicate_filters[0].file_child_id_path, 
std::vector<int32_t>({101}));
+    ASSERT_EQ(request.column_predicate_filters[0].predicates.size(), 1);
+    EXPECT_EQ(request.column_predicate_filters[0].predicates[0]->type(), 
PredicateType::GT);
+}
+
 TEST(TableColumnMapperTest, BuildsNestedStructInListPredicateFilter) {
     auto a_type = std::make_shared<DataTypeInt32>();
     auto b_type = std::make_shared<DataTypeString>();
diff --git a/docs/complex-column-predicate-and-stats-filtering.md 
b/docs/complex-column-predicate-and-stats-filtering.md
index 70ac659608a..3850199698a 100644
--- a/docs/complex-column-predicate-and-stats-filtering.md
+++ b/docs/complex-column-predicate-and-stats-filtering.md
@@ -174,6 +174,7 @@ SELECT s.name FROM t WHERE s.id > 5;
 - output child 顺序保持优先,filter-only child 追加到 read projection。
 - filter-only child 不加入 `ColumnMapping.child_mappings`,避免 table output 
materialization 把它当作输出字段。
 - `ColumnMapping` 保存 `original_file_type` / `original_file_children`,重复创建 
split-local request 时可以从原始 file schema 重建 read projection。
+- nested filter projection 优先通过 `ColumnMapping.child_mappings` 映射 table child 
到 file child;没有 child mapping 的 filter-only path 再回退到 file schema 解析。
 
 ### 5.3 nested file-layer pruning target
 
@@ -183,6 +184,7 @@ SELECT s.name FROM t WHERE s.id > 5;
 - `file_child_id_path` 是 top-level file column 下的 file-local child field id 
path,不是 table id,也不是 ordinal。
 - mapper 会从 AND 语义下的 `struct_element(...) op literal` / `literal op 
struct_element(...)` 构造 nested file-layer pruning hint。
 - mapper 会从 AND 语义下的 `struct_element(...) IN (...)` 构造 nested `IN_LIST` 
pruning hint。
+- 对已经存在 `ColumnMapping` 的 nested child,mapper 使用 table child name + field-id 
mapping 生成 file-local `file_child_id_path`,支持 table/file nested child rename。
 - 不从 OR/NOT/任意函数子树中提取 pruning predicate,避免把非必要条件当成必需条件裁剪。
 - literal 转换到 file leaf type 失败、path 解析失败、leaf 不是 primitive 时,不生成 pruning hint。
 
@@ -259,7 +261,7 @@ page index 对 repeated leaf 的 row range 语义复杂。本轮只允许 non-re
 ## 7. 后续工作
 
 - 如果后续 Arrow writer 或外部 fixture 能稳定提供 bloom filter metadata,补 nested bloom 
pruning 的真实 parquet fixture。
-- schema change 场景下,把 table nested path 到 file nested path 的 mapping 入口收敛到 
mapper,不让 file reader 理解 table/global schema。
+- 完整复杂 child schema change 需要 FE/table reader 提供完整 nested table mapping;file 
reader 仍不理解 table/global schema。
 - LIST/MAP/repeated leaf 只有在 Dremel row semantics 和 row-range 语义明确后再接入 pruning。
 
 ## 8. 需要避免的实现


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to