This is an automated email from the ASF dual-hosted git repository. suxiaogang223 pushed a commit to branch codex/complex-column-predicate-stats-filtering in repository https://gitbox.apache.org/repos/asf/doris.git
commit b6d913a83c0c8bf871649ef3341c8be28a983d39 Author: Socrates <[email protected]> AuthorDate: Thu Jun 4 02:44:14 2026 +0800 [improvement](be) Map nested parquet predicates through column mapper ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: Resolve nested struct filter projection and pruning targets through ColumnMapping before falling back to file schema names, so renamed mapped children can still produce file-local pruning paths. ### Release note None ### Check List (For Author) - Test: Unit Test - git diff --check - Behavior changed: No - Does this need documentation: Yes --- be/src/format/reader/column_mapper.cpp | 117 ++++++++++++++++++++- be/src/format/reader/column_mapper.h | 1 + be/test/format/new_parquet/parquet_reader_test.cpp | 55 ++++++++++ ...complex-column-predicate-and-stats-filtering.md | 4 +- 4 files changed, 174 insertions(+), 3 deletions(-) diff --git a/be/src/format/reader/column_mapper.cpp b/be/src/format/reader/column_mapper.cpp index 7e7e7894ddd..b338b60dce9 100644 --- a/be/src/format/reader/column_mapper.cpp +++ b/be/src/format/reader/column_mapper.cpp @@ -302,6 +302,20 @@ static const SchemaField* resolve_file_child(const std::vector<SchemaField>& chi return &children[selector.ordinal - 1]; } +static const ColumnMapping* resolve_mapped_child(const std::vector<ColumnMapping>& child_mappings, + const StructChildSelector& selector) { + if (selector.by_name) { + const auto child_it = std::ranges::find_if(child_mappings, [&](const auto& child_mapping) { + return child_mapping.table_column_name == selector.name; + }); + return child_it == child_mappings.end() ? nullptr : &*child_it; + } + if (selector.ordinal == 0 || selector.ordinal > child_mappings.size()) { + return nullptr; + } + return &child_mappings[selector.ordinal - 1]; +} + static Status build_filter_projection_path(const std::vector<SchemaField>& children, std::span<const StructChildSelector> selectors, FieldProjection* projection) { @@ -335,6 +349,47 @@ static Status build_filter_projection_path(const std::vector<SchemaField>& child return Status::OK(); } +// Prefer the table-to-file mapping tree for nested filter projection. This keeps renamed +// children and field-id schema evolution in the mapper instead of leaking table names into the +// file reader request. The file schema fallback below is only for filter-only children that do not +// have an output child mapping yet. +static Status build_filter_projection_path(const ColumnMapping& mapping, + std::span<const StructChildSelector> selectors, + FieldProjection* projection) { + DORIS_CHECK(projection != nullptr); + if (selectors.empty()) { + return Status::InvalidArgument("Nested struct selector path is empty"); + } + const auto* child_mapping = resolve_mapped_child(mapping.child_mappings, selectors.front()); + if (child_mapping == nullptr) { + return build_filter_projection_path(mapping.original_file_children, selectors, projection); + } + if (!child_mapping->field_id.has_value()) { + projection->field_id = -1; + return Status::OK(); + } + projection->field_id = *child_mapping->field_id; + projection->project_all_children = selectors.size() == 1; + projection->children.clear(); + if (selectors.size() == 1) { + return Status::OK(); + } + FieldProjection child_projection; + if (child_mapping->child_mappings.empty()) { + RETURN_IF_ERROR(build_filter_projection_path(child_mapping->original_file_children, + selectors.subspan(1), &child_projection)); + } else { + RETURN_IF_ERROR(build_filter_projection_path(*child_mapping, selectors.subspan(1), + &child_projection)); + } + if (child_projection.field_id < 0) { + projection->field_id = -1; + return Status::OK(); + } + projection->children.push_back(std::move(child_projection)); + return Status::OK(); +} + static const SchemaField* resolve_filter_schema_path(const std::vector<SchemaField>& children, std::span<const StructChildSelector> selectors, std::vector<int32_t>* file_child_id_path) { @@ -363,6 +418,49 @@ static const SchemaField* resolve_filter_schema_path(const std::vector<SchemaFie return leaf; } +// Resolve a nested predicate through ColumnMapping when possible. The returned child-id path and +// leaf type are file-local, so parquet pruning can stay independent from table/global schema. +static bool resolve_mapped_filter_schema_path(const ColumnMapping& mapping, + std::span<const StructChildSelector> selectors, + std::vector<int32_t>* file_child_id_path, + std::string* leaf_name, DataTypePtr* leaf_type) { + DORIS_CHECK(file_child_id_path != nullptr); + DORIS_CHECK(leaf_name != nullptr); + DORIS_CHECK(leaf_type != nullptr); + if (selectors.empty()) { + return false; + } + const auto* child_mapping = resolve_mapped_child(mapping.child_mappings, selectors.front()); + if (child_mapping == nullptr) { + return false; + } + if (!child_mapping->field_id.has_value()) { + file_child_id_path->clear(); + return false; + } + file_child_id_path->push_back(*child_mapping->field_id); + if (selectors.size() == 1) { + if (child_mapping->file_type == nullptr || + is_complex_type(remove_nullable(child_mapping->file_type)->get_primitive_type())) { + file_child_id_path->clear(); + return false; + } + *leaf_name = child_mapping->file_column_name; + *leaf_type = remove_nullable(child_mapping->file_type); + return true; + } + if (child_mapping->child_mappings.empty()) { + file_child_id_path->clear(); + return false; + } + if (!resolve_mapped_filter_schema_path(*child_mapping, selectors.subspan(1), file_child_id_path, + leaf_name, leaf_type)) { + file_child_id_path->clear(); + return false; + } + return true; +} + static bool resolve_nested_predicate_target(const NestedStructPath& path, const std::vector<ColumnMapping>& mappings, NestedPredicateTargetInfo* target) { @@ -377,6 +475,17 @@ static bool resolve_nested_predicate_target(const NestedStructPath& path, return false; } std::vector<int32_t> file_child_id_path; + std::string leaf_name; + DataTypePtr file_leaf_type; + if (resolve_mapped_filter_schema_path(*mapping_it, path.selectors, &file_child_id_path, + &leaf_name, &file_leaf_type)) { + target->root_file_column_id = *mapping_it->field_id; + target->file_child_id_path = std::move(file_child_id_path); + target->leaf_name = std::move(leaf_name); + target->file_leaf_type = std::move(file_leaf_type); + return true; + } + const auto* leaf = resolve_filter_schema_path(mapping_it->original_file_children, path.selectors, &file_child_id_path); if (leaf == nullptr || leaf->type == nullptr || @@ -1088,8 +1197,8 @@ static Status build_filter_projection_map(const std::vector<TableFilter>& table_ } FieldProjection child_projection; - RETURN_IF_ERROR(build_filter_projection_path(mapping_it->original_file_children, - path.selectors, &child_projection)); + RETURN_IF_ERROR( + build_filter_projection_path(*mapping_it, path.selectors, &child_projection)); if (child_projection.field_id < 0) { continue; } @@ -1173,6 +1282,7 @@ Status TableColumnMapper::create_mapping(const std::vector<TableColumn>& project for (const auto& table_column : projected_columns) { ColumnMapping mapping; mapping.table_column_id = table_column.id; + mapping.table_column_name = table_column.name; mapping.table_type = table_column.type; if (table_column.is_partition_key && partition_values.contains(table_column.name)) { // 1. Partition column, use partition value as a constant mapping. Note that partition column may also have default expression, but partition value should take precedence if it exists. @@ -1380,6 +1490,7 @@ Status TableColumnMapper::_create_direct_mapping(const TableColumn& table_column return Status::InvalidArgument("mapping is null"); } mapping->field_id = file_field.id; + mapping->table_column_name = table_column.name; mapping->file_column_name = file_field.name; mapping->original_file_type = file_field.type; mapping->original_file_children = file_field.children; @@ -1400,6 +1511,7 @@ Status TableColumnMapper::_create_direct_mapping(const TableColumn& table_column } ColumnMapping child_mapping; child_mapping.table_column_id = table_child.id; + child_mapping.table_column_name = table_child.name; child_mapping.file_column_name = table_child.name; child_mapping.table_type = table_child.type; child_mapping.file_type = table_child.type; @@ -1410,6 +1522,7 @@ Status TableColumnMapper::_create_direct_mapping(const TableColumn& table_column } ColumnMapping child_mapping; child_mapping.table_column_id = table_child.id; + child_mapping.table_column_name = table_child.name; child_mapping.table_type = table_child.type; RETURN_IF_ERROR(_create_direct_mapping(table_child, *file_child, &child_mapping)); mapping->child_mappings.push_back(std::move(child_mapping)); diff --git a/be/src/format/reader/column_mapper.h b/be/src/format/reader/column_mapper.h index a70e246bcce..78ca20d6091 100644 --- a/be/src/format/reader/column_mapper.h +++ b/be/src/format/reader/column_mapper.h @@ -62,6 +62,7 @@ enum TableVirtualColumnType { // 这是 table 层和 file 层的核心边界对象。 struct ColumnMapping { int32_t table_column_id = -1; + std::string table_column_name; // File-local field id for top-level columns, or child id for nested columns. std::optional<int32_t> field_id; std::string file_column_name; diff --git a/be/test/format/new_parquet/parquet_reader_test.cpp b/be/test/format/new_parquet/parquet_reader_test.cpp index b4815ab2d5a..260e65695db 100644 --- a/be/test/format/new_parquet/parquet_reader_test.cpp +++ b/be/test/format/new_parquet/parquet_reader_test.cpp @@ -740,6 +740,61 @@ TEST(TableColumnMapperTest, MergesStructFilterOnlyChildIntoPredicateProjection) EXPECT_EQ(read_type->get_element_name(1), "a"); } +TEST(TableColumnMapperTest, MapsRenamedNestedStructPredicateByFieldId) { + auto id_type = std::make_shared<DataTypeInt32>(); + reader::SchemaField file_child; + file_child.id = 101; + file_child.name = "file_id"; + file_child.type = id_type; + reader::SchemaField struct_field; + struct_field.id = 100; + struct_field.name = "s"; + struct_field.type = std::make_shared<DataTypeStruct>(DataTypes {id_type}, Strings {"file_id"}); + struct_field.children = {file_child}; + + reader::TableColumn table_child; + table_child.id = 101; + table_child.name = "table_id"; + table_child.type = id_type; + reader::TableColumn table_column; + table_column.id = 100; + table_column.name = "s"; + table_column.type = std::make_shared<DataTypeStruct>(DataTypes {id_type}, Strings {"table_id"}); + table_column.children = {table_child}; + + auto filter_expr = std::make_shared<TestFunctionExpr>( + "gt", std::make_shared<DataTypeUInt8>(), TExprNodeType::BINARY_PRED, TExprOpcode::GT); + filter_expr->add_child( + struct_element_expr(TableSlotRef::create_shared(100, 100, -1, table_column.type, "s"), + id_type, "table_id")); + filter_expr->add_child(TableLiteral::create_shared(id_type, Field::create_field<TYPE_INT>(5))); + reader::TableFilter table_filter { + .conjunct = VExprContext::create_shared(filter_expr), + .slot_ids = {100}, + }; + + reader::TableColumnMapperOptions options; + options.mode = reader::TableColumnMappingMode::BY_FIELD_ID; + reader::TableColumnMapper mapper(options); + ASSERT_TRUE(mapper.create_mapping({table_column}, {}, {struct_field}).ok()); + + reader::FileScanRequest request; + ASSERT_TRUE(mapper.create_scan_request({table_filter}, {}, {table_column}, &request).ok()); + + ASSERT_EQ(request.predicate_columns.size(), 1); + const auto& projection = request.predicate_columns[0]; + EXPECT_EQ(projection.field_id, 100); + ASSERT_FALSE(projection.project_all_children); + ASSERT_EQ(projection.children.size(), 1); + EXPECT_EQ(projection.children[0].field_id, 101); + + ASSERT_EQ(request.column_predicate_filters.size(), 1); + EXPECT_EQ(request.column_predicate_filters[0].file_column_id, 100); + EXPECT_EQ(request.column_predicate_filters[0].file_child_id_path, std::vector<int32_t>({101})); + ASSERT_EQ(request.column_predicate_filters[0].predicates.size(), 1); + EXPECT_EQ(request.column_predicate_filters[0].predicates[0]->type(), PredicateType::GT); +} + TEST(TableColumnMapperTest, BuildsNestedStructInListPredicateFilter) { auto a_type = std::make_shared<DataTypeInt32>(); auto b_type = std::make_shared<DataTypeString>(); diff --git a/docs/complex-column-predicate-and-stats-filtering.md b/docs/complex-column-predicate-and-stats-filtering.md index 70ac659608a..3850199698a 100644 --- a/docs/complex-column-predicate-and-stats-filtering.md +++ b/docs/complex-column-predicate-and-stats-filtering.md @@ -174,6 +174,7 @@ SELECT s.name FROM t WHERE s.id > 5; - output child 顺序保持优先,filter-only child 追加到 read projection。 - filter-only child 不加入 `ColumnMapping.child_mappings`,避免 table output materialization 把它当作输出字段。 - `ColumnMapping` 保存 `original_file_type` / `original_file_children`,重复创建 split-local request 时可以从原始 file schema 重建 read projection。 +- nested filter projection 优先通过 `ColumnMapping.child_mappings` 映射 table child 到 file child;没有 child mapping 的 filter-only path 再回退到 file schema 解析。 ### 5.3 nested file-layer pruning target @@ -183,6 +184,7 @@ SELECT s.name FROM t WHERE s.id > 5; - `file_child_id_path` 是 top-level file column 下的 file-local child field id path,不是 table id,也不是 ordinal。 - mapper 会从 AND 语义下的 `struct_element(...) op literal` / `literal op struct_element(...)` 构造 nested file-layer pruning hint。 - mapper 会从 AND 语义下的 `struct_element(...) IN (...)` 构造 nested `IN_LIST` pruning hint。 +- 对已经存在 `ColumnMapping` 的 nested child,mapper 使用 table child name + field-id mapping 生成 file-local `file_child_id_path`,支持 table/file nested child rename。 - 不从 OR/NOT/任意函数子树中提取 pruning predicate,避免把非必要条件当成必需条件裁剪。 - literal 转换到 file leaf type 失败、path 解析失败、leaf 不是 primitive 时,不生成 pruning hint。 @@ -259,7 +261,7 @@ page index 对 repeated leaf 的 row range 语义复杂。本轮只允许 non-re ## 7. 后续工作 - 如果后续 Arrow writer 或外部 fixture 能稳定提供 bloom filter metadata,补 nested bloom pruning 的真实 parquet fixture。 -- schema change 场景下,把 table nested path 到 file nested path 的 mapping 入口收敛到 mapper,不让 file reader 理解 table/global schema。 +- 完整复杂 child schema change 需要 FE/table reader 提供完整 nested table mapping;file reader 仍不理解 table/global schema。 - LIST/MAP/repeated leaf 只有在 Dremel row semantics 和 row-range 语义明确后再接入 pruning。 ## 8. 需要避免的实现 --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
