This is an automated email from the ASF dual-hosted git repository. suxiaogang223 pushed a commit to branch codex/complex-column-predicate-stats-filtering in repository https://gitbox.apache.org/repos/asf/doris.git
commit c3e3771bb230c0197f83e8a3d1d0b55ccdae490b Author: Socrates <[email protected]> AuthorDate: Thu Jun 4 02:35:14 2026 +0800 [test](be) Cover nested parquet pruning fixtures ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: Add real parquet fixtures for nested struct dictionary and page-index pruning, and update the complex predicate pruning status document. ### Release note None ### Check List (For Author) - Test: Unit Test - git diff --check - Behavior changed: No - Does this need documentation: Yes --- be/test/format/new_parquet/parquet_reader_test.cpp | 147 +++++++++++++++++++++ ...complex-column-predicate-and-stats-filtering.md | 11 +- 2 files changed, 152 insertions(+), 6 deletions(-) diff --git a/be/test/format/new_parquet/parquet_reader_test.cpp b/be/test/format/new_parquet/parquet_reader_test.cpp index 13c7ee419cc..b4815ab2d5a 100644 --- a/be/test/format/new_parquet/parquet_reader_test.cpp +++ b/be/test/format/new_parquet/parquet_reader_test.cpp @@ -375,6 +375,31 @@ void write_dictionary_filter_parquet_file(const std::string& file_path) { builder.build())); } +void write_nested_dictionary_filter_parquet_file(const std::string& file_path) { + auto id_field = arrow::field("id", arrow::int32(), false); + auto name_field = arrow::field("name", arrow::utf8(), false); + auto struct_type = arrow::struct_({id_field, name_field}); + auto schema = arrow::schema({ + arrow::field("s", struct_type, false), + }); + auto table = arrow::Table::Make( + schema, {build_struct_array({1, 2, 3, 4, 5, 6}, {"aa", "az", "lm", "lz", "za", "zz"})}); + + auto file_result = arrow::io::FileOutputStream::Open(file_path); + ASSERT_TRUE(file_result.ok()) << file_result.status(); + std::shared_ptr<arrow::io::FileOutputStream> out = *file_result; + + ::parquet::WriterProperties::Builder builder; + builder.version(::parquet::ParquetVersion::PARQUET_2_6); + builder.data_page_version(::parquet::ParquetDataPageVersion::V2); + builder.compression(::parquet::Compression::UNCOMPRESSED); + builder.enable_dictionary("s.name"); + builder.disable_dictionary("s.id"); + builder.disable_statistics(); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), out, 1, + builder.build())); +} + void write_dictionary_edge_parquet_file(const std::string& file_path) { auto schema = arrow::schema({ arrow::field("id", arrow::int32(), false), @@ -400,6 +425,38 @@ void write_dictionary_edge_parquet_file(const std::string& file_path) { builder.build())); } +void write_nested_page_index_filter_parquet_file(const std::string& file_path) { + std::vector<int32_t> ids(128); + std::iota(ids.begin(), ids.end(), 0); + std::vector<std::string> names; + names.reserve(ids.size()); + for (const auto id : ids) { + names.push_back("name-" + std::to_string(id)); + } + auto id_field = arrow::field("id", arrow::int32(), false); + auto name_field = arrow::field("name", arrow::utf8(), false); + auto struct_type = arrow::struct_({id_field, name_field}); + auto schema = arrow::schema({ + arrow::field("s", struct_type, false), + }); + auto table = arrow::Table::Make(schema, {build_struct_array(ids, names)}); + + auto file_result = arrow::io::FileOutputStream::Open(file_path); + ASSERT_TRUE(file_result.ok()) << file_result.status(); + std::shared_ptr<arrow::io::FileOutputStream> out = *file_result; + + ::parquet::WriterProperties::Builder builder; + builder.version(::parquet::ParquetVersion::PARQUET_2_6); + builder.data_page_version(::parquet::ParquetDataPageVersion::V2); + builder.compression(::parquet::Compression::UNCOMPRESSED); + builder.disable_dictionary(); + builder.enable_write_page_index(); + builder.write_batch_size(8); + builder.data_pagesize(10); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), out, + ids.size(), builder.build())); +} + void write_page_index_filter_parquet_file(const std::string& file_path) { std::vector<int32_t> ids(128); std::iota(ids.begin(), ids.end(), 0); @@ -1650,6 +1707,50 @@ TEST_F(NewParquetReaderTest, NestedStructPredicateFiltersRowGroupsByStatistics) EXPECT_EQ(plan.pruning_stats.filtered_group_rows, 2); } +TEST_F(NewParquetReaderTest, NestedStructPredicateFiltersRowGroupsByDictionary) { + write_nested_dictionary_filter_parquet_file(_file_path); + auto parquet_file_reader = ::parquet::ParquetFileReader::OpenFile(_file_path, false); + ASSERT_EQ(parquet_file_reader->metadata()->num_row_groups(), 6); + for (int row_group_idx = 0; row_group_idx < 6; ++row_group_idx) { + auto row_group = parquet_file_reader->metadata()->RowGroup(row_group_idx); + ASSERT_NE(row_group, nullptr); + auto name_chunk = row_group->ColumnChunk(1); + ASSERT_NE(name_chunk, nullptr); + ASSERT_TRUE(name_chunk->has_dictionary_page()); + ASSERT_TRUE(name_chunk->statistics() == nullptr || !name_chunk->statistics()->HasMinMax()); + } + + std::vector<std::unique_ptr<parquet::ParquetColumnSchema>> file_schema; + auto schema_descriptor = parquet_file_reader->metadata()->schema(); + ASSERT_NE(schema_descriptor, nullptr); + ASSERT_TRUE(parquet::build_parquet_column_schema(*schema_descriptor, &file_schema).ok()); + ASSERT_EQ(file_schema.size(), 1); + ASSERT_EQ(file_schema[0]->children.size(), 2); + ASSERT_EQ(file_schema[0]->children[1]->name, "name"); + + reader::FileScanRequest request; + reader::FileColumnPredicateFilter column_filter; + column_filter.file_column_id = 0; + column_filter.file_child_id_path = {1}; + auto name_type = std::make_shared<DataTypeString>(); + column_filter.predicates.push_back(create_comparison_predicate<PredicateType::EQ>( + 0, "name", name_type, Field::create_field<TYPE_STRING>("lm"), false)); + request.column_predicate_filters.push_back(std::move(column_filter)); + + parquet::RowGroupScanPlan plan; + parquet::ParquetScanRange scan_range; + ASSERT_TRUE(parquet::plan_parquet_row_groups(*parquet_file_reader->metadata(), + parquet_file_reader.get(), file_schema, request, + scan_range, false, &plan) + .ok()); + ASSERT_EQ(plan.row_groups.size(), 1); + EXPECT_EQ(plan.row_groups[0].row_group_id, 2); + EXPECT_EQ(plan.pruning_stats.total_row_groups, 6); + EXPECT_EQ(plan.pruning_stats.selected_row_groups, 1); + EXPECT_EQ(plan.pruning_stats.filtered_row_groups_by_dictionary, 5); + EXPECT_EQ(plan.pruning_stats.filtered_group_rows, 5); +} + TEST_F(NewParquetReaderTest, PlannerNarrowsRowRangesByPageIndex) { write_page_index_filter_parquet_file(_file_path); auto parquet_file_reader = ::parquet::ParquetFileReader::OpenFile(_file_path, false); @@ -1693,6 +1794,52 @@ TEST_F(NewParquetReaderTest, PlannerNarrowsRowRangesByPageIndex) { EXPECT_EQ(plan.pruning_stats.selected_row_ranges, plan.row_groups[0].selected_ranges.size()); } +TEST_F(NewParquetReaderTest, NestedStructPredicateNarrowsRowRangesByPageIndex) { + write_nested_page_index_filter_parquet_file(_file_path); + auto parquet_file_reader = ::parquet::ParquetFileReader::OpenFile(_file_path, false); + ASSERT_EQ(parquet_file_reader->metadata()->num_row_groups(), 1); + auto page_index_reader = parquet_file_reader->GetPageIndexReader(); + ASSERT_NE(page_index_reader, nullptr); + auto row_group_index_reader = page_index_reader->RowGroup(0); + ASSERT_NE(row_group_index_reader, nullptr); + auto offset_index = row_group_index_reader->GetOffsetIndex(0); + ASSERT_NE(offset_index, nullptr); + ASSERT_GT(offset_index->page_locations().size(), 1); + + std::vector<std::unique_ptr<parquet::ParquetColumnSchema>> file_schema; + auto schema_descriptor = parquet_file_reader->metadata()->schema(); + ASSERT_NE(schema_descriptor, nullptr); + ASSERT_TRUE(parquet::build_parquet_column_schema(*schema_descriptor, &file_schema).ok()); + ASSERT_EQ(file_schema.size(), 1); + ASSERT_EQ(file_schema[0]->children.size(), 2); + ASSERT_EQ(file_schema[0]->children[0]->name, "id"); + + reader::FileScanRequest request; + reader::FileColumnPredicateFilter column_filter; + column_filter.file_column_id = 0; + column_filter.file_child_id_path = {0}; + auto id_type = std::make_shared<DataTypeInt32>(); + column_filter.predicates.push_back(create_comparison_predicate<PredicateType::GT>( + 0, "id", id_type, Field::create_field<TYPE_INT>(63), false)); + request.column_predicate_filters.push_back(std::move(column_filter)); + + parquet::RowGroupScanPlan plan; + parquet::ParquetScanRange scan_range; + ASSERT_TRUE(parquet::plan_parquet_row_groups(*parquet_file_reader->metadata(), + parquet_file_reader.get(), file_schema, request, + scan_range, false, &plan) + .ok()); + ASSERT_EQ(plan.row_groups.size(), 1); + ASSERT_FALSE(plan.row_groups[0].selected_ranges.empty()); + EXPECT_GT(plan.row_groups[0].selected_ranges.front().start, 0); + EXPECT_LT(plan.row_groups[0].selected_ranges.front().length, 128); + EXPECT_EQ(plan.pruning_stats.total_row_groups, 1); + EXPECT_EQ(plan.pruning_stats.selected_row_groups, 1); + EXPECT_EQ(plan.pruning_stats.filtered_row_groups_by_page_index, 0); + EXPECT_GT(plan.pruning_stats.filtered_page_rows, 0); + EXPECT_EQ(plan.pruning_stats.selected_row_ranges, plan.row_groups[0].selected_ranges.size()); +} + TEST_F(NewParquetReaderTest, InPredicateFiltersRowGroupsByDictionary) { write_dictionary_filter_parquet_file(_file_path); auto reader = create_reader(); diff --git a/docs/complex-column-predicate-and-stats-filtering.md b/docs/complex-column-predicate-and-stats-filtering.md index 448149ddea4..70ac659608a 100644 --- a/docs/complex-column-predicate-and-stats-filtering.md +++ b/docs/complex-column-predicate-and-stats-filtering.md @@ -182,6 +182,7 @@ SELECT s.name FROM t WHERE s.id > 5; - `FileColumnPredicateFilter` 保留 `file_column_id`,新增 `file_child_id_path`。 - `file_child_id_path` 是 top-level file column 下的 file-local child field id path,不是 table id,也不是 ordinal。 - mapper 会从 AND 语义下的 `struct_element(...) op literal` / `literal op struct_element(...)` 构造 nested file-layer pruning hint。 +- mapper 会从 AND 语义下的 `struct_element(...) IN (...)` 构造 nested `IN_LIST` pruning hint。 - 不从 OR/NOT/任意函数子树中提取 pruning predicate,避免把非必要条件当成必需条件裁剪。 - literal 转换到 file leaf type 失败、path 解析失败、leaf 不是 primitive 时,不生成 pruning hint。 @@ -192,9 +193,9 @@ SELECT s.name FROM t WHERE s.id > 5; - `ParquetStatisticsUtils::ResolvePredicateLeafSchema()` 统一解析 top-level 或 nested target。 - 解析结果必须是 primitive leaf、`leaf_column_id >= 0` 且 `max_repetition_level == 0`。 - row group min/max statistics 使用 resolved leaf schema。 -- dictionary pruning 使用 resolved leaf schema 和 leaf `ColumnChunk`,仍保持 string-like、dictionary-encoded、EQ/IN_LIST 限制。 -- bloom filter 使用 resolved leaf schema,仍保持 supported primitive type、EQ/IN_LIST/null 相关限制。 -- page index 使用 resolved leaf schema,只允许 non-repeated primitive leaf;LIST/MAP/repeated leaf 直接跳过 page range pruning。 +- dictionary pruning 使用 resolved leaf schema 和 leaf `ColumnChunk`,仍保持 string-like、dictionary-encoded、EQ/IN_LIST 限制,并已有 nested struct 真实 parquet fixture 覆盖。 +- bloom filter 使用 resolved leaf schema,仍保持 supported primitive type、EQ/IN_LIST/null 相关限制;当前 Arrow writer 头文件没有稳定的 bloom 写入开关,因此先以 Arrow bloom adapter / pruning 逻辑单元测试覆盖。 +- page index 使用 resolved leaf schema,只允许 non-repeated primitive leaf;LIST/MAP/repeated leaf 直接跳过 page range pruning,并已有 nested struct 真实 parquet fixture 覆盖。 ## 6. 统计信息 / pruning 设计约束 @@ -257,9 +258,7 @@ page index 对 repeated leaf 的 row range 语义复杂。本轮只允许 non-re ## 7. 后续工作 -- 扩展 UT 覆盖 nested struct 多层 path、反向比较、OR 不提取、缺失 child 不下推。 -- 增加 nested string leaf dictionary pruning、nested page index pruning、nested bloom pruning 的真实 parquet fixture。 -- 支持从 `IN_PRED` 的 `struct_element(...) IN (...)` 构造 nested `IN_LIST` pruning hint。 +- 如果后续 Arrow writer 或外部 fixture 能稳定提供 bloom filter metadata,补 nested bloom pruning 的真实 parquet fixture。 - schema change 场景下,把 table nested path 到 file nested path 的 mapping 入口收敛到 mapper,不让 file reader 理解 table/global schema。 - LIST/MAP/repeated leaf 只有在 Dremel row semantics 和 row-range 语义明确后再接入 pruning。 --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
