This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 845d459f05 [Fix](orc-reader) Fix some bugs of orc lazy materialization. (#20410) 845d459f05 is described below commit 845d459f055f792c828c83f9334a9e49174b3ee4 Author: Qi Chen <kaka11.c...@gmail.com> AuthorDate: Fri Jun 9 08:53:01 2023 +0800 [Fix](orc-reader) Fix some bugs of orc lazy materialization. (#20410) Fix some bugs of orc lazy materialization (#18615) - Fix issue causing column size to continuously increase after `execute_conjuncts()` by calling `Block::erase_useless_column()`. - Fix partition issues of orc lazy materialization. - Fix lazy materialization not being used when the predicate column name is inconsistent with the column name in the orc file. --- be/src/vec/exec/format/orc/vorc_reader.cpp | 21 +++++++------ be/src/vec/exec/format/orc/vorc_reader.h | 1 + .../iceberg/iceberg_partition_upper_case.out | 36 +++++++++++++++++++--- .../iceberg/iceberg_partition_upper_case.groovy | 26 +++++++++++++--- 4 files changed, 67 insertions(+), 17 deletions(-) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 0ba81cf986..7d92a9f5e1 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -637,11 +637,7 @@ Status OrcReader::set_fill_columns( std::unordered_map<std::string, std::pair<uint32_t, int>> predicate_columns; std::function<void(VExpr * expr)> visit_slot = [&](VExpr* expr) { if (VSlotRef* slot_ref = typeid_cast<VSlotRef*>(expr)) { - auto expr_name = slot_ref->expr_name(); - auto iter = _col_name_to_file_col_name.find(expr_name); - if (iter != _col_name_to_file_col_name.end()) { - expr_name = iter->second; - } + auto& expr_name = slot_ref->expr_name(); predicate_columns.emplace(expr_name, std::make_pair(slot_ref->column_id(), slot_ref->slot_id())); if (slot_ref->column_id() == 0) { @@ -684,6 +680,8 @@ Status OrcReader::set_fill_columns( } else { 
_lazy_read_ctx.predicate_columns.first.emplace_back(iter->first); _lazy_read_ctx.predicate_columns.second.emplace_back(iter->second.second); + _lazy_read_ctx.predicate_orc_columns.emplace_back( + _col_name_to_file_col_name[iter->first]); _lazy_read_ctx.all_predicate_col_ids.emplace_back(iter->second.first); } } @@ -714,6 +712,10 @@ Status OrcReader::set_fill_columns( _lazy_read_ctx.can_lazy_read = true; } + if (_colname_to_value_range == nullptr || !_init_search_argument(_colname_to_value_range)) { + _lazy_read_ctx.can_lazy_read = false; + } + if (!_lazy_read_ctx.can_lazy_read) { for (auto& kv : _lazy_read_ctx.predicate_partition_columns) { _lazy_read_ctx.partition_columns.emplace(kv.first, kv.second); @@ -728,12 +730,9 @@ Status OrcReader::set_fill_columns( // create orc row reader _row_reader_options.range(_range_start_offset, _range_size); _row_reader_options.setTimezoneName(_ctz); - if (!_init_search_argument(_colname_to_value_range)) { - _lazy_read_ctx.can_lazy_read = false; - } _row_reader_options.include(_read_cols); if (_lazy_read_ctx.can_lazy_read) { - _row_reader_options.filter(_lazy_read_ctx.predicate_columns.first); + _row_reader_options.filter(_lazy_read_ctx.predicate_orc_columns); _orc_filter = std::unique_ptr<ORCFilterImpl>(new ORCFilterImpl(this)); } try { @@ -1209,6 +1208,8 @@ Status OrcReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { } *read_rows = rr; + RETURN_IF_ERROR(_fill_partition_columns(block, rr, _lazy_read_ctx.partition_columns)); + RETURN_IF_ERROR(_fill_missing_columns(block, rr, _lazy_read_ctx.missing_columns)); RETURN_IF_CATCH_EXCEPTION(Block::filter_block_internal(block, columns_to_filter, *_filter)); Block::erase_useless_column(block, column_to_keep); } else { @@ -1271,6 +1272,7 @@ void OrcReader::_fill_batch_vec(std::vector<orc::ColumnVectorBatch*>& result, Status OrcReader::filter(orc::ColumnVectorBatch& data, uint16_t* sel, uint16_t size, void* arg) { Block* block = (Block*)arg; + size_t origin_column_num = 
block->columns(); const auto& batch_vec = down_cast<orc::StructVectorBatch*>(&data)->fields; for (auto& col_name : _lazy_read_ctx.predicate_columns.first) { @@ -1318,6 +1320,7 @@ Status OrcReader::filter(orc::ColumnVectorBatch& data, uint16_t* sel, uint16_t s for (auto& col : _lazy_read_ctx.predicate_missing_columns) { block->get_by_name(col.first).column->assume_mutable()->clear(); } + Block::erase_useless_column(block, origin_column_num); } uint16_t new_size = 0; diff --git a/be/src/vec/exec/format/orc/vorc_reader.h b/be/src/vec/exec/format/orc/vorc_reader.h index 7b5f808ce8..13a2d7265c 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.h +++ b/be/src/vec/exec/format/orc/vorc_reader.h @@ -102,6 +102,7 @@ struct LazyReadContext { // be different with orc column name // std::pair<std::list<col_name>, std::vector<slot_id>> std::pair<std::list<std::string>, std::vector<int>> predicate_columns; + // predicate orc file column names std::list<std::string> predicate_orc_columns; std::vector<std::string> lazy_read_columns; std::unordered_map<std::string, std::tuple<std::string, const SlotDescriptor*>> diff --git a/regression-test/data/external_table_emr_p2/iceberg/iceberg_partition_upper_case.out b/regression-test/data/external_table_emr_p2/iceberg/iceberg_partition_upper_case.out index 9da2a5b80f..376a9495b0 100644 --- a/regression-test/data/external_table_emr_p2/iceberg/iceberg_partition_upper_case.out +++ b/regression-test/data/external_table_emr_p2/iceberg/iceberg_partition_upper_case.out @@ -26,31 +26,45 @@ Shanghai -- !orcupper5 -- 2 k2_2 k3_2 Beijing +-- !orcupper6 -- +1 k2_1 k3_1 Beijing + +-- !orcupper7 -- +1 k2_1 k3_1 Beijing +2 k2_2 k3_2 Beijing + -- !orclower1 -- 1 k2_1 k3_1 Beijing 2 k2_2 k3_2 Beijing 3 k2_3 k3_3 Shanghai 4 k2_4 k3_4 Shanghai --- !orclower1 -- +-- !orclower2 -- 1 Beijing 2 Beijing 3 Shanghai 4 Shanghai --- !orclower1 -- +-- !orclower3 -- 1 k2_1 2 k2_2 3 k2_3 4 k2_4 --- !orclower1 -- +-- !orclower4 -- Beijing Beijing Shanghai Shanghai --- 
!orclower1 -- +-- !orclower5 -- +2 k2_2 k3_2 Beijing + +-- !orclower6 -- +1 k2_1 k3_1 Beijing + +-- !orclower7 -- +1 k2_1 k3_1 Beijing 2 k2_2 k3_2 Beijing -- !parquetupper1 -- @@ -84,6 +98,13 @@ Shanghai 3 k2_3 k3_3 Shanghai 4 k2_4 k3_4 Shanghai +-- !parquetupper7 -- +1 k2_1 k3_1 Beijing + +-- !parquetupper8 -- +1 k2_1 k3_1 Beijing +2 k2_2 k3_2 Beijing + -- !parquetlower1 -- 1 k2_1 k3_1 Beijing 2 k2_2 k3_2 Beijing @@ -115,3 +136,10 @@ Shanghai 3 k2_3 k3_3 Shanghai 4 k2_4 k3_4 Shanghai +-- !parquetupper7 -- +1 k2_1 k3_1 Beijing + +-- !parquetupper8 -- +1 k2_1 k3_1 Beijing +2 k2_2 k3_2 Beijing + diff --git a/regression-test/suites/external_table_emr_p2/iceberg/iceberg_partition_upper_case.groovy b/regression-test/suites/external_table_emr_p2/iceberg/iceberg_partition_upper_case.groovy index efc2aaf77f..b4957495dc 100644 --- a/regression-test/suites/external_table_emr_p2/iceberg/iceberg_partition_upper_case.groovy +++ b/regression-test/suites/external_table_emr_p2/iceberg/iceberg_partition_upper_case.groovy @@ -21,12 +21,16 @@ suite("iceberg_partition_upper_case", "p2") { def orc_upper3 = """select k1, k2 from iceberg_partition_upper_case_orc order by k1;""" def orc_upper4 = """select city from iceberg_partition_upper_case_orc order by city;""" def orc_upper5 = """select * from iceberg_partition_upper_case_orc where k1>1 and city='Beijing' order by k1;""" + def orc_upper6 = """select * from iceberg_partition_upper_case_orc where k1=1 order by k1;""" + def orc_upper7 = """select * from iceberg_partition_upper_case_orc where k2 like '%k2%' and city like '%Bei%' order by k1;""" def orc_lower1 = """select * from iceberg_partition_lower_case_orc order by k1;""" def orc_lower2 = """select k1, city from iceberg_partition_lower_case_orc order by k1;""" def orc_lower3 = """select k1, k2 from iceberg_partition_lower_case_orc order by k1;""" def orc_lower4 = """select city from iceberg_partition_lower_case_orc order by city;""" def orc_lower5 = """select * from 
iceberg_partition_lower_case_orc where k1>1 and city='Beijing' order by k1;""" + def orc_lower6 = """select * from iceberg_partition_lower_case_orc where k1=1 order by k1;""" + def orc_lower7 = """select * from iceberg_partition_lower_case_orc where k2 like '%k2%' and city like '%Bei%' order by k1;""" def parquet_upper1 = """select * from iceberg_partition_upper_case_parquet order by k1;""" def parquet_upper2 = """select k1, city from iceberg_partition_upper_case_parquet order by k1;""" @@ -34,6 +38,8 @@ suite("iceberg_partition_upper_case", "p2") { def parquet_upper4 = """select city from iceberg_partition_upper_case_parquet order by city;""" def parquet_upper5 = """select * from iceberg_partition_upper_case_parquet where k1>1 and city='Beijing' order by k1;""" def parquet_upper6 = """select * from iceberg_partition_upper_case_parquet where substring(city, 6)='hai' order by k1;""" + def parquet_upper7 = """select * from iceberg_partition_upper_case_parquet where k1=1 order by k1;""" + def parquet_upper8 = """select * from iceberg_partition_upper_case_parquet where k2 like '%k2%' and city like '%Bei%' order by k1;""" def parquet_lower1 = """select * from iceberg_partition_lower_case_parquet order by k1;""" def parquet_lower2 = """select k1, city from iceberg_partition_lower_case_parquet order by k1;""" @@ -41,6 +47,8 @@ suite("iceberg_partition_upper_case", "p2") { def parquet_lower4 = """select city from iceberg_partition_lower_case_parquet order by city;""" def parquet_lower5 = """select * from iceberg_partition_lower_case_parquet where k1>1 and city='Beijing' order by k1;""" def parquet_lower6 = """select * from iceberg_partition_lower_case_parquet where substring(city, 6)='hai' order by k1;""" + def parquet_lower7 = """select * from iceberg_partition_lower_case_parquet where k1=1 order by k1;""" + def parquet_lower8 = """select * from iceberg_partition_lower_case_parquet where k2 like '%k2%' and city like '%Bei%' order by k1;""" String enabled = 
context.config.otherConfigs.get("enableExternalHiveTest") if (enabled != null && enabled.equalsIgnoreCase("true")) { @@ -63,23 +71,33 @@ suite("iceberg_partition_upper_case", "p2") { qt_orcupper3 orc_upper3 qt_orcupper4 orc_upper4 qt_orcupper5 orc_upper5 + qt_orcupper6 orc_upper6 + qt_orcupper7 orc_upper7 + qt_orclower1 orc_lower1 - qt_orclower1 orc_lower2 - qt_orclower1 orc_lower3 - qt_orclower1 orc_lower4 - qt_orclower1 orc_lower5 + qt_orclower2 orc_lower2 + qt_orclower3 orc_lower3 + qt_orclower4 orc_lower4 + qt_orclower5 orc_lower5 + qt_orclower6 orc_lower6 + qt_orclower7 orc_lower7 qt_parquetupper1 parquet_upper1 qt_parquetupper2 parquet_upper2 qt_parquetupper3 parquet_upper3 qt_parquetupper4 parquet_upper4 qt_parquetupper5 parquet_upper5 qt_parquetupper6 parquet_upper6 + qt_parquetupper7 parquet_upper7 + qt_parquetupper8 parquet_upper8 qt_parquetlower1 parquet_lower1 qt_parquetlower2 parquet_lower2 qt_parquetlower3 parquet_lower3 qt_parquetlower4 parquet_lower4 qt_parquetlower5 parquet_lower5 qt_parquetlower6 parquet_lower6 + qt_parquetupper7 parquet_upper7 + qt_parquetupper8 parquet_upper8 } } + --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org