This is an automated email from the ASF dual-hosted git repository. stigahuang pushed a commit to branch branch-4.1.1 in repository https://gitbox.apache.org/repos/asf/impala.git
commit 44dc157a2c10578b82518012aa2e9aa9288dc6e5 Author: ttttttz <[email protected]> AuthorDate: Wed Jun 22 11:53:28 2022 +0800 IMPALA-11344: Missing slots in all cases should be allowed to be read When selecting only the missing fields of ORC files and the missing fields contain non-partition fields, the query will fail due to `Parse error in possibly corrupt ORC file: '$filename'. No columns found for this scan`. We should allow reading missing slots in all cases. Testing: - Added a test to test_scanners.py that ensures the query can be executed successfully when selecting only the missing fields of ORC files. Change-Id: I15dca47ba5f7a93bfd5fcba3cab4ac6d64459023 Reviewed-on: http://gerrit.cloudera.org:8080/18652 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> Reviewed-on: http://gerrit.cloudera.org:8080/18907 Reviewed-by: Zoltan Borok-Nagy <[email protected]> --- be/src/exec/orc-column-readers.cc | 23 +-------------- tests/query_test/test_scanners.py | 60 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 22 deletions(-) diff --git a/be/src/exec/orc-column-readers.cc b/be/src/exec/orc-column-readers.cc index 644ac325f..7c9ae072d 100644 --- a/be/src/exec/orc-column-readers.cc +++ b/be/src/exec/orc-column-readers.cc @@ -530,28 +530,7 @@ Status OrcStructReader::TopLevelReadValueBatch(ScratchTupleBatch* scratch_batch, } int num_rows_read = scratch_batch->num_tuples - scratch_batch_idx; if (children_.empty()) { - // We allow empty 'children_' for original files, because we might select the - // synthetic 'rowid' field which is not present in original files. - // We also allow empty 'children_' when we need to validate row batches of a zero slot - // scan. In that case 'children_' is empty and only 'row_validator_' owns an ORC - // vector batch (the write id batch). 
- bool valid_empty_children = scanner_->acid_original_file_ || - (scanner_->row_batches_need_validation_ && - scanner_->scan_node_->IsZeroSlotTableScan()); - if (!valid_empty_children) { - bool only_partitions = true; - for (SlotDescriptor* slot : tuple_desc_->slots()) { - if (!scanner_->IsPartitionKeySlot(slot)) { - only_partitions = false; - break; - } - } - if (!only_partitions) { - return Status(Substitute("Parse error in possibly corrupt ORC file: '$0'. " - "No columns found for this scan.", - scanner_->filename())); - } - } + // We allow empty 'children_' in all cases. DCHECK_EQ(0, num_rows_read); num_rows_read = std::min(scratch_batch->capacity - scratch_batch->num_tuples, NumElements() - row_idx_); diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py index 112b53dc0..ca15d43a2 100644 --- a/tests/query_test/test_scanners.py +++ b/tests/query_test/test_scanners.py @@ -1734,6 +1734,66 @@ class TestOrc(ImpalaTestSuite): self.run_test_case('QueryTest/hive2-pre-gregorian-date-orc', vector, unique_database) + @SkipIfABFS.hive + @SkipIfADLS.hive + @SkipIfIsilon.hive + @SkipIfLocal.hive + @SkipIfS3.hive + @SkipIfGCS.hive + @SkipIfCOS.hive + def test_missing_field_orc(self, unique_database): + # Test scanning orc files with missing fields in file meta. 
+ orc_tbl_name = unique_database + ".missing_field_orc" + self.client.execute("create table %s (f0 int) stored as orc" % orc_tbl_name) + self.run_stmt_in_hive("insert into table %s select 1" % orc_tbl_name) + self.client.execute("refresh %s" % orc_tbl_name) + + self.client.execute("alter table %s add columns(f1 int)" % orc_tbl_name) + result = self.client.execute("select f1 from %s " % orc_tbl_name) + assert result.data == ['NULL'] + + self.client.execute("alter table %s add columns(f2 STRUCT<s0:STRING, s1:STRING>)" + % orc_tbl_name) + result = self.client.execute("select f2.s0 from %s " % orc_tbl_name) + assert result.data == ['NULL'] + + orc_tbl_name = unique_database + ".missing_field_full_txn_test" + self.client.execute("create table %s(f0 int) stored as orc " + "tblproperties('transactional'='true')" % orc_tbl_name) + self.run_stmt_in_hive("insert into %s values(0)" % orc_tbl_name) + self.run_stmt_in_hive("alter table %s add columns(f1 int)" % orc_tbl_name) + self.run_stmt_in_hive("insert into %s values(1,1)" % orc_tbl_name) + self.client.execute("refresh %s" % orc_tbl_name) + result = self.client.execute("select f1 from %s" % orc_tbl_name) + assert len(result.data) == 2 + assert '1' in result.data + assert 'NULL' in result.data + + # TODO: add a test case for Iceberg tables once IMPALA-10542 is done. 
+ # orc_tbl_name = unique_database + ".missing_field_iceberg_test" + # self.client.execute("create table %s (f0 int) stored as iceberg " + # "tblproperties('write.format.default' = 'orc')" + # % orc_tbl_name) + # self.run_stmt_in_hive("insert into %s values(0)" % orc_tbl_name) + # self.run_stmt_in_hive("alter table %s add columns(f1 int)" % orc_tbl_name) + # self.run_stmt_in_hive("insert into %s values(1,1)" % orc_tbl_name) + # self.client.execute("refresh %s" % orc_tbl_name) + # result = self.client.execute("select f1 from %s" % orc_tbl_name) + # assert len(result.data) == 2 + # assert '1' in result.data + # assert 'NULL' in result.data + + orc_tbl_name = unique_database + ".lineitem_orc_ext" + test_file = "/test-warehouse/tpch.lineitem_orc_def" + create_sql = "create external table %s like tpch_orc_def.lineitem " \ + "location '%s'" % (orc_tbl_name, test_file) + self.client.execute(create_sql) + self.client.execute("alter table %s add columns (new_col int)" % orc_tbl_name) + result = self.execute_query("select count(*) from %s where new_col is null" + % orc_tbl_name) + assert len(result.data) == 1 + assert '6001215' in result.data + class TestScannerReservation(ImpalaTestSuite): @classmethod
