This is an automated email from the ASF dual-hosted git repository. csringhofer pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit a49ff618f1137808c75cf81cccdadb980e89c34d Author: Zoltan Borok-Nagy <[email protected]> AuthorDate: Tue Mar 11 14:01:08 2025 +0100 IMPALA-13853: Don't adjust Iceberg field IDs for data files that don't have complex types In migrated Iceberg tables we can have data files with missing field IDs. We assume that their schema corresponds to the table schema at the point when the table migration happened. This means during runtime we can generate the field ids. The logic is more complicated when there are complex types in the table and the table is partitioned. In such cases we need to do some adjustments during field ID generation, in which case we verify that the file schema corresponds to the table schema. These adjustments are not needed when the table doesn't have complex types, hence we can be a bit more relaxed and skip schema verification, because field ID generation for top-level columns is not affected. This means Impala would still be able to read the table if there were trivial schema changes before migration. With this change we allow all data files that have a schema compatible with the table schema, which was the case before IMPALA-13364. This behavior is also aligned with Hive. 
Testing: * e2e tests added for both Parquet and ORC files Change-Id: Ib1f1d0cf36792d0400de346c83e999fa50c0fa67 Reviewed-on: http://gerrit.cloudera.org:8080/22610 Reviewed-by: Daniel Becker <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- be/src/exec/orc/orc-metadata-utils.cc | 5 ++- be/src/exec/parquet/parquet-metadata-utils.cc | 2 +- ...erg-migrated-table-field-id-resolution-orc.test | 51 ++++++++++++++++++++++ ...iceberg-migrated-table-field-id-resolution.test | 40 ++++++++++++++++- tests/query_test/test_iceberg.py | 3 ++ 5 files changed, 98 insertions(+), 3 deletions(-) diff --git a/be/src/exec/orc/orc-metadata-utils.cc b/be/src/exec/orc/orc-metadata-utils.cc index 9297acf8c..73c0c5ac6 100644 --- a/be/src/exec/orc/orc-metadata-utils.cc +++ b/be/src/exec/orc/orc-metadata-utils.cc @@ -348,7 +348,10 @@ Status OrcSchemaResolver::GenerateFieldIDs() { // Push children in reverse order to the stack so they are processed in the original // order - nodes.push(current->getSubtype(size - i - 1)); + const orc::Type* reverseOrderChild = current->getSubtype(size - i - 1); + if (reverseOrderChild->getSubtypeCount() > 0) { + nodes.push(reverseOrderChild); + } } if (current == root_ && !nodes.empty()) { diff --git a/be/src/exec/parquet/parquet-metadata-utils.cc b/be/src/exec/parquet/parquet-metadata-utils.cc index db6bedfc2..2cb1c3bad 100644 --- a/be/src/exec/parquet/parquet-metadata-utils.cc +++ b/be/src/exec/parquet/parquet-metadata-utils.cc @@ -1014,7 +1014,7 @@ Status ParquetSchemaResolver::GenerateFieldIDs() { DCHECK(current_child.children.size() == 1); nodes.push(¤t_child.children[0]); - } else { + } else if (current_child.children.size() > 0) { nodes.push(¤t_child); } } diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-table-field-id-resolution-orc.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-table-field-id-resolution-orc.test new file mode 100644 index 000000000..a6e46f436 --- 
/dev/null +++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-table-field-id-resolution-orc.test @@ -0,0 +1,51 @@ +==== +---- QUERY +# Regression test for IMPALA-13853. We should still be able to read ORC file +# if starting columns match the schema. +create table mig_add_column_orc(i int) partitioned by (p int) stored as orc; +==== +---- HIVE_QUERY +insert into $DATABASE.mig_add_column_orc partition(p=2) values (1); +==== +---- QUERY +alter table mig_add_column_orc add column j int; +alter table mig_add_column_orc convert to iceberg; +select * from mig_add_column_orc; +---- RESULTS +1,NULL,2 +---- TYPES +INT,INT,INT +==== +---- QUERY +# Regression test for IMPALA-13853. We should still be able to read ORC file +# if starting columns match the schema. +create table mig_drop_column_orc(i int, j int) partitioned by (p int) stored as orc; +==== +---- HIVE_QUERY +insert into $DATABASE.mig_drop_column_orc partition(p=2) values (1, 100); +==== +---- QUERY +alter table mig_drop_column_orc drop column j; +alter table mig_drop_column_orc convert to iceberg; +select * from mig_drop_column_orc; +---- RESULTS +1,2 +---- TYPES +INT,INT +==== +---- QUERY +# Regression test for IMPALA-13853. 
We should reject ORC files that have mis-matching +# schema +create table mig_wrong_type_orc(i int, s string) partitioned by (p int) stored as orc; +==== +---- HIVE_QUERY +insert into $DATABASE.mig_wrong_type_orc partition(p=2) values (1, 'IMPALA'); +==== +---- QUERY +alter table mig_wrong_type_orc drop column s; +alter table mig_wrong_type_orc add column j int; +alter table mig_wrong_type_orc convert to iceberg; +select * from mig_wrong_type_orc; +---- CATCH +table column INT is map to column string in ORC file +==== diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-table-field-id-resolution.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-table-field-id-resolution.test index 42cc98936..c419d1157 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-table-field-id-resolution.test +++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-table-field-id-resolution.test @@ -205,4 +205,42 @@ select my_map_4.key, my_map_4.value from iceberg_migrated_complex_test_orc, iceb 5,6 ---- TYPES INT, INT -==== \ No newline at end of file +==== +---- QUERY +# Regression test for IMPALA-13853. We should still be able to read Parquet file +# if starting columns match the schema. +create table mig_add_column(i int) partitioned by (p int) stored as parquet; +insert into mig_add_column partition(p) values (1, 2); +alter table mig_add_column add column j int; +alter table mig_add_column convert to iceberg; +select * from mig_add_column; +---- RESULTS +1,NULL,2 +---- TYPES +INT,INT,INT +==== +---- QUERY +# Regression test for IMPALA-13853. We should still be able to read Parquet file +# if starting columns match the schema. 
+create table mig_drop_column(i int, j int) partitioned by (p int) stored as parquet; +insert into mig_drop_column partition(p) values (1, 100, 2); +alter table mig_drop_column drop column j; +alter table mig_drop_column convert to iceberg; +select * from mig_drop_column; +---- RESULTS +1,2 +---- TYPES +INT,INT +==== +---- QUERY +# Regression test for IMPALA-13853. We should reject Parquet files that have mis-matching +# schema +create table mig_wrong_type(i int, s string) partitioned by (p int) stored as parquet; +insert into mig_wrong_type partition(p) values (1, 'IMPALA', 2); +alter table mig_wrong_type drop column s; +alter table mig_wrong_type add column j int; +alter table mig_wrong_type convert to iceberg; +select * from mig_wrong_type; +---- CATCH +has an incompatible Parquet schema for column +==== diff --git a/tests/query_test/test_iceberg.py b/tests/query_test/test_iceberg.py index 36b4909b8..10c0ae5d5 100644 --- a/tests/query_test/test_iceberg.py +++ b/tests/query_test/test_iceberg.py @@ -269,6 +269,9 @@ class TestIcebergTable(IcebergTestSuite): "iceberg_migrated_complex_test_orc", "orc") self.run_test_case('QueryTest/iceberg-migrated-table-field-id-resolution', vector, unique_database) + if IS_HDFS: + self.run_test_case('QueryTest/iceberg-migrated-table-field-id-resolution-orc', + vector, unique_database) def test_column_case_sensitivity(self, vector, unique_database): create_iceberg_table_from_directory(self.client, unique_database,
