This is an automated email from the ASF dual-hosted git repository. csringhofer pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit a49ff618f1137808c75cf81cccdadb980e89c34d Author: Zoltan Borok-Nagy <[email protected]> AuthorDate: Tue Mar 11 14:01:08 2025 +0100 IMPALA-13853: Don't adjust Iceberg field IDs for data files that don't have complex types In migrated Iceberg tables we can have data files with missing field IDs. We assume that their schema corresponds to the table schema at the point when the table migration happened. This means during runtime we can generate the field ids. The logic is more complicated when there are complex types in the table and the table is partitioned. In such cases we need to do some adjustments during field ID generation, in which case we verify that the file schema corresponds to the table schema. These adjustments are not needed when the table doesn't have complex types, hence we can be a bit more relaxed and skip schema verification, because field ID generation for top-level columns is not affected. This means Impala would still be able to read the table if there were trivial schema changes before migration. With this change we allow all data files that have a schema compatible with the table schema, which was the case before IMPALA-13364. This behavior is also aligned with Hive. 
Testing: * e2e tests added for both Parquet and ORC files Change-Id: Ib1f1d0cf36792d0400de346c83e999fa50c0fa67 Reviewed-on: http://gerrit.cloudera.org:8080/22610 Reviewed-by: Daniel Becker <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- be/src/exec/orc/orc-metadata-utils.cc | 5 ++- be/src/exec/parquet/parquet-metadata-utils.cc | 2 +- ...erg-migrated-table-field-id-resolution-orc.test | 51 ++++++++++++++++++++++ ...iceberg-migrated-table-field-id-resolution.test | 40 ++++++++++++++++- tests/query_test/test_iceberg.py | 3 ++ 5 files changed, 98 insertions(+), 3 deletions(-) diff --git a/be/src/exec/orc/orc-metadata-utils.cc b/be/src/exec/orc/orc-metadata-utils.cc index 9297acf8c..73c0c5ac6 100644 --- a/be/src/exec/orc/orc-metadata-utils.cc +++ b/be/src/exec/orc/orc-metadata-utils.cc @@ -348,7 +348,10 @@ Status OrcSchemaResolver::GenerateFieldIDs() { // Push children in reverse order to the stack so they are processed in the original // order - nodes.push(current->getSubtype(size - i - 1)); + const orc::Type* reverseOrderChild = current->getSubtype(size - i - 1); + if (reverseOrderChild->getSubtypeCount() > 0) { + nodes.push(reverseOrderChild); + } } if (current == root_ && !nodes.empty()) { diff --git a/be/src/exec/parquet/parquet-metadata-utils.cc b/be/src/exec/parquet/parquet-metadata-utils.cc index db6bedfc2..2cb1c3bad 100644 --- a/be/src/exec/parquet/parquet-metadata-utils.cc +++ b/be/src/exec/parquet/parquet-metadata-utils.cc @@ -1014,7 +1014,7 @@ Status ParquetSchemaResolver::GenerateFieldIDs() { DCHECK(current_child.children.size() == 1); nodes.push(¤t_child.children[0]); - } else { + } else if (current_child.children.size() > 0) { nodes.push(¤t_child); } } diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-table-field-id-resolution-orc.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-table-field-id-resolution-orc.test new file mode 100644 index 000000000..a6e46f436 --- 
/dev/null +++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-table-field-id-resolution-orc.test @@ -0,0 +1,51 @@ +==== +---- QUERY +# Regression test for IMPALA-13853. We should still be able to read ORC file +# if starting columns match the schema. +create table mig_add_column_orc(i int) partitioned by (p int) stored as orc; +==== +---- HIVE_QUERY +insert into $DATABASE.mig_add_column_orc partition(p=2) values (1); +==== +---- QUERY +alter table mig_add_column_orc add column j int; +alter table mig_add_column_orc convert to iceberg; +select * from mig_add_column_orc; +---- RESULTS +1,NULL,2 +---- TYPES +INT,INT,INT +==== +---- QUERY +# Regression test for IMPALA-13853. We should still be able to read ORC file +# if starting columns match the schema. +create table mig_drop_column_orc(i int, j int) partitioned by (p int) stored as orc; +==== +---- HIVE_QUERY +insert into $DATABASE.mig_drop_column_orc partition(p=2) values (1, 100); +==== +---- QUERY +alter table mig_drop_column_orc drop column j; +alter table mig_drop_column_orc convert to iceberg; +select * from mig_drop_column_orc; +---- RESULTS +1,2 +---- TYPES +INT,INT +==== +---- QUERY +# Regression test for IMPALA-13853. 
We should reject ORC files that have mis-matching +# schema +create table mig_wrong_type_orc(i int, s string) partitioned by (p int) stored as orc; +==== +---- HIVE_QUERY +insert into $DATABASE.mig_wrong_type_orc partition(p=2) values (1, 'IMPALA'); +==== +---- QUERY +alter table mig_wrong_type_orc drop column s; +alter table mig_wrong_type_orc add column j int; +alter table mig_wrong_type_orc convert to iceberg; +select * from mig_wrong_type_orc; +---- CATCH +table column INT is map to column string in ORC file +==== diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-table-field-id-resolution.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-table-field-id-resolution.test index 42cc98936..c419d1157 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-table-field-id-resolution.test +++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-table-field-id-resolution.test @@ -205,4 +205,42 @@ select my_map_4.key, my_map_4.value from iceberg_migrated_complex_test_orc, iceb 5,6 ---- TYPES INT, INT -==== \ No newline at end of file +==== +---- QUERY +# Regression test for IMPALA-13853. We should still be able to read Parquet file +# if starting columns match the schema. +create table mig_add_column(i int) partitioned by (p int) stored as parquet; +insert into mig_add_column partition(p) values (1, 2); +alter table mig_add_column add column j int; +alter table mig_add_column convert to iceberg; +select * from mig_add_column; +---- RESULTS +1,NULL,2 +---- TYPES +INT,INT,INT +==== +---- QUERY +# Regression test for IMPALA-13853. We should still be able to read Parquet file +# if starting columns match the schema. 
+create table mig_drop_column(i int, j int) partitioned by (p int) stored as parquet; +insert into mig_drop_column partition(p) values (1, 100, 2); +alter table mig_drop_column drop column j; +alter table mig_drop_column convert to iceberg; +select * from mig_drop_column; +---- RESULTS +1,2 +---- TYPES +INT,INT +==== +---- QUERY +# Regression test for IMPALA-13853. We should reject Parquet files that have mis-matching +# schema +create table mig_wrong_type(i int, s string) partitioned by (p int) stored as parquet; +insert into mig_wrong_type partition(p) values (1, 'IMPALA', 2); +alter table mig_wrong_type drop column s; +alter table mig_wrong_type add column j int; +alter table mig_wrong_type convert to iceberg; +select * from mig_wrong_type; +---- CATCH +has an incompatible Parquet schema for column +==== diff --git a/tests/query_test/test_iceberg.py b/tests/query_test/test_iceberg.py index 36b4909b8..10c0ae5d5 100644 --- a/tests/query_test/test_iceberg.py +++ b/tests/query_test/test_iceberg.py @@ -269,6 +269,9 @@ class TestIcebergTable(IcebergTestSuite): "iceberg_migrated_complex_test_orc", "orc") self.run_test_case('QueryTest/iceberg-migrated-table-field-id-resolution', vector, unique_database) + if IS_HDFS: + self.run_test_case('QueryTest/iceberg-migrated-table-field-id-resolution-orc', + vector, unique_database) def test_column_case_sensitivity(self, vector, unique_database): create_iceberg_table_from_directory(self.client, unique_database,
