This is an automated email from the ASF dual-hosted git repository. boroknagyz pushed a commit to branch branch-4.4.0 in repository https://gitbox.apache.org/repos/asf/impala.git
commit 99ce967ba60666adff5dd74fd38a06fee7f2c521 Author: Zoltan Borok-Nagy <[email protected]> AuthorDate: Mon Apr 15 18:30:35 2024 +0200 IMPALA-13002: Iceberg V2 tables with Avro delete files aren't read properly If the Iceberg table has Avro delete files (e.g. by setting 'write.delete.format.default'='avro') then Impala won't be able to read the contents of the delete files properly. This is because the Avro schema is not set properly for the virtual delete table. Testing: * added e2e tests with position delete files of all kinds Change-Id: Iff13198991caf32c51cd9e0ace4454fd00216cf6 Reviewed-on: http://gerrit.cloudera.org:8080/21301 Tested-by: Impala Public Jenkins <[email protected]> Reviewed-by: Daniel Becker <[email protected]> Reviewed-by: Gabor Kaszab <[email protected]> --- .../apache/impala/catalog/IcebergDeleteTable.java | 5 + .../iceberg-mixed-format-position-deletes.test | 133 +++++++++++++++++++++ tests/query_test/test_iceberg.py | 5 + 3 files changed, 143 insertions(+) diff --git a/fe/src/main/java/org/apache/impala/catalog/IcebergDeleteTable.java b/fe/src/main/java/org/apache/impala/catalog/IcebergDeleteTable.java index ed47c7690..40f2c2c94 100644 --- a/fe/src/main/java/org/apache/impala/catalog/IcebergDeleteTable.java +++ b/fe/src/main/java/org/apache/impala/catalog/IcebergDeleteTable.java @@ -33,6 +33,7 @@ import org.apache.impala.thrift.TIcebergFileFormat; import org.apache.impala.thrift.TIcebergPartitionStats; import org.apache.impala.thrift.TTableDescriptor; import org.apache.impala.thrift.TTableStats; +import org.apache.impala.util.AvroSchemaConverter; /** * Base class for the virtual table implementations for Iceberg deletes, like position or @@ -83,6 +84,10 @@ public abstract class IcebergDeleteTable extends VirtualTable implements FeIcebe TTableDescriptor desc = baseTable_.toThriftDescriptor(tableId, referencedPartitions); desc.setColumnDescriptors(FeCatalogUtils.getTColumnDescriptors(this)); + if (desc.hdfsTable.isSetAvroSchema()) { + 
desc.hdfsTable.setAvroSchema(AvroSchemaConverter.convertColumns(getColumns(), + getFullName().replaceAll("-", "_")).toString()); + } return desc; } diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-mixed-format-position-deletes.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-mixed-format-position-deletes.test new file mode 100644 index 000000000..e1452a274 --- /dev/null +++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-mixed-format-position-deletes.test @@ -0,0 +1,133 @@ +==== +---- HIVE_QUERY +use $DATABASE; +CREATE TABLE ice_mixed_formats(i int, j int) +STORED BY ICEBERG +STORED AS PARQUET +TBLPROPERTIES ('format-version'='2'); +INSERT INTO ice_mixed_formats VALUES (1, 1); +DELETE FROM ice_mixed_formats WHERE i = 1; +ALTER TABLE ice_mixed_formats SET TBLPROPERTIES ('write.format.default'='orc'); +INSERT INTO ice_mixed_formats VALUES (2, 2); +DELETE FROM ice_mixed_formats WHERE i = 2; +INSERT INTO ice_mixed_formats VALUES (3, 3); +INSERT INTO ice_mixed_formats VALUES (10, 10); +ALTER TABLE ice_mixed_formats SET TBLPROPERTIES ('write.format.default'='avro'); +DELETE FROM ice_mixed_formats WHERE i = 3; +==== +---- QUERY +refresh ice_mixed_formats; +==== +---- QUERY +SHOW FILES IN ice_mixed_formats; +---- RESULTS: VERIFY_IS_SUBSET +row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_mixed_formats/data/.*-data-.*.parquet','.*B','','.*' +row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_mixed_formats/data/.*-delete-.*parquet','.*B','','.*' +row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_mixed_formats/data/.*-data-.*.orc','.*B','','.*' +row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_mixed_formats/data/.*-delete-.*orc','.*B','','.*' +row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_mixed_formats/data/.*-delete-.*.avro','.*B','','.*' +---- TYPES +STRING, STRING, STRING, STRING +==== +---- QUERY +select * from ice_mixed_formats; +---- RESULTS +10,10 +---- TYPES +INT,INT +==== +---- QUERY 
+select count(*) from ice_mixed_formats; +---- RESULTS +1 +---- TYPES +BIGINT +==== +---- QUERY +select * from ice_mixed_formats where i > 1; +---- RESULTS +10,10 +---- TYPES +INT,INT +==== +---- QUERY +select count(*) from ice_mixed_formats where i > 1; +---- RESULTS +1 +---- TYPES +BIGINT +==== +---- QUERY +select i, count(*) from ice_mixed_formats group by 1; +---- RESULTS +10,1 +---- TYPES +INT,BIGINT +==== +---- HIVE_QUERY +use $DATABASE; +CREATE TABLE ice_mixed_formats_partitioned(i int, j int) +PARTITIONED BY SPEC (truncate(2, j)) +STORED BY ICEBERG +STORED AS PARQUET +TBLPROPERTIES ('format-version'='2'); +INSERT INTO ice_mixed_formats_partitioned VALUES (1, 1); +DELETE FROM ice_mixed_formats_partitioned WHERE i = 1; +ALTER TABLE ice_mixed_formats_partitioned SET TBLPROPERTIES ('write.format.default'='orc'); +INSERT INTO ice_mixed_formats_partitioned VALUES (2, 2); +DELETE FROM ice_mixed_formats_partitioned WHERE i = 2; +INSERT INTO ice_mixed_formats_partitioned VALUES (3, 3); +INSERT INTO ice_mixed_formats_partitioned VALUES (10, 10); +ALTER TABLE ice_mixed_formats_partitioned SET TBLPROPERTIES ('write.format.default'='avro'); +DELETE FROM ice_mixed_formats_partitioned WHERE i = 3; +==== +---- QUERY +refresh ice_mixed_formats_partitioned; +==== +---- QUERY +SHOW FILES IN ice_mixed_formats_partitioned; +---- RESULTS: VERIFY_IS_SUBSET +row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_mixed_formats_partitioned/data/j_trunc=0/.*-data-.*.parquet','.*B','','.*' +row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_mixed_formats_partitioned/data/j_trunc=0/.*-delete-.*parquet','.*B','','.*' +row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_mixed_formats_partitioned/data/j_trunc=2/.*-data-.*.orc','.*B','','.*' +row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_mixed_formats_partitioned/data/j_trunc=2/.*-delete-.*orc','.*B','','.*' +row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_mixed_formats_partitioned/data/j_trunc=10/.*-data-.*.orc','.*B','','.*' 
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_mixed_formats_partitioned/data/j_trunc=2/.*-delete-.*.avro','.*B','','.*' +---- TYPES +STRING, STRING, STRING, STRING +==== +---- QUERY +select * from ice_mixed_formats_partitioned; +---- RESULTS +10,10 +---- TYPES +INT,INT +==== +---- QUERY +select count(*) from ice_mixed_formats_partitioned; +---- RESULTS +1 +---- TYPES +BIGINT +==== +---- QUERY +select * from ice_mixed_formats_partitioned where i > 1; +---- RESULTS +10,10 +---- TYPES +INT,INT +==== +---- QUERY +select count(*) from ice_mixed_formats_partitioned where i > 1; +---- RESULTS +1 +---- TYPES +BIGINT +==== +---- QUERY +select i, count(*) from ice_mixed_formats_partitioned group by 1; +---- RESULTS +10,1 +---- TYPES +INT,BIGINT +==== diff --git a/tests/query_test/test_iceberg.py b/tests/query_test/test_iceberg.py index 6c2d646f9..2e715b160 100644 --- a/tests/query_test/test_iceberg.py +++ b/tests/query_test/test_iceberg.py @@ -1463,6 +1463,11 @@ class TestIcebergV2Table(IcebergTestSuite): def test_read_position_deletes(self, vector): self.run_test_case('QueryTest/iceberg-v2-read-position-deletes', vector) + @SkipIfFS.hive + def test_read_mixed_format_position_deletes(self, vector, unique_database): + self.run_test_case('QueryTest/iceberg-mixed-format-position-deletes', + vector, unique_database) + @SkipIfDockerizedCluster.internal_hostname @SkipIf.hardcoded_uris def test_read_null_delete_records(self, vector):
