This is an automated email from the ASF dual-hosted git repository.

michaelsmith pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 88cb9c19083ae8c2bc70d373a1a70384d476b9cd Author: Zoltan Borok-Nagy <[email protected]> AuthorDate: Thu Oct 17 13:41:46 2024 +0200 IMPALA-13463: Impala should ignore case of Iceberg schema elements Schema is case insensitive in Impala. Via Spark it's possible to create schema elements with upper/lower case letters and store them in the metadata JSON files of Iceberg, e.g.: "schemas" : [ { "type" : "struct", "schema-id" : 0, "fields" : [ { "id" : 1, "name" : "ID", "required" : false, "type" : "string" }, { "id" : 2, "name" : "OWNERID", "required" : false, "type" : "string" } ] } ], This can cause problems in Impala during predicate pushdown, as we can get a ValidationException from the Iceberg library (as Impala pushes down predicates with lower case column names, while Iceberg sees upper case names). With this patch Impala invokes Scan.caseSensitive(boolean caseSensitive) on the TableScan object to set case insensitivity. Testing: * added e2e test Change-Id: Iedaf152d8a0c02a124c3dcf8acb59b4ba4e81cf4 Reviewed-on: http://gerrit.cloudera.org:8080/21950 Tested-by: Impala Public Jenkins <[email protected]> Reviewed-by: Wenzhe Zhou <[email protected]> Reviewed-by: Daniel Becker <[email protected]> --- .../java/org/apache/impala/util/IcebergUtil.java | 9 +- ...e91c0129-f018b1d800000000_872469098_data.0.parq | Bin 0 -> 605 bytes ...97c4c65-c9fce43a00000000_1852333400_data.0.parq | Bin 0 -> 591 bytes .../1a457d69-768a-4bfd-8da5-c080d3b88e50-m0.avro | Bin 0 -> 6039 bytes .../96461a99-3b56-4573-ab6d-8b8ba3fbcae2-m0.avro | Bin 0 -> 6034 bytes ...667-1-96461a99-3b56-4573-ab6d-8b8ba3fbcae2.avro | Bin 0 -> 3875 bytes ...468-1-1a457d69-768a-4bfd-8da5-c080d3b88e50.avro | Bin 0 -> 3797 bytes .../metadata/v3.metadata.json | 127 +++++++++++++++++++++ .../metadata/version-hint.text | 1 + .../iceberg-column-case-sensitivity-issue.test | 20 ++++ tests/query_test/test_iceberg.py | 6 + 11 files changed, 161 insertions(+), 2 deletions(-) diff --git 
a/fe/src/main/java/org/apache/impala/util/IcebergUtil.java b/fe/src/main/java/org/apache/impala/util/IcebergUtil.java index 3bacdfd3a..5ad64f71e 100644 --- a/fe/src/main/java/org/apache/impala/util/IcebergUtil.java +++ b/fe/src/main/java/org/apache/impala/util/IcebergUtil.java @@ -655,7 +655,7 @@ public class IcebergUtil { private static TableScan createScanAsOf(FeIcebergTable table, TimeTravelSpec timeTravelSpec) { - TableScan scan = table.getIcebergApiTable().newScan(); + TableScan scan = newScan(table); if (timeTravelSpec == null) { scan = scan.useSnapshot(table.snapshotId()); } else { @@ -687,7 +687,7 @@ public class IcebergUtil { if (table.snapshotId() == -1) { return new GroupedContentFiles(CloseableIterable.empty()); } - TableScan scan = table.getIcebergApiTable().newScan(); + TableScan scan = newScan(table); scan = scan.useSnapshot(snapshotId); for (Expression predicate : predicates) { scan = scan.filter(predicate); @@ -699,6 +699,11 @@ public class IcebergUtil { } } + private static TableScan newScan(FeIcebergTable table) { + TableScan scan = table.getIcebergApiTable().newScan(); + return scan.caseSensitive(false); + } + /** * Use ContentFile path to generate 128-bit Murmur3 hash as map key, cached in memory */ diff --git a/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/data/id_bucket=3/5b4ef6d2e91c0129-f018b1d800000000_872469098_data.0.parq b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/data/id_bucket=3/5b4ef6d2e91c0129-f018b1d800000000_872469098_data.0.parq new file mode 100644 index 000000000..2b5988bf9 Binary files /dev/null and b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/data/id_bucket=3/5b4ef6d2e91c0129-f018b1d800000000_872469098_data.0.parq differ diff --git a/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/data/id_bucket=7/504c5f5ae97c4c65-c9fce43a00000000_1852333400_data.0.parq 
b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/data/id_bucket=7/504c5f5ae97c4c65-c9fce43a00000000_1852333400_data.0.parq new file mode 100644 index 000000000..f70cd0c84 Binary files /dev/null and b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/data/id_bucket=7/504c5f5ae97c4c65-c9fce43a00000000_1852333400_data.0.parq differ diff --git a/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/1a457d69-768a-4bfd-8da5-c080d3b88e50-m0.avro b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/1a457d69-768a-4bfd-8da5-c080d3b88e50-m0.avro new file mode 100644 index 000000000..a27606f92 Binary files /dev/null and b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/1a457d69-768a-4bfd-8da5-c080d3b88e50-m0.avro differ diff --git a/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/96461a99-3b56-4573-ab6d-8b8ba3fbcae2-m0.avro b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/96461a99-3b56-4573-ab6d-8b8ba3fbcae2-m0.avro new file mode 100644 index 000000000..704686951 Binary files /dev/null and b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/96461a99-3b56-4573-ab6d-8b8ba3fbcae2-m0.avro differ diff --git a/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/snap-1855055649619147667-1-96461a99-3b56-4573-ab6d-8b8ba3fbcae2.avro b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/snap-1855055649619147667-1-96461a99-3b56-4573-ab6d-8b8ba3fbcae2.avro new file mode 100644 index 000000000..a336f60d1 Binary files /dev/null and b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/snap-1855055649619147667-1-96461a99-3b56-4573-ab6d-8b8ba3fbcae2.avro differ diff --git a/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/snap-7743982156242154468-1-1a457d69-768a-4bfd-8da5-c080d3b88e50.avro 
b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/snap-7743982156242154468-1-1a457d69-768a-4bfd-8da5-c080d3b88e50.avro new file mode 100644 index 000000000..f5e8d3bd7 Binary files /dev/null and b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/snap-7743982156242154468-1-1a457d69-768a-4bfd-8da5-c080d3b88e50.avro differ diff --git a/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/v3.metadata.json b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/v3.metadata.json new file mode 100644 index 000000000..0095d2b7f --- /dev/null +++ b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/v3.metadata.json @@ -0,0 +1,127 @@ +{ + "format-version" : 1, + "table-uuid" : "6259114b-df40-4263-9375-4bd3102965d2", + "location" : "hdfs://localhost:20500/test-warehouse/iceberg_column_case_sensitivity_issue", + "last-updated-ms" : 1729164485534, + "last-column-id" : 2, + "schema" : { + "type" : "struct", + "schema-id" : 0, + "fields" : [ { + "id" : 1, + "name" : "ID", + "required" : false, + "type" : "string" + }, { + "id" : 2, + "name" : "OWNERID", + "required" : false, + "type" : "string" + } ] + }, + "current-schema-id" : 0, + "schemas" : [ { + "type" : "struct", + "schema-id" : 0, + "fields" : [ { + "id" : 1, + "name" : "ID", + "required" : false, + "type" : "string" + }, { + "id" : 2, + "name" : "OWNERID", + "required" : false, + "type" : "string" + } ] + } ], + "partition-spec" : [ { + "name" : "ID_bucket", + "transform" : "bucket[16]", + "source-id" : 1, + "field-id" : 1000 + } ], + "default-spec-id" : 0, + "partition-specs" : [ { + "spec-id" : 0, + "fields" : [ { + "name" : "ID_bucket", + "transform" : "bucket[16]", + "source-id" : 1, + "field-id" : 1000 + } ] + } ], + "last-partition-id" : 1000, + "default-sort-order-id" : 0, + "sort-orders" : [ { + "order-id" : 0, + "fields" : [ ] + } ], + "properties" : { + "engine.hive.enabled" : "true", + 
"OBJCAPABILITIES" : "EXTREAD,EXTWRITE", + "storage_handler" : "org.apache.iceberg.mr.hive.HiveIcebergStorageHandler", + "write.format.default" : "parquet", + "iceberg.catalog" : "hadoop.tables" + }, + "current-snapshot-id" : 1855055649619147667, + "refs" : { + "main" : { + "snapshot-id" : 1855055649619147667, + "type" : "branch" + } + }, + "snapshots" : [ { + "snapshot-id" : 7743982156242154468, + "timestamp-ms" : 1729164477675, + "summary" : { + "operation" : "append", + "added-data-files" : "1", + "added-records" : "1", + "added-files-size" : "605", + "changed-partition-count" : "1", + "total-records" : "1", + "total-files-size" : "605", + "total-data-files" : "1", + "total-delete-files" : "0", + "total-position-deletes" : "0", + "total-equality-deletes" : "0" + }, + "manifest-list" : "hdfs://localhost:20500/test-warehouse/iceberg_column_case_sensitivity_issue/metadata/snap-7743982156242154468-1-1a457d69-768a-4bfd-8da5-c080d3b88e50.avro", + "schema-id" : 0 + }, { + "snapshot-id" : 1855055649619147667, + "parent-snapshot-id" : 7743982156242154468, + "timestamp-ms" : 1729164485534, + "summary" : { + "operation" : "append", + "added-data-files" : "1", + "added-records" : "1", + "added-files-size" : "591", + "changed-partition-count" : "1", + "total-records" : "2", + "total-files-size" : "1196", + "total-data-files" : "2", + "total-delete-files" : "0", + "total-position-deletes" : "0", + "total-equality-deletes" : "0" + }, + "manifest-list" : "hdfs://localhost:20500/test-warehouse/iceberg_column_case_sensitivity_issue/metadata/snap-1855055649619147667-1-96461a99-3b56-4573-ab6d-8b8ba3fbcae2.avro", + "schema-id" : 0 + } ], + "statistics" : [ ], + "snapshot-log" : [ { + "timestamp-ms" : 1729164477675, + "snapshot-id" : 7743982156242154468 + }, { + "timestamp-ms" : 1729164485534, + "snapshot-id" : 1855055649619147667 + } ], + "metadata-log" : [ { + "timestamp-ms" : 1729164453998, + "metadata-file" : 
"hdfs://localhost:20500/test-warehouse/iceberg_column_case_sensitivity_issue/metadata/v1.metadata.json" + }, { + "timestamp-ms" : 1729164477675, + "metadata-file" : "hdfs://localhost:20500/test-warehouse/iceberg_column_case_sensitivity_issue/metadata/v2.metadata.json" + } ] +} diff --git a/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/version-hint.text b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/version-hint.text new file mode 100644 index 000000000..e440e5c84 --- /dev/null +++ b/testdata/data/iceberg_test/iceberg_column_case_sensitivity_issue/metadata/version-hint.text @@ -0,0 +1 @@ +3 \ No newline at end of file diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-column-case-sensitivity-issue.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-column-case-sensitivity-issue.test new file mode 100644 index 000000000..a1907b1ac --- /dev/null +++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-column-case-sensitivity-issue.test @@ -0,0 +1,20 @@ +==== +---- QUERY +select * from iceberg_column_case_sensitivity_issue; +---- RESULTS +'1','impala' +'2','hive' +---- TYPES +STRING, STRING +==== +---- QUERY +# Check that predicate pushdown works well +select * from iceberg_column_case_sensitivity_issue +where id = '1'; +---- RESULTS +'1','impala' +---- TYPES +STRING, STRING +---- RUNTIME_PROFILE +aggregation(SUM, NumRowGroups): 1 +==== diff --git a/tests/query_test/test_iceberg.py b/tests/query_test/test_iceberg.py index 663493369..0cc5e72a7 100644 --- a/tests/query_test/test_iceberg.py +++ b/tests/query_test/test_iceberg.py @@ -271,6 +271,12 @@ class TestIcebergTable(IcebergTestSuite): self.run_test_case('QueryTest/iceberg-migrated-table-field-id-resolution', vector, unique_database) + def test_column_case_sensitivity(self, vector, unique_database): + create_iceberg_table_from_directory(self.client, unique_database, + "iceberg_column_case_sensitivity_issue", 
"parquet") + self.run_test_case('QueryTest/iceberg-column-case-sensitivity-issue', + vector, unique_database) + @SkipIfFS.hive def test_migrated_table_field_id_resolution_complex(self, vector, unique_database): def get_table_loc(tbl_name):
