This is an automated email from the ASF dual-hosted git repository.
michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new fafcd6006 IMPALA-13471: test_enable_reading_puffin() seems to fail in
the Ozone build
fafcd6006 is described below
commit fafcd600611e2b6d36327e781e2be0857b1bba98
Author: Daniel Becker <[email protected]>
AuthorDate: Thu Oct 24 15:00:48 2024 +0200
IMPALA-13471: test_enable_reading_puffin() seems to fail in the Ozone build
The tests
TestIcebergWithPuffinStatsStartupFlag::test_[dis|en]able_reading_puffin
queried an Iceberg table that is created during normal dataload from
existing non-filesystem-specific metadata and data files. Therefore the
path of the Puffin stats file that is present in the metadata.json file
does not contain any filesystem-specific prefix, and Puffin reading does
not work on Ozone for such unprefixed paths.
Note that reading Puffin stats for tables that are created normally does
work on Ozone.
This change modifies the tests to create the table on the fly, modifying
the file path to include the filesystem-specific prefix.
Change-Id: I7afec1c70d7b43bae98289d65749b01ca720e7f7
Reviewed-on: http://gerrit.cloudera.org:8080/22008
Reviewed-by: Impala Public Jenkins <[email protected]>
Tested-by: Impala Public Jenkins <[email protected]>
---
testdata/data/README | 11 ++++++
...e84073-d6306ef200000000_1823051976_data.0.parq} | Bin 604 -> 604 bytes
...7is-8d3be062-6b30-45ea-b26e-6b706c50f8e3.stats} | Bin 534 -> 534 bytes
...> 989a6a68-ca7b-4339-adc2-cb5ba2656f0c-m0.avro} | Bin 3220 -> 3208 bytes
...08-1-989a6a68-ca7b-4339-adc2-cb5ba2656f0c.avro} | Bin 1997 -> 1985 bytes
.../metadata/v1.metadata.json | 6 ++--
.../metadata/v2.metadata.json | 28 +++++++--------
.../metadata/v3.metadata.json | 40 ++++++++++-----------
.../functional/functional_schema_template.sql | 13 -------
.../datasets/functional/schema_constraints.csv | 1 -
tests/custom_cluster/test_iceberg_with_puffin.py | 16 +++++----
tests/util/iceberg_metadata_util.py | 10 ++++++
12 files changed, 68 insertions(+), 57 deletions(-)
diff --git a/testdata/data/README b/testdata/data/README
index 61eedb10e..e4a05f268 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -1045,6 +1045,17 @@ tblproperties ('format-version'='2');
insert into iceberg_v2_positional_update_all_rows values (1,'a'), (2,'b'),
(3,'c')
update iceberg_v2_positional_update_all_rows set s = upper(s);
+iceberg_with_puffin_stats:
+Created similarly to iceberg_v2_no_deletes.
+With Impala:
+ create table iceberg_with_puffin_stats(i INT, d DECIMAL(9, 0)) stored as
iceberg;
+ insert into iceberg_with_puffin_stats values (1, 1), (2, 2);
+With Trino:
+ use iceberg.default;
+ analyze iceberg_with_puffin_stats;
+And then converted the table with 'convert_to_iceberg.sh' and
'avro_iceberg_convert.sh'
+described under the section of 'iceberg_v2_no_deletes.'.
+
iceberg_test/hadoop_catalog/ice/iceberg_multiple_storage_locations*:
- 'iceberg_test/hadoop_catalog/ice/iceberg_multiple_storage_locations'
- 'iceberg_test/hadoop_catalog/ice/iceberg_multiple_storage_locations_data'
diff --git
a/testdata/data/iceberg_test/iceberg_with_puffin_stats/data/0747babcda9277bf-954aff1b00000000_1684663509_data.0.parq
b/testdata/data/iceberg_test/iceberg_with_puffin_stats/data/524c361b0ae84073-d6306ef200000000_1823051976_data.0.parq
similarity index 87%
rename from
testdata/data/iceberg_test/iceberg_with_puffin_stats/data/0747babcda9277bf-954aff1b00000000_1684663509_data.0.parq
rename to
testdata/data/iceberg_test/iceberg_with_puffin_stats/data/524c361b0ae84073-d6306ef200000000_1823051976_data.0.parq
index f243f30d5..5ddbf56e8 100644
Binary files
a/testdata/data/iceberg_test/iceberg_with_puffin_stats/data/0747babcda9277bf-954aff1b00000000_1684663509_data.0.parq
and
b/testdata/data/iceberg_test/iceberg_with_puffin_stats/data/524c361b0ae84073-d6306ef200000000_1823051976_data.0.parq
differ
diff --git
a/testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/20240906_085606_00006_wsfgs-4d9242d5-bd79-4069-be8b-2cfced8e0647.stats
b/testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/20241028_142058_00006_dn7is-8d3be062-6b30-45ea-b26e-6b706c50f8e3.stats
similarity index 59%
rename from
testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/20240906_085606_00006_wsfgs-4d9242d5-bd79-4069-be8b-2cfced8e0647.stats
rename to
testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/20241028_142058_00006_dn7is-8d3be062-6b30-45ea-b26e-6b706c50f8e3.stats
index 98e388477..7ca51d65b 100644
Binary files
a/testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/20240906_085606_00006_wsfgs-4d9242d5-bd79-4069-be8b-2cfced8e0647.stats
and
b/testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/20241028_142058_00006_dn7is-8d3be062-6b30-45ea-b26e-6b706c50f8e3.stats
differ
diff --git
a/testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/11cd04ec-55ea-40aa-a89b-197c3c275e7a-m0.avro
b/testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/989a6a68-ca7b-4339-adc2-cb5ba2656f0c-m0.avro
similarity index 91%
rename from
testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/11cd04ec-55ea-40aa-a89b-197c3c275e7a-m0.avro
rename to
testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/989a6a68-ca7b-4339-adc2-cb5ba2656f0c-m0.avro
index 6921b7290..85b7f2b4c 100644
Binary files
a/testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/11cd04ec-55ea-40aa-a89b-197c3c275e7a-m0.avro
and
b/testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/989a6a68-ca7b-4339-adc2-cb5ba2656f0c-m0.avro
differ
diff --git
a/testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/snap-1880359224532128423-1-11cd04ec-55ea-40aa-a89b-197c3c275e7a.avro
b/testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/snap-8614787612208071008-1-989a6a68-ca7b-4339-adc2-cb5ba2656f0c.avro
similarity index 89%
rename from
testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/snap-1880359224532128423-1-11cd04ec-55ea-40aa-a89b-197c3c275e7a.avro
rename to
testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/snap-8614787612208071008-1-989a6a68-ca7b-4339-adc2-cb5ba2656f0c.avro
index 555d8e73b..2a34f1839 100644
Binary files
a/testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/snap-1880359224532128423-1-11cd04ec-55ea-40aa-a89b-197c3c275e7a.avro
and
b/testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/snap-8614787612208071008-1-989a6a68-ca7b-4339-adc2-cb5ba2656f0c.avro
differ
diff --git
a/testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/v1.metadata.json
b/testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/v1.metadata.json
index fb1d1ca73..d2c54f770 100644
---
a/testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/v1.metadata.json
+++
b/testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/v1.metadata.json
@@ -1,8 +1,8 @@
{
"format-version" : 1,
- "table-uuid" : "1804053c-6ba0-48df-8d22-f1bd139a3635",
- "location" : "/test-warehouse/iceberg_test/iceberg_with_puffin_stats",
- "last-updated-ms" : 1725612488531,
+ "table-uuid" : "8368b5fe-0726-4010-869d-cbbdfb5d511b",
+ "location" : "/test-warehouse/iceberg_with_puffin_stats",
+ "last-updated-ms" : 1730124581193,
"last-column-id" : 2,
"schema" : {
"type" : "struct",
diff --git
a/testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/v2.metadata.json
b/testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/v2.metadata.json
index 32a167382..fd855590d 100644
---
a/testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/v2.metadata.json
+++
b/testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/v2.metadata.json
@@ -1,8 +1,8 @@
{
"format-version" : 1,
- "table-uuid" : "1804053c-6ba0-48df-8d22-f1bd139a3635",
- "location" : "/test-warehouse/iceberg_test/iceberg_with_puffin_stats",
- "last-updated-ms" : 1725612500133,
+ "table-uuid" : "8368b5fe-0726-4010-869d-cbbdfb5d511b",
+ "location" : "/test-warehouse/iceberg_with_puffin_stats",
+ "last-updated-ms" : 1730125063894,
"last-column-id" : 2,
"schema" : {
"type" : "struct",
@@ -49,24 +49,24 @@
} ],
"properties" : {
"engine.hive.enabled" : "true",
- "impala.events.catalogServiceId" : "a6b8da59a60e425a:ad0468614a61e340",
+ "impala.events.catalogServiceId" : "58de66debfc6442a:9e804e3402019553",
"external.table.purge" : "TRUE",
- "impala.events.catalogVersion" : "4097",
+ "impala.events.catalogVersion" : "845",
"write.format.default" : "parquet",
"hive.metastore.table.owner" : "danielbecker",
"OBJCAPABILITIES" : "EXTREAD,EXTWRITE",
"storage_handler" : "org.apache.iceberg.mr.hive.HiveIcebergStorageHandler"
},
- "current-snapshot-id" : 1880359224532128423,
+ "current-snapshot-id" : 8614787612208071008,
"refs" : {
"main" : {
- "snapshot-id" : 1880359224532128423,
+ "snapshot-id" : 8614787612208071008,
"type" : "branch"
}
},
"snapshots" : [ {
- "snapshot-id" : 1880359224532128423,
- "timestamp-ms" : 1725612500115,
+ "snapshot-id" : 8614787612208071008,
+ "timestamp-ms" : 1730125063881,
"summary" : {
"operation" : "append",
"added-data-files" : "1",
@@ -80,16 +80,16 @@
"total-position-deletes" : "0",
"total-equality-deletes" : "0"
},
- "manifest-list" :
"/test-warehouse/iceberg_test/iceberg_with_puffin_stats/metadata/snap-1880359224532128423-1-11cd04ec-55ea-40aa-a89b-197c3c275e7a.avro",
+ "manifest-list" :
"/test-warehouse/iceberg_with_puffin_stats/metadata/snap-8614787612208071008-1-989a6a68-ca7b-4339-adc2-cb5ba2656f0c.avro",
"schema-id" : 0
} ],
"statistics" : [ ],
"snapshot-log" : [ {
- "timestamp-ms" : 1725612500115,
- "snapshot-id" : 1880359224532128423
+ "timestamp-ms" : 1730125063881,
+ "snapshot-id" : 8614787612208071008
} ],
"metadata-log" : [ {
- "timestamp-ms" : 1725612488531,
- "metadata-file" :
"/test-warehouse/iceberg_test/iceberg_with_puffin_stats/metadata/00000-c1fdc549-6895-4b22-987b-b1c229294749.metadata.json"
+ "timestamp-ms" : 1730124581193,
+ "metadata-file" :
"/test-warehouse/iceberg_with_puffin_stats/metadata/00000-e9800290-7a3b-45bb-b0e0-5747e1d43167.metadata.json"
} ]
}
\ No newline at end of file
diff --git
a/testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/v3.metadata.json
b/testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/v3.metadata.json
index 1cf15ea77..7be76e452 100644
---
a/testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/v3.metadata.json
+++
b/testdata/data/iceberg_test/iceberg_with_puffin_stats/metadata/v3.metadata.json
@@ -1,8 +1,8 @@
{
"format-version" : 1,
- "table-uuid" : "1804053c-6ba0-48df-8d22-f1bd139a3635",
- "location" : "/test-warehouse/iceberg_test/iceberg_with_puffin_stats",
- "last-updated-ms" : 1725612966926,
+ "table-uuid" : "8368b5fe-0726-4010-869d-cbbdfb5d511b",
+ "location" : "/test-warehouse/iceberg_with_puffin_stats",
+ "last-updated-ms" : 1730125258880,
"last-column-id" : 2,
"schema" : {
"type" : "struct",
@@ -49,24 +49,24 @@
} ],
"properties" : {
"engine.hive.enabled" : "true",
- "impala.events.catalogServiceId" : "a6b8da59a60e425a:ad0468614a61e340",
+ "impala.events.catalogServiceId" : "58de66debfc6442a:9e804e3402019553",
"external.table.purge" : "TRUE",
- "impala.events.catalogVersion" : "4097",
+ "impala.events.catalogVersion" : "845",
"write.format.default" : "parquet",
"hive.metastore.table.owner" : "danielbecker",
"OBJCAPABILITIES" : "EXTREAD,EXTWRITE",
"storage_handler" : "org.apache.iceberg.mr.hive.HiveIcebergStorageHandler"
},
- "current-snapshot-id" : 1880359224532128423,
+ "current-snapshot-id" : 8614787612208071008,
"refs" : {
"main" : {
- "snapshot-id" : 1880359224532128423,
+ "snapshot-id" : 8614787612208071008,
"type" : "branch"
}
},
"snapshots" : [ {
- "snapshot-id" : 1880359224532128423,
- "timestamp-ms" : 1725612500115,
+ "snapshot-id" : 8614787612208071008,
+ "timestamp-ms" : 1730125063881,
"summary" : {
"operation" : "append",
"added-data-files" : "1",
@@ -80,17 +80,17 @@
"total-position-deletes" : "0",
"total-equality-deletes" : "0"
},
- "manifest-list" :
"/test-warehouse/iceberg_test/iceberg_with_puffin_stats/metadata/snap-1880359224532128423-1-11cd04ec-55ea-40aa-a89b-197c3c275e7a.avro",
+ "manifest-list" :
"/test-warehouse/iceberg_with_puffin_stats/metadata/snap-8614787612208071008-1-989a6a68-ca7b-4339-adc2-cb5ba2656f0c.avro",
"schema-id" : 0
} ],
"statistics" : [ {
- "snapshot-id" : 1880359224532128423,
- "statistics-path" :
"/test-warehouse/iceberg_test/iceberg_with_puffin_stats/metadata/20240906_085606_00006_wsfgs-4d9242d5-bd79-4069-be8b-2cfced8e0647.stats",
+ "snapshot-id" : 8614787612208071008,
+ "statistics-path" :
"/test-warehouse/iceberg_with_puffin_stats/metadata/20241028_142058_00006_dn7is-8d3be062-6b30-45ea-b26e-6b706c50f8e3.stats",
"file-size-in-bytes" : 534,
"file-footer-size-in-bytes" : 440,
"blob-metadata" : [ {
"type" : "apache-datasketches-theta-v1",
- "snapshot-id" : 1880359224532128423,
+ "snapshot-id" : 8614787612208071008,
"sequence-number" : 0,
"fields" : [ 1 ],
"properties" : {
@@ -98,7 +98,7 @@
}
}, {
"type" : "apache-datasketches-theta-v1",
- "snapshot-id" : 1880359224532128423,
+ "snapshot-id" : 8614787612208071008,
"sequence-number" : 0,
"fields" : [ 2 ],
"properties" : {
@@ -108,14 +108,14 @@
} ],
"partition-statistics" : [ ],
"snapshot-log" : [ {
- "timestamp-ms" : 1725612500115,
- "snapshot-id" : 1880359224532128423
+ "timestamp-ms" : 1730125063881,
+ "snapshot-id" : 8614787612208071008
} ],
"metadata-log" : [ {
- "timestamp-ms" : 1725612488531,
- "metadata-file" :
"/test-warehouse/iceberg_test/iceberg_with_puffin_stats/metadata/00000-c1fdc549-6895-4b22-987b-b1c229294749.metadata.json"
+ "timestamp-ms" : 1730124581193,
+ "metadata-file" :
"/test-warehouse/iceberg_with_puffin_stats/metadata/00000-e9800290-7a3b-45bb-b0e0-5747e1d43167.metadata.json"
}, {
- "timestamp-ms" : 1725612500133,
- "metadata-file" :
"/test-warehouse/iceberg_test/iceberg_with_puffin_stats/metadata/00001-1765f110-5212-4aee-9667-3d571582e15a.metadata.json"
+ "timestamp-ms" : 1730125063894,
+ "metadata-file" :
"/test-warehouse/iceberg_with_puffin_stats/metadata/00001-9e4ccd47-9dc9-480a-8ff6-425dae1fc2df.metadata.json"
} ]
}
\ No newline at end of file
diff --git a/testdata/datasets/functional/functional_schema_template.sql
b/testdata/datasets/functional/functional_schema_template.sql
index a02807c50..641045ea7 100644
--- a/testdata/datasets/functional/functional_schema_template.sql
+++ b/testdata/datasets/functional/functional_schema_template.sql
@@ -3234,19 +3234,6 @@ hadoop fs -put -f
${IMPALA_HOME}/testdata/data/iceberg_test/iceberg_non_partitio
---- DATASET
functional
---- BASE_TABLE_NAME
-iceberg_with_puffin_stats
----- CREATE
-CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name}
-STORED AS ICEBERG
-LOCATION '/test-warehouse/iceberg_test/iceberg_with_puffin_stats'
-TBLPROPERTIES('write.format.default'='parquet',
'iceberg.catalog'='hadoop.tables');
----- DEPENDENT_LOAD
-`hadoop fs -mkdir -p /test-warehouse/iceberg_test && \
-hadoop fs -put -f
${IMPALA_HOME}/testdata/data/iceberg_test/iceberg_with_puffin_stats
/test-warehouse/iceberg_test/
-====
----- DATASET
-functional
----- BASE_TABLE_NAME
hadoop_catalog_test_external
---- CREATE
CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name}
diff --git a/testdata/datasets/functional/schema_constraints.csv
b/testdata/datasets/functional/schema_constraints.csv
index 3dc03d87a..a3d279df3 100644
--- a/testdata/datasets/functional/schema_constraints.csv
+++ b/testdata/datasets/functional/schema_constraints.csv
@@ -70,7 +70,6 @@ table_name:complextypestbl_iceberg_orc,
constraint:restrict_to, table_format:par
table_name:hadoop_catalog_test_external, constraint:restrict_to,
table_format:parquet/none/none
table_name:iceberg_int_partitioned, constraint:restrict_to,
table_format:parquet/none/none
table_name:iceberg_non_partitioned, constraint:restrict_to,
table_format:parquet/none/none
-table_name:iceberg_with_puffin_stats, constraint:restrict_to,
table_format:parquet/none/none
table_name:iceberg_partitioned, constraint:restrict_to,
table_format:parquet/none/none
table_name:iceberg_partitioned_orc_external, constraint:restrict_to,
table_format:parquet/none/none
table_name:iceberg_partition_transforms_zorder, constraint:restrict_to,
table_format:parquet/none/none
diff --git a/tests/custom_cluster/test_iceberg_with_puffin.py
b/tests/custom_cluster/test_iceberg_with_puffin.py
index 459d1ec90..f0e12123e 100644
--- a/tests/custom_cluster/test_iceberg_with_puffin.py
+++ b/tests/custom_cluster/test_iceberg_with_puffin.py
@@ -27,6 +27,7 @@ import sys
import tempfile
from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
+from tests.common.file_utils import create_iceberg_table_from_directory
class TestIcebergWithPuffinStatsStartupFlag(CustomClusterTestSuite):
@@ -40,18 +41,21 @@ class
TestIcebergWithPuffinStatsStartupFlag(CustomClusterTestSuite):
@CustomClusterTestSuite.with_args(
catalogd_args='--disable_reading_puffin_stats=true')
@pytest.mark.execute_serially
- def test_disable_reading_puffin(self):
- self._read_ndv_stats_expect_result([-1, -1])
+ def test_disable_reading_puffin(self, unique_database):
+ self._read_ndv_stats_expect_result(unique_database, [-1, -1])
@CustomClusterTestSuite.with_args(
catalogd_args='--disable_reading_puffin_stats=false')
@pytest.mark.execute_serially
def test_enable_reading_puffin(self, unique_database):
- self._read_ndv_stats_expect_result([2, 2])
+ self._read_ndv_stats_expect_result(unique_database, [2, 2])
- def _read_ndv_stats_expect_result(self, expected_ndv_stats):
- tbl_name = "functional_parquet.iceberg_with_puffin_stats"
- show_col_stats_stmt = "show column stats {}".format(tbl_name)
+ def _read_ndv_stats_expect_result(self, unique_database, expected_ndv_stats):
+ tbl_name = "iceberg_with_puffin_stats"
+ create_iceberg_table_from_directory(self.client, unique_database,
tbl_name, "parquet")
+
+ full_tbl_name = "{}.{}".format(unique_database, tbl_name)
+ show_col_stats_stmt = "show column stats {}".format(full_tbl_name)
query_result = self.execute_query(show_col_stats_stmt)
rows = query_result.get_data().split("\n")
diff --git a/tests/util/iceberg_metadata_util.py
b/tests/util/iceberg_metadata_util.py
index f9935b227..a97b14d90 100644
--- a/tests/util/iceberg_metadata_util.py
+++ b/tests/util/iceberg_metadata_util.py
@@ -60,6 +60,10 @@ def rewrite_metadata(prefix, unique_database, metadata_dir):
metadata['metadata-log'] = \
list(map(partial(add_prefix_to_mlog, table_params),
metadata['metadata-log']))
+ if 'statistics' in metadata:
+ metadata['statistics'] = list(map(
+ partial(add_prefix_to_statistics, table_params),
metadata['statistics']))
+
with open(mfile + '.tmp', 'w') as f:
json.dump(metadata, f, indent=2)
os.rename(mfile + '.tmp', mfile)
@@ -137,6 +141,12 @@ def add_prefix_to_mlog(table_params, metadata_log):
return metadata_log
+def add_prefix_to_statistics(table_params, statistics):
+ statistics['statistics-path'] = generate_new_path(
+ table_params, statistics['statistics-path'])
+ return statistics
+
+
def add_prefix_to_snapshot_entry(table_params, entry):
if 'manifest_path' in entry:
entry['manifest_path'] = generate_new_path(table_params,
entry['manifest_path'])