This is an automated email from the ASF dual-hosted git repository. dbecker pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 5c415545ea37e66104bc62b3ae6a6a276c416109 Author: Zoltan Borok-Nagy <[email protected]> AuthorDate: Wed May 21 16:04:20 2025 +0200 IMPALA-14023: Fix test_scan_metrics_in_profile in non-HDFS builds test_scan_metrics_in_profile was querying pre-written Iceberg V2 tables. The position delete files of such tables contain hard-coded URIs of data files, i.e. URIs that start with "hdfs://localhost...". Therefore the test only passed in HDFS builds. This patch splits the test into two parts: * test_scan_metrics_in_profile_basic: it runs on all storage systems because it only queries Iceberg tables that don't have delete files. * test_scan_metrics_in_profile_with_deletes: it queries Iceberg tables that have delete files, therefore it is only executed on HDFS. Change-Id: I80a7c6469a7f56b58254e1327a05ef7b3dc9c9ff Reviewed-on: http://gerrit.cloudera.org:8080/22931 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- .../QueryTest/iceberg-scan-metrics-basic.test | 29 +++++++++++ ...test => iceberg-scan-metrics-with-deletes.test} | 28 ---------- tests/query_test/test_iceberg.py | 59 ++++++++++++---------- 3 files changed, 61 insertions(+), 55 deletions(-) diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-scan-metrics-basic.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-scan-metrics-basic.test new file mode 100644 index 000000000..55155cdc5 --- /dev/null +++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-scan-metrics-basic.test @@ -0,0 +1,29 @@ +==== +---- QUERY +select * from functional_parquet.iceberg_partitioned +---- RUNTIME_PROFILE +Iceberg Plan Metrics for Node 00: + Planning done without Iceberg: no Iceberg scan metrics available. +==== +---- QUERY +# Filtering on a partition column pushes the predicate down to Iceberg, so we have metrics. 
+select * from functional_parquet.iceberg_partitioned where action='download' +---- RUNTIME_PROFILE +Iceberg Plan Metrics for Node 00: +row_regex:.*total-planning-duration: .+ +result-data-files: 6 +result-delete-files: 0 +total-data-manifests: 1 +total-delete-manifests: 0 +scanned-data-manifests: 1 +skipped-data-manifests: 0 +row_regex:.*total-file-size-in-bytes: .+ \(\d+\) +row_regex:.*total-delete-file-size-in-bytes: .+ \(\d+\) +skipped-data-files: 14 +skipped-delete-files: 0 +scanned-delete-manifests: 0 +skipped-delete-manifests: 0 +indexed-delete-files: 0 +equality-delete-files: 0 +positional-delete-files: 0 +==== diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-scan-metrics.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-scan-metrics-with-deletes.test similarity index 80% rename from testdata/workloads/functional-query/queries/QueryTest/iceberg-scan-metrics.test rename to testdata/workloads/functional-query/queries/QueryTest/iceberg-scan-metrics-with-deletes.test index 472e51777..aed019c21 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-scan-metrics.test +++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-scan-metrics-with-deletes.test @@ -1,33 +1,5 @@ ==== ---- QUERY -select * from functional_parquet.iceberg_partitioned ----- RUNTIME_PROFILE -Iceberg Plan Metrics for Node 00: - Planning done without Iceberg: no Iceberg scan metrics available. -==== ----- QUERY -# Filtering on a partition column pushes the predicate down to Iceberg, so we have metrics. 
-select * from functional_parquet.iceberg_partitioned where action='download' ----- RUNTIME_PROFILE -Iceberg Plan Metrics for Node 00: -row_regex:.*total-planning-duration: .+ -result-data-files: 6 -result-delete-files: 0 -total-data-manifests: 1 -total-delete-manifests: 0 -scanned-data-manifests: 1 -skipped-data-manifests: 0 -row_regex:.*total-file-size-in-bytes: .+ \(\d+\) -row_regex:.*total-delete-file-size-in-bytes: .+ \(\d+\) -skipped-data-files: 14 -skipped-delete-files: 0 -scanned-delete-manifests: 0 -skipped-delete-manifests: 0 -indexed-delete-files: 0 -equality-delete-files: 0 -positional-delete-files: 0 -==== ----- QUERY # Time travel results in planning with Iceberg, so we have metrics. select * from functional_parquet.iceberg_v2_no_deletes FOR SYSTEM_VERSION AS OF NO_DELETES_SNAPTHOT_ID, diff --git a/tests/query_test/test_iceberg.py b/tests/query_test/test_iceberg.py index bed02055d..0bcbf3a94 100644 --- a/tests/query_test/test_iceberg.py +++ b/tests/query_test/test_iceberg.py @@ -1493,33 +1493,8 @@ class TestIcebergTable(IcebergTestSuite): assert snapshots[0].get_parent_id() == snapshots[2].get_parent_id() assert snapshots[0].get_creation_time() < snapshots[2].get_creation_time() - def test_scan_metrics_in_profile(self, vector): - def get_latest_snapshot_id(fq_tbl_name): - query = ("select snapshot_id from {}.snapshots order by committed_at desc" - .format(fq_tbl_name)) - res = self.execute_query(query) - return res.data[0] - - ice_db = "functional_parquet" - - no_deletes = "{}.{}".format(ice_db, "iceberg_v2_no_deletes") - no_deletes_snapshot_id = get_latest_snapshot_id(no_deletes) - - pos_delete_all_rows = "{}.{}".format(ice_db, "iceberg_v2_positional_delete_all_rows") - pos_delete_all_rows_snapshot_id = get_latest_snapshot_id(pos_delete_all_rows) - - not_all_data_files_have_delete_files = "{}.{}".format( - ice_db, "iceberg_v2_positional_not_all_data_files_have_delete_files") - not_all_data_files_have_delete_files_snapshot_id = get_latest_snapshot_id( 
- not_all_data_files_have_delete_files) - - self.run_test_case('QueryTest/iceberg-scan-metrics', vector, - test_file_vars={ - "NO_DELETES_SNAPTHOT_ID": no_deletes_snapshot_id, - "POS_DELETE_ALL_ROWS_SNAPSHOT_ID": pos_delete_all_rows_snapshot_id, - "NOT_ALL_DATA_FILES_HAVE_DELETE_FILES_SNAPSHOT_ID": - not_all_data_files_have_delete_files_snapshot_id - }) + def test_scan_metrics_in_profile_basic(self, vector): + self.run_test_case('QueryTest/iceberg-scan-metrics-basic', vector) class TestIcebergV2Table(IcebergTestSuite): @@ -1611,6 +1586,36 @@ class TestIcebergV2Table(IcebergTestSuite): self.run_test_case('QueryTest/iceberg-tablesample-v2', vector, use_db="functional_parquet") + @SkipIfDockerizedCluster.internal_hostname + @SkipIf.hardcoded_uris + def test_scan_metrics_in_profile_with_deletes(self, vector): + def get_latest_snapshot_id(fq_tbl_name): + query = ("select snapshot_id from {}.snapshots order by committed_at desc" + .format(fq_tbl_name)) + res = self.execute_query(query) + return res.data[0] + + ice_db = "functional_parquet" + + no_deletes = "{}.{}".format(ice_db, "iceberg_v2_no_deletes") + no_deletes_snapshot_id = get_latest_snapshot_id(no_deletes) + + pos_delete_all_rows = "{}.{}".format(ice_db, "iceberg_v2_positional_delete_all_rows") + pos_delete_all_rows_snapshot_id = get_latest_snapshot_id(pos_delete_all_rows) + + not_all_data_files_have_delete_files = "{}.{}".format( + ice_db, "iceberg_v2_positional_not_all_data_files_have_delete_files") + not_all_data_files_have_delete_files_snapshot_id = get_latest_snapshot_id( + not_all_data_files_have_delete_files) + + self.run_test_case('QueryTest/iceberg-scan-metrics-with-deletes', vector, + test_file_vars={ + "NO_DELETES_SNAPTHOT_ID": no_deletes_snapshot_id, + "POS_DELETE_ALL_ROWS_SNAPSHOT_ID": pos_delete_all_rows_snapshot_id, + "NOT_ALL_DATA_FILES_HAVE_DELETE_FILES_SNAPSHOT_ID": + not_all_data_files_have_delete_files_snapshot_id + }) + @SkipIf.hardcoded_uris def test_metadata_tables(self, vector, 
unique_database): # Remove 'batch_size' option so we can set it at .test file.
