(impala) 05/05: IMPALA-14023: Fix test_scan_metrics_in_profile in non-HDFS builds

dbecker Wed, 21 May 2025 23:28:04 -0700

This is an automated email from the ASF dual-hosted git repository.

dbecker pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


commit 5c415545ea37e66104bc62b3ae6a6a276c416109
Author: Zoltan Borok-Nagy <[email protected]>
AuthorDate: Wed May 21 16:04:20 2025 +0200

    IMPALA-14023: Fix test_scan_metrics_in_profile in non-HDFS builds
    
    test_scan_metrics_in_profile was querying pre-written Iceberg V2
    tables. The position delete files of such tables contain hard-coded
    URIs of data files, i.e. URIs that start with "hdfs://localhost...".
    Therefore the test only worked well in HDFS builds.
    
    This patch splits the test into two parts:
    
    * test_scan_metrics_in_profile_basic: runs on all storage systems
      because it only queries Iceberg tables that have no delete files.
    * test_scan_metrics_in_profile_with_deletes: uses Iceberg tables
      that have delete files, therefore it is only executed on HDFS.
    
    Change-Id: I80a7c6469a7f56b58254e1327a05ef7b3dc9c9ff
    Reviewed-on: http://gerrit.cloudera.org:8080/22931
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 .../QueryTest/iceberg-scan-metrics-basic.test      | 29 +++++++++++
 ...test => iceberg-scan-metrics-with-deletes.test} | 28 ----------
 tests/query_test/test_iceberg.py                   | 59 ++++++++++++----------
 3 files changed, 61 insertions(+), 55 deletions(-)

diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-scan-metrics-basic.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-scan-metrics-basic.test
new file mode 100644
index 000000000..55155cdc5
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-scan-metrics-basic.test
@@ -0,0 +1,29 @@
+====
+---- QUERY
+select * from functional_parquet.iceberg_partitioned
+---- RUNTIME_PROFILE
+Iceberg Plan Metrics for Node 00:
+        Planning done without Iceberg: no Iceberg scan metrics available.
+====
+---- QUERY
+# Filtering on a partition column pushes the predicate down to Iceberg, so we have metrics.
+select * from functional_parquet.iceberg_partitioned where action='download'
+---- RUNTIME_PROFILE
+Iceberg Plan Metrics for Node 00:
+row_regex:.*total-planning-duration: .+
+result-data-files: 6
+result-delete-files: 0
+total-data-manifests: 1
+total-delete-manifests: 0
+scanned-data-manifests: 1
+skipped-data-manifests: 0
+row_regex:.*total-file-size-in-bytes: .+ \(\d+\)
+row_regex:.*total-delete-file-size-in-bytes: .+ \(\d+\)
+skipped-data-files: 14
+skipped-delete-files: 0
+scanned-delete-manifests: 0
+skipped-delete-manifests: 0
+indexed-delete-files: 0
+equality-delete-files: 0
+positional-delete-files: 0
+====
diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-scan-metrics.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-scan-metrics-with-deletes.test
similarity index 80%
rename from testdata/workloads/functional-query/queries/QueryTest/iceberg-scan-metrics.test
rename to testdata/workloads/functional-query/queries/QueryTest/iceberg-scan-metrics-with-deletes.test
index 472e51777..aed019c21 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-scan-metrics.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-scan-metrics-with-deletes.test
@@ -1,33 +1,5 @@
 ====
 ---- QUERY
-select * from functional_parquet.iceberg_partitioned
----- RUNTIME_PROFILE
-Iceberg Plan Metrics for Node 00:
-        Planning done without Iceberg: no Iceberg scan metrics available.
-====
----- QUERY
-# Filtering on a partition column pushes the predicate down to Iceberg, so we have metrics.
-select * from functional_parquet.iceberg_partitioned where action='download'
----- RUNTIME_PROFILE
-Iceberg Plan Metrics for Node 00:
-row_regex:.*total-planning-duration: .+
-result-data-files: 6
-result-delete-files: 0
-total-data-manifests: 1
-total-delete-manifests: 0
-scanned-data-manifests: 1
-skipped-data-manifests: 0
-row_regex:.*total-file-size-in-bytes: .+ \(\d+\)
-row_regex:.*total-delete-file-size-in-bytes: .+ \(\d+\)
-skipped-data-files: 14
-skipped-delete-files: 0
-scanned-delete-manifests: 0
-skipped-delete-manifests: 0
-indexed-delete-files: 0
-equality-delete-files: 0
-positional-delete-files: 0
-====
----- QUERY
 # Time travel results in planning with Iceberg, so we have metrics.
 select * from
   functional_parquet.iceberg_v2_no_deletes FOR SYSTEM_VERSION AS OF NO_DELETES_SNAPTHOT_ID,
diff --git a/tests/query_test/test_iceberg.py b/tests/query_test/test_iceberg.py
index bed02055d..0bcbf3a94 100644
--- a/tests/query_test/test_iceberg.py
+++ b/tests/query_test/test_iceberg.py
@@ -1493,33 +1493,8 @@ class TestIcebergTable(IcebergTestSuite):
     assert snapshots[0].get_parent_id() == snapshots[2].get_parent_id()
     assert snapshots[0].get_creation_time() < snapshots[2].get_creation_time()
 
-  def test_scan_metrics_in_profile(self, vector):
-    def get_latest_snapshot_id(fq_tbl_name):
-        query = ("select snapshot_id from {}.snapshots order by committed_at desc"
-            .format(fq_tbl_name))
-        res = self.execute_query(query)
-        return res.data[0]
-
-    ice_db = "functional_parquet"
-
-    no_deletes = "{}.{}".format(ice_db, "iceberg_v2_no_deletes")
-    no_deletes_snapshot_id = get_latest_snapshot_id(no_deletes)
-
-    pos_delete_all_rows = "{}.{}".format(ice_db, "iceberg_v2_positional_delete_all_rows")
-    pos_delete_all_rows_snapshot_id = get_latest_snapshot_id(pos_delete_all_rows)
-
-    not_all_data_files_have_delete_files = "{}.{}".format(
-        ice_db, "iceberg_v2_positional_not_all_data_files_have_delete_files")
-    not_all_data_files_have_delete_files_snapshot_id = get_latest_snapshot_id(
-        not_all_data_files_have_delete_files)
-
-    self.run_test_case('QueryTest/iceberg-scan-metrics', vector,
-        test_file_vars={
-            "NO_DELETES_SNAPTHOT_ID": no_deletes_snapshot_id,
-            "POS_DELETE_ALL_ROWS_SNAPSHOT_ID": pos_delete_all_rows_snapshot_id,
-            "NOT_ALL_DATA_FILES_HAVE_DELETE_FILES_SNAPSHOT_ID":
-                 not_all_data_files_have_delete_files_snapshot_id
-                       })
+  def test_scan_metrics_in_profile_basic(self, vector):
+    self.run_test_case('QueryTest/iceberg-scan-metrics-basic', vector)
 
 
 class TestIcebergV2Table(IcebergTestSuite):
@@ -1611,6 +1586,36 @@ class TestIcebergV2Table(IcebergTestSuite):
     self.run_test_case('QueryTest/iceberg-tablesample-v2', vector,
         use_db="functional_parquet")
 
+  @SkipIfDockerizedCluster.internal_hostname
+  @SkipIf.hardcoded_uris
+  def test_scan_metrics_in_profile_with_deletes(self, vector):
+    def get_latest_snapshot_id(fq_tbl_name):
+        query = ("select snapshot_id from {}.snapshots order by committed_at desc"
+            .format(fq_tbl_name))
+        res = self.execute_query(query)
+        return res.data[0]
+
+    ice_db = "functional_parquet"
+
+    no_deletes = "{}.{}".format(ice_db, "iceberg_v2_no_deletes")
+    no_deletes_snapshot_id = get_latest_snapshot_id(no_deletes)
+
+    pos_delete_all_rows = "{}.{}".format(ice_db, "iceberg_v2_positional_delete_all_rows")
+    pos_delete_all_rows_snapshot_id = get_latest_snapshot_id(pos_delete_all_rows)
+
+    not_all_data_files_have_delete_files = "{}.{}".format(
+        ice_db, "iceberg_v2_positional_not_all_data_files_have_delete_files")
+    not_all_data_files_have_delete_files_snapshot_id = get_latest_snapshot_id(
+        not_all_data_files_have_delete_files)
+
+    self.run_test_case('QueryTest/iceberg-scan-metrics-with-deletes', vector,
+        test_file_vars={
+            "NO_DELETES_SNAPTHOT_ID": no_deletes_snapshot_id,
+            "POS_DELETE_ALL_ROWS_SNAPSHOT_ID": pos_delete_all_rows_snapshot_id,
+            "NOT_ALL_DATA_FILES_HAVE_DELETE_FILES_SNAPSHOT_ID":
+            not_all_data_files_have_delete_files_snapshot_id
+        })
+
   @SkipIf.hardcoded_uris
   def test_metadata_tables(self, vector, unique_database):
     # Remove 'batch_size' option so we can set it at .test file.

(impala) 05/05: IMPALA-14023: Fix test_scan_metrics_in_profile in non-HDFS builds

Reply via email to