This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 1a381b6aea2d19d738119bd0e1c457e13a0d2763
Author: Zoltan Borok-Nagy <[email protected]>
AuthorDate: Wed Jun 18 15:30:32 2025 +0200

    IMPALA-14142: Fix TestIcebergV2Table.test_compute_stats_table_sampling
    
    TestIcebergV2Table.test_compute_stats_table_sampling was failing in
    ARM release builds. However, COMPUTE STATS with TABLESAMPLE is
    inherently non-deterministic due to its use of SAMPLED_NDV().
    
    This patch completely rewrites the tests and moves them to
    test_stats_extrapolation.py to test Iceberg tables similarly to
    legacy tables.
    
    'diff_perc' argument of appx_equals() method was also updated in
    the tests, as with the previous value (1.0) it only reported errors
    for negative estimates.
    
    Change-Id: I98b07b156aad300827c9e1b7970b8dfacfc6d251
    Reviewed-on: http://gerrit.cloudera.org:8080/23044
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 .../iceberg-v2-compute-stats-table-sampling.test   | 234 ---------------------
 tests/metadata/test_stats_extrapolation.py         |  66 +++++-
 tests/query_test/test_iceberg.py                   |   8 -
 3 files changed, 55 insertions(+), 253 deletions(-)

diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-v2-compute-stats-table-sampling.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-v2-compute-stats-table-sampling.test
deleted file mode 100644
index 19e81836a..000000000
--- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-v2-compute-stats-table-sampling.test
+++ /dev/null
@@ -1,234 +0,0 @@
-====
----- QUERY
-DROP STATS iceberg_non_partitioned;
-COMPUTE STATS iceberg_non_partitioned tablesample system(10) repeatable(1234);
----- RESULTS
-'Updated 1 partition(s) and 4 column(s).'
----- TYPES
-STRING
-====
----- QUERY
-SHOW TABLE STATS iceberg_non_partitioned;
----- LABELS
-#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental 
stats, Location, EC Policy
----- RESULTS: VERIFY_IS_EQUAL
-20,20,'22.90KB','NOT CACHED','NOT 
CACHED','PARQUET','false','$NAMENODE/test-warehouse/iceberg_test/iceberg_non_partitioned','$ERASURECODE_POLICY'
----- TYPES
-BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
-====
----- QUERY
-SHOW COLUMN STATS iceberg_non_partitioned;
----- RESULTS
-'id','INT',3,0,4,4,-1,-1
-'user','STRING',2,0,4,4,-1,-1
-'action','STRING',2,0,5,4.333333492279053,-1,-1
-'event_time','TIMESTAMP',2,0,16,16,-1,-1
----- TYPES
-STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
-====
----- QUERY
-DROP STATS iceberg_non_partitioned;
-COMPUTE STATS iceberg_non_partitioned tablesample system(10) repeatable(1111);
----- RESULTS
-'Updated 1 partition(s) and 4 column(s).'
----- TYPES
-STRING
-====
----- QUERY
-SHOW TABLE STATS iceberg_non_partitioned;
----- LABELS
-#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental 
stats, Location, EC Policy
----- RESULTS: VERIFY_IS_EQUAL
-20,20,'22.90KB','NOT CACHED','NOT 
CACHED','PARQUET','false','$NAMENODE/test-warehouse/iceberg_test/iceberg_non_partitioned','$ERASURECODE_POLICY'
----- TYPES
-BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
-====
----- QUERY
-SHOW COLUMN STATS iceberg_non_partitioned;
----- RESULTS
-'id','INT',2,0,4,4,-1,-1
-'user','STRING',2,0,4,4,-1,-1
-'action','STRING',2,0,8,6.5,-1,-1
-'event_time','TIMESTAMP',2,0,16,16,-1,-1
----- TYPES
-STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
-====
----- QUERY
-DROP STATS iceberg_partitioned;
-COMPUTE STATS iceberg_partitioned tablesample system(10) repeatable(1111);
----- RESULTS
-'Updated 1 partition(s) and 4 column(s).'
----- TYPES
-STRING
-====
----- QUERY
-SHOW TABLE STATS iceberg_partitioned;
----- LABELS
-#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental 
stats, Location, EC Policy
----- RESULTS: VERIFY_IS_EQUAL
-20,20,'22.90KB','NOT CACHED','NOT 
CACHED','PARQUET','false','$NAMENODE/test-warehouse/iceberg_test/iceberg_partitioned','$ERASURECODE_POLICY'
----- TYPES
-BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
-====
----- QUERY
-SHOW COLUMN STATS iceberg_partitioned;
----- RESULTS
-'id','INT',3,0,4,4,-1,-1
-'user','STRING',3,0,4,4,-1,-1
-'action','STRING',3,0,8,5.666666507720947,-1,-1
-'event_time','TIMESTAMP',3,0,16,16,-1,-1
----- TYPES
-STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
-====
----- QUERY
-DROP STATS iceberg_v2_delete_equality_partitioned;
-COMPUTE STATS iceberg_v2_delete_equality_partitioned tablesample system(10) 
repeatable(1111);
----- RESULTS
-'Updated 1 partition(s) and 3 column(s).'
----- TYPES
-STRING
-====
----- QUERY
-SHOW TABLE STATS iceberg_v2_delete_equality_partitioned;
----- LABELS
-#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental 
stats, Location, EC Policy
----- RESULTS: VERIFY_IS_EQUAL
-3,6,'4.81KB','NOT CACHED','NOT 
CACHED','PARQUET','false','$NAMENODE/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_v2_delete_equality_partitioned','$ERASURECODE_POLICY'
----- TYPES
-BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
-====
----- QUERY
-SHOW COLUMN STATS iceberg_v2_delete_equality_partitioned;
----- RESULTS
-'i','INT',2,0,4,4,-1,-1
-'s','STRING',2,0,4,4,-1,-1
-'d','DATE',1,0,4,4,-1,-1
----- TYPES
-STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
-====
----- QUERY
-DROP STATS iceberg_v2_delete_equality_partitioned;
-COMPUTE STATS iceberg_v2_delete_equality_partitioned tablesample system(10) 
repeatable(1111);
----- RESULTS
-'Updated 1 partition(s) and 3 column(s).'
----- TYPES
-STRING
-====
----- QUERY
-SHOW TABLE STATS iceberg_v2_delete_equality_partitioned;
----- LABELS
-#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental 
stats, Location, EC Policy
----- RESULTS: VERIFY_IS_EQUAL
-3,6,'4.81KB','NOT CACHED','NOT 
CACHED','PARQUET','false','$NAMENODE/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_v2_delete_equality_partitioned','$ERASURECODE_POLICY'
----- TYPES
-BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
-====
----- QUERY
-SHOW COLUMN STATS iceberg_v2_delete_equality_partitioned;
----- RESULTS
-'i','INT',2,0,4,4,-1,-1
-'s','STRING',2,0,4,4,-1,-1
-'d','DATE',1,0,4,4,-1,-1
----- TYPES
-STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
-====
----- QUERY
-DROP STATS iceberg_v2_positional_not_all_data_files_have_delete_files;
-COMPUTE STATS iceberg_v2_positional_not_all_data_files_have_delete_files 
tablesample system(30) repeatable(1111);
----- RESULTS
-'Updated 1 partition(s) and 2 column(s).'
----- TYPES
-STRING
-====
----- QUERY
-SHOW TABLE STATS iceberg_v2_positional_not_all_data_files_have_delete_files;
----- LABELS
-#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental 
stats, Location, EC Policy
----- RESULTS: VERIFY_IS_EQUAL
-1,6,'7.77KB','NOT CACHED','NOT 
CACHED','PARQUET','false','$NAMENODE/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_v2_positional_not_all_data_files_have_delete_files','$ERASURECODE_POLICY'
----- TYPES
-BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
-====
----- QUERY
-SHOW COLUMN STATS iceberg_v2_positional_not_all_data_files_have_delete_files;
----- RESULTS
-'i','INT',1,0,4,4,-1,-1
-'s','STRING',1,0,1,1,-1,-1
----- TYPES
-STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
-====
----- QUERY
-DROP STATS iceberg_v2_positional_not_all_data_files_have_delete_files_orc;
-COMPUTE STATS iceberg_v2_positional_not_all_data_files_have_delete_files_orc 
tablesample system(30) repeatable(1111);
----- RESULTS
-'Updated 1 partition(s) and 2 column(s).'
----- TYPES
-STRING
-====
----- QUERY
-SHOW TABLE STATS 
iceberg_v2_positional_not_all_data_files_have_delete_files_orc;
----- LABELS
-#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental 
stats, Location, EC Policy
----- RESULTS: VERIFY_IS_EQUAL
-1,6,'3.97KB','NOT CACHED','NOT 
CACHED','ORC','false','$NAMENODE/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_v2_positional_not_all_data_files_have_delete_files_orc','$ERASURECODE_POLICY'
----- TYPES
-BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
-====
----- QUERY
-SHOW COLUMN STATS 
iceberg_v2_positional_not_all_data_files_have_delete_files_orc;
----- RESULTS
-'i','INT',1,0,4,4,-1,-1
-'s','STRING',1,0,1,1,-1,-1
----- TYPES
-STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
-====
----- QUERY
-DROP STATS iceberg_v2_positional_not_all_data_files_have_delete_files;
-COMPUTE STATS iceberg_v2_positional_not_all_data_files_have_delete_files 
tablesample system(50) repeatable(1111);
----- RESULTS
-'Updated 1 partition(s) and 2 column(s).'
----- TYPES
-STRING
-====
----- QUERY
-SHOW TABLE STATS iceberg_v2_positional_not_all_data_files_have_delete_files;
----- LABELS
-#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental 
stats, Location, EC Policy
----- RESULTS: VERIFY_IS_EQUAL
-4,6,'7.77KB','NOT CACHED','NOT 
CACHED','PARQUET','false','$NAMENODE/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_v2_positional_not_all_data_files_have_delete_files','$ERASURECODE_POLICY'
----- TYPES
-BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
-====
----- QUERY
-SHOW COLUMN STATS iceberg_v2_positional_not_all_data_files_have_delete_files;
----- RESULTS
-'i','INT',4,0,4,4,-1,-1
-'s','STRING',4,0,1,1,-1,-1
----- TYPES
-STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
-====
----- QUERY
-DROP STATS iceberg_v2_positional_not_all_data_files_have_delete_files_orc;
-COMPUTE STATS iceberg_v2_positional_not_all_data_files_have_delete_files_orc 
tablesample system(50) repeatable(1111);
----- RESULTS
-'Updated 1 partition(s) and 2 column(s).'
----- TYPES
-STRING
-====
----- QUERY
-SHOW TABLE STATS 
iceberg_v2_positional_not_all_data_files_have_delete_files_orc;
----- LABELS
-#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental 
stats, Location, EC Policy
----- RESULTS: VERIFY_IS_EQUAL
-4,6,'3.97KB','NOT CACHED','NOT 
CACHED','ORC','false','$NAMENODE/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_v2_positional_not_all_data_files_have_delete_files_orc','$ERASURECODE_POLICY'
----- TYPES
-BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
-====
----- QUERY
-SHOW COLUMN STATS 
iceberg_v2_positional_not_all_data_files_have_delete_files_orc;
----- RESULTS
-'i','INT',4,0,4,4,-1,-1
-'s','STRING',4,0,1,1,-1,-1
----- TYPES
-STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
-====
diff --git a/tests/metadata/test_stats_extrapolation.py b/tests/metadata/test_stats_extrapolation.py
index fccba527f..7d762c5d4 100644
--- a/tests/metadata/test_stats_extrapolation.py
+++ b/tests/metadata/test_stats_extrapolation.py
@@ -55,7 +55,7 @@ class TestStatsExtrapolation(ImpalaTestSuite):
 
     # Since our test tables are small, set the minimum sample size to 0 to 
make sure
     # we exercise the sampling code paths.
-    self.client.execute("set compute_stats_min_sample_size=0")
+    self.client.set_configuration_option('compute_stats_min_sample_size', '0')
 
     # Test partitioned table.
     part_test_tbl = unique_database + ".alltypes"
@@ -120,24 +120,57 @@ class TestStatsExtrapolation(ImpalaTestSuite):
       self.client.execute(
         "compute stats {0} tablesample system(10)".format(wide_test_tbl))
 
+  def test_compute_stats_tablesample_iceberg(self, unique_database):
+    self.client.set_configuration_option('compute_stats_min_sample_size', '0')
+
+    ice_tbls = ['iceberg_non_partitioned', 'iceberg_partitioned',
+                'iceberg_v2_delete_equality_partitioned',
+                'iceberg_v2_positional_not_all_data_files_have_delete_files',
+                
'iceberg_v2_positional_not_all_data_files_have_delete_files_orc']
+
+    # Array of sampling parameters: [(percentage, seed), ...]
+    sampling_params = [(1, 3), (10, 7), (20, 13), (100, 99)]
+
+    for tbl in ice_tbls:
+      orig_tbl = 'functional_parquet.' + tbl
+      cloned_tbl_base = unique_database + '.' + tbl + "_base"
+      cloned_tbl_sample = unique_database + '.' + tbl + "_sample"
+      self.clone_iceberg_table(orig_tbl, cloned_tbl_base)
+      self.clone_iceberg_table(orig_tbl, cloned_tbl_sample)
+      self.__set_extrapolation_tblprop(cloned_tbl_sample)
+      self.client.execute("compute stats {0}".format(cloned_tbl_base))
+
+      for sampling_param in sampling_params:
+        percentage = sampling_param[0]
+        seed = sampling_param[1]
+        self.__run_sampling_test(
+            cloned_tbl_sample, "", cloned_tbl_base, percentage, seed,
+            is_precise_table_stats=True)
+
+  def clone_iceberg_table(self, src_tbl, dst_tbl):
+    # We are cloning external, non-HiveCatalog tables here.
+    self.client.execute("create table {0} like {1}".format(dst_tbl, src_tbl))
+
   def __set_extrapolation_tblprop(self, tbl):
     """Alters the given table to enable stats extrapolation via tblproperty."""
     self.client.execute("alter table {0} set "
       "tblproperties('impala.enable.stats.extrapolation'='true')".format(tbl))
 
-  def __run_sampling_test(self, tbl, cols, expected_tbl, perc, seed):
+  def __run_sampling_test(self, tbl, cols, expected_tbl, perc, seed,
+                          is_precise_table_stats=False):
     """Drops stats on 'tbl' and then runs COMPUTE STATS TABLESAMPLE on 'tbl' 
with the
     given column restriction clause, sampling percent and random seed. Checks 
that
     the resulting table and column stats are reasonably close to those of
-    'expected_tbl'."""
+    'expected_tbl'. For table formats like ICEBERG the table-level stats are always
+    precise; this can be indicated by the parameter 'is_precise_table_stats'."""
     self.client.execute("drop stats {0}".format(tbl))
     self.client.execute(
         "compute stats {0}{1} tablesample system ({2}) repeatable 
({3})".format(
             tbl, cols, perc, seed))
-    self.__check_table_stats(tbl, expected_tbl)
-    self.__check_column_stats(cols, tbl, expected_tbl)
+    self.__check_table_stats(tbl, expected_tbl, perc, is_precise_table_stats)
+    self.__check_column_stats(cols, tbl, perc, expected_tbl)
 
-  def __check_table_stats(self, tbl, expected_tbl):
+  def __check_table_stats(self, tbl, expected_tbl, perc, is_precise=False):
     """Checks that the row counts reported in SHOW TABLE STATS on 'tbl' are 
within 2x
     of those reported for 'expected_tbl'. Assumes that COMPUTE STATS was 
previously run
     on 'expected_table' and that COMPUTE STATS TABLESAMPLE was run on 'tbl'."""
@@ -147,25 +180,27 @@ class TestStatsExtrapolation(ImpalaTestSuite):
     assert len(actual.column_labels) == len(expected.column_labels)
     col_names = actual.column_labels
     rows_col_idx = col_names.index("#ROWS")
-    extrap_rows_col_idx = col_names.index("EXTRAP #ROWS")
+    extrap_rows_label = "#ROWS" if is_precise else "EXTRAP #ROWS"
+    extrap_rows_col_idx = col_names.index(extrap_rows_label)
     for i in range(0, len(actual.data)):
       act_cols = actual.data[i].split("\t")
       exp_cols = expected.data[i].split("\t")
       assert int(exp_cols[rows_col_idx]) >= 0
       # The expected_tbl is expected to have valid extrapolated #rows for 
every partition.
       assert int(act_cols[extrap_rows_col_idx]) >= 0
+      diff_perc = 0.0 if is_precise else self.get_diff_perc(perc)
       self.appx_equals(
-        int(act_cols[extrap_rows_col_idx]), int(exp_cols[rows_col_idx]), 1.0)
+        int(act_cols[extrap_rows_col_idx]), int(exp_cols[rows_col_idx]), 
diff_perc)
       # Only the table-level row count is stored. The partition row counts
       # are extrapolated.
       if act_cols[0] == "Total":
         self.appx_equals(
-          int(act_cols[rows_col_idx]), int(exp_cols[rows_col_idx]), 1.0)
+          int(act_cols[rows_col_idx]), int(exp_cols[rows_col_idx]), diff_perc)
       elif len(actual.data) > 1:
         # Partition row count is expected to not be set.
         assert int(act_cols[rows_col_idx]) == -1
 
-  def __check_column_stats(self, cols, tbl, expected_tbl):
+  def __check_column_stats(self, cols, tbl, perc, expected_tbl):
     """Checks that the NDVs in SHOW COLUMNS STATS on 'tbl' are within 2x of 
those
     reported for 'expected_tbl'. Assumes that COMPUTE STATS was previously run
     on 'expected_table' and that COMPUTE STATS TABLESAMPLE was run on 'tbl'."""
@@ -184,4 +219,13 @@ class TestStatsExtrapolation(ImpalaTestSuite):
       # caller drops the stats before calling COMPUTE STATS.
       if cols == "" or act_cols[0] in cols:
         assert int(act_cols[ndv_col_idx]) >= 0
-        self.appx_equals(int(act_cols[ndv_col_idx]), 
int(exp_cols[ndv_col_idx]), 1.0)
+        self.appx_equals(int(act_cols[ndv_col_idx]), 
int(exp_cols[ndv_col_idx]),
+                         self.get_diff_perc(perc))
+
+  def get_diff_perc(self, percentage):
+    # Calculates the parameter 'diff_perc' we give to 'appx_equals()'.
+    # 'diff_perc' decreases linearly as the sample percentage grows. We subtract only
+    # 0.9 * fraction (not the full fraction) to leave enough margin for misestimation.
+    assert 0 <= percentage <= 100
+    fraction = percentage / 100.0
+    return 1.0 - fraction * 0.9
diff --git a/tests/query_test/test_iceberg.py b/tests/query_test/test_iceberg.py
index 8515fb910..141808c67 100644
--- a/tests/query_test/test_iceberg.py
+++ b/tests/query_test/test_iceberg.py
@@ -1545,14 +1545,6 @@ class TestIcebergV2Table(IcebergTestSuite):
     self.run_test_case('QueryTest/iceberg-v2-read-position-deletes-stats', 
vector)
     self.run_test_case('QueryTest/iceberg-v2-read-position-deletes-orc-stats', 
vector)
 
-  @SkipIfDockerizedCluster.internal_hostname
-  @SkipIf.hardcoded_uris
-  @pytest.mark.execute_serially
-  def test_compute_stats_table_sampling(self, vector):
-    """Tests COMPUTE STATS with table sampling."""
-    vector.get_value('exec_option')['COMPUTE_STATS_MIN_SAMPLE_SIZE'] = 0
-    self.run_test_case('QueryTest/iceberg-v2-compute-stats-table-sampling', 
vector)
-
   @SkipIfFS.hive
   def test_read_mixed_format_position_deletes(self, vector, unique_database):
     self.run_test_case('QueryTest/iceberg-mixed-format-position-deletes',

Reply via email to