(impala) branch master updated: IMPALA-13188: Add test that compute stats does not result in a different tuple cache key

michaelsmith Tue, 22 Oct 2024 09:17:47 -0700

This is an automated email from the ASF dual-hosted git repository.

michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git



The following commit(s) were added to refs/heads/master by this push:
     new 47b638e66 IMPALA-13188: Add test that compute stats does not result in a different tuple cache key
47b638e66 is described below

commit 47b638e667dedd655f422609edff2ad14f7cd3f6
Author: Yida Wu <[email protected]>
AuthorDate: Wed Oct 9 16:00:13 2024 -0700

    IMPALA-13188: Add test that compute stats does not result in a different tuple cache key
    
    The patch introduces a new test, TestTupleCacheComputeStats, to
    verify that compute stats does not change the tuple cache key.
    The test creates a simple table with one row, runs an explain
    on a basic query, then inserts more rows, computes the stats,
    and reruns the same explain query. It compares the two results
    to ensure that the cache keys are identical in the planning
    phase.
    
    Tests:
    Passed the test.
    
    Change-Id: I918232f0af3a6ab8c32823da4dba8f8cd31369d0
    Reviewed-on: http://gerrit.cloudera.org:8080/21917
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 tests/custom_cluster/test_tuple_cache.py | 47 ++++++++++++++++++++++++++++++++
 tests/util/parse_util.py                 | 26 ++++++++++++++++++
 2 files changed, 73 insertions(+)

diff --git a/tests/custom_cluster/test_tuple_cache.py b/tests/custom_cluster/test_tuple_cache.py
index 54ce970d9..071b6d7f3 100644
--- a/tests/custom_cluster/test_tuple_cache.py
+++ b/tests/custom_cluster/test_tuple_cache.py
@@ -25,6 +25,8 @@ import string
 from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
 from tests.common.test_dimensions import (
     add_exec_option_dimension, add_mandatory_exec_option)
+from tests.util.parse_util import (
+    match_memory_estimate, parse_mem_to_mb, match_cache_key)
 
 TABLE_LAYOUT = 'name STRING, age INT, address STRING'
 CACHE_START_ARGS = "--tuple_cache_dir=/tmp --log_level=2"
@@ -703,3 +705,48 @@ class TestTupleCacheCountStar(TestTupleCacheBase):
     result1 = self.execute_query(query)
     result2 = self.execute_query(query)
     assert result1.success and result2.success
+
+
+class TestTupleCacheComputeStats(TestTupleCacheBase):
+
+  @classmethod
+  def add_test_dimensions(cls):
+    super(TestTupleCacheComputeStats, cls).add_test_dimensions()
+    add_exec_option_dimension(cls, 'mt_dop', [0, 2])
+
+  @CustomClusterTestSuite.with_args(
+    start_args=CACHE_START_ARGS, cluster_size=1)
+  @pytest.mark.execute_serially
+  def test_tuple_cache_key_with_stats(self, vector, unique_database):
+    """
+    This test verifies if compute stats affect the tuple cache key.
+    """
+    self.client.set_configuration(vector.get_value('exec_option'))
+    fq_table = "{0}.tuple_cache_stats_test".format(unique_database)
+
+    # Create a table.
+    self.create_table(fq_table, scale=1)
+
+    # Get the explain text for a simple query.
+    query = "explain select * from {0}".format(fq_table)
+    result1 = self.execute_query(query)
+
+    # Insert rows to make the stats different.
+    for i in range(10):
+      self.execute_query("INSERT INTO {0} VALUES ({1})".format(
+        fq_table, table_value(i)))
+
+    # Run compute stats and get the explain text again for the same query.
+    self.client.execute("COMPUTE STATS {0}".format(fq_table))
+    result2 = self.execute_query(query)
+
+    # Verify memory estimations are different, while the cache keys are identical.
+    assert result1.success and result2.success
+    mem_limit1, units1 = match_memory_estimate(result1.data)
+    mem_limit1 = parse_mem_to_mb(mem_limit1, units1)
+    mem_limit2, units2 = match_memory_estimate(result2.data)
+    mem_limit2 = parse_mem_to_mb(mem_limit2, units2)
+    assert mem_limit1 != mem_limit2
+    cache_key1 = match_cache_key(result1.data)
+    cache_key2 = match_cache_key(result2.data)
+    assert cache_key1 is not None and cache_key1 == cache_key2
diff --git a/tests/util/parse_util.py b/tests/util/parse_util.py
index 79911c796..0d2156f51 100644
--- a/tests/util/parse_util.py
+++ b/tests/util/parse_util.py
@@ -34,6 +34,7 @@ EXPECTED_TPCH_STRESS_QUERIES_COUNT = EXPECTED_TPCH_QUERIES_COUNT + 3
 MEM_ESTIMATE_PATTERN = re.compile(
     r"Per-Host Resource Estimates: Memory=(\d+\.?\d*)(P|T|G|M|K)?B")
 NEW_GLOG_ENTRY_PATTERN = re.compile(r"[IWEF](?P<Time>\d{4} \d{2}:\d{2}:\d{2}\.\d{6}).*")
+CACHE_KEY_PATTERN = re.compile(r"cache key: ([a-f0-9]+)")
 
 
 def parse_glog(text, start_time=None):
@@ -155,6 +156,31 @@ def match_memory_estimate(explain_lines):
   return mem_limit, units
 
 
+def match_cache_key(explain_lines):
+  """
+  Given a list of strings from EXPLAIN output, find the cache key.
+
+  Params:
+    explain_lines: list of str
+
+  Returns:
+    str - The cache key if found
+
+  Raises:
+    Exception if no cache key is found
+  """
+  cache_key = None
+  for line in explain_lines:
+    regex_result = CACHE_KEY_PATTERN.search(line)
+    if regex_result:
+      cache_key = regex_result.group(1)
+      break
+  if cache_key is None:
+    raise Exception(
+      'could not find cache key in explain string:\n' + '\n'.join(explain_lines))
+  return cache_key
+
+
 def get_bytes_summary_stats_counter(counter_name, runtime_profile):
   """Extracts a list of TSummaryStatsCounters from a given runtime profile 
where the units
      are in bytes. Each entry in the returned list corresponds to a single occurrence of

(impala) branch master updated: IMPALA-13188: Add test that compute stats does not result in a different tuple cache key

Reply via email to