TPC-DS queries with tuple caching

csringhofer Fri, 16 Aug 2024 01:56:31 -0700

This is an automated email from the ASF dual-hosted git repository.

csringhofer pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


commit f2a09b6ddafdca48b5edcc68fe6da9d1d14c7325
Author: Yida Wu <[email protected]>
AuthorDate: Mon Apr 24 22:15:19 2023 -0700

    IMPALA-12907: Add testcases for TPC-H/TPC-DS queries with tuple caching
    
    Added testcases to run TPC-H and TPC-DS queries twice with tuple
    caching to verify that Impala won't crash and ensure the
    correctness of the results.
    
    The testcases allow mt_dop to be 0 or 4.
    
    Also added the tuple cache environment variables to
    run-all-tests.sh and added skipif to test_tuple_cache_tpc_queries.py
    to skip the tests if tuple cache is not enabled.
    
    Tests:
    Ran the tests in the build with tuple cache enabled.
    
    Change-Id: I967372744d8dda25cbe372aefec04faec5a76847
    Reviewed-on: http://gerrit.cloudera.org:8080/21628
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 bin/run-all-tests.sh                             | 12 ++++
 tests/common/environ.py                          |  6 ++
 tests/common/skip.py                             |  5 +-
 tests/query_test/test_tuple_cache_tpc_queries.py | 77 ++++++++++++++++++++++++
 tests/util/test_file_parser.py                   | 37 ++++++++++++
 5 files changed, 136 insertions(+), 1 deletion(-)

diff --git a/bin/run-all-tests.sh b/bin/run-all-tests.sh
index 564d79ed1..3e98a40e6 100755
--- a/bin/run-all-tests.sh
+++ b/bin/run-all-tests.sh
@@ -81,6 +81,10 @@ fi
 : ${DATA_CACHE_EVICTION_POLICY:=}
 # Number of data cache async write threads.
 : ${DATA_CACHE_NUM_ASYNC_WRITE_THREADS:=}
+# Tuple cache root directory location.
+: ${TUPLE_CACHE_DIR:=}
+# Tuple cache capacity.
+: ${TUPLE_CACHE_CAPACITY:=}
 if [[ "${TARGET_FILESYSTEM}" == "local" ]]; then
   # TODO: Remove abort_on_config_error flag from here and create-load-data.sh 
once
   # checkConfiguration() accepts the local filesystem (see IMPALA-1850).
@@ -110,6 +114,14 @@ if [[ -n "${DATA_CACHE_DIR}" && -n "${DATA_CACHE_SIZE}" 
]]; then
    fi
 fi
 
+# Enable tuple cache if configured.
+if [[ -n "${TUPLE_CACHE_DIR}" && -n "${TUPLE_CACHE_CAPACITY}" ]]; then
+   TEST_START_CLUSTER_ARGS="${TEST_START_CLUSTER_ARGS} "`
+       `"--tuple_cache_dir=${TUPLE_CACHE_DIR} "
+   TEST_START_CLUSTER_ARGS="${TEST_START_CLUSTER_ARGS} "`
+       `"--tuple_cache_capacity=${TUPLE_CACHE_CAPACITY} "
+fi
+
 if [[ "${ERASURE_CODING}" = true ]]; then
   # We do not run FE tests when erasure coding is enabled because planner tests
   # would fail.
diff --git a/tests/common/environ.py b/tests/common/environ.py
index b3e354c9d..c1de7e892 100644
--- a/tests/common/environ.py
+++ b/tests/common/environ.py
@@ -103,6 +103,12 @@ IS_BUGGY_EL6_KERNEL = 'el6' in kernel_release and 
kernel_version < [2, 6, 32, 67
 IS_TEST_JDK = os.environ.get("TEST_JAVA_HOME_OVERRIDE",
                              os.environ.get("TEST_JDK_VERSION", "")) != ""
 
+# Detect if we are testing with tuple cache enabled.
+IS_TUPLE_CACHE = (
+    os.getenv("TUPLE_CACHE_DIR", "") != ""
+    and os.getenv("TUPLE_CACHE_CAPACITY", "") != ""
+)
+
 class ImpalaBuildFlavors:
   """
   Represents the possible CMAKE_BUILD_TYPE values. These build flavors are 
needed
diff --git a/tests/common/skip.py b/tests/common/skip.py
index c26eb3e16..bde2e3628 100644
--- a/tests/common/skip.py
+++ b/tests/common/skip.py
@@ -27,7 +27,8 @@ from functools import partial
 from tests.common.environ import (ImpalaTestClusterProperties,
                                   IS_DOCKERIZED_TEST_CLUSTER, 
IS_BUGGY_EL6_KERNEL,
                                   HIVE_MAJOR_VERSION, IS_REDHAT_6_DERIVATIVE,
-                                  IS_APACHE_HIVE, IS_TEST_JDK)
+                                  IS_APACHE_HIVE, IS_TEST_JDK,
+                                  IS_TUPLE_CACHE)
 from tests.common.kudu_test_suite import get_kudu_master_flag
 from tests.util.filesystem_utils import (
     IS_ABFS,
@@ -129,6 +130,8 @@ class SkipIf:
   is_test_jdk = pytest.mark.skipif(IS_TEST_JDK, reason="Testing with different 
JDK")
   runs_slowly = 
pytest.mark.skipif(IMPALA_TEST_CLUSTER_PROPERTIES.runs_slowly(),
       reason="Test cluster runs slowly due to enablement of code coverage or 
sanitizer")
+  not_tuple_cache = pytest.mark.skipif(not IS_TUPLE_CACHE,
+      reason="Tuple Cache needed")
 
 class SkipIfLocal:
   # These are skipped due to product limitations.
diff --git a/tests/query_test/test_tuple_cache_tpc_queries.py 
b/tests/query_test/test_tuple_cache_tpc_queries.py
new file mode 100644
index 000000000..984121cca
--- /dev/null
+++ b/tests/query_test/test_tuple_cache_tpc_queries.py
@@ -0,0 +1,77 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Functional tests running the TPCH and TPCDS workload twice to test tuple 
cache.
+from __future__ import absolute_import, division, print_function
+import pytest
+
+from tests.common.impala_test_suite import ImpalaTestSuite
+from tests.common.skip import SkipIf
+from tests.common.test_dimensions import create_single_exec_option_dimension
+from tests.util.test_file_parser import load_tpc_queries_name_sorted
+
+MT_DOP_VALUES = [0, 4]
+
+
+def run_tuple_cache_test(self, vector, query, mtdop):
+  vector.get_value('exec_option')['enable_tuple_cache'] = True
+  vector.get_value('exec_option')['mt_dop'] = mtdop
+  # Run twice to test write and read the tuple cache.
+  self.run_test_case(query, vector)
+  self.run_test_case(query, vector)
+
+
[email protected]_tuple_cache
+class TestTupleCacheTpchQuery(ImpalaTestSuite):
+  @classmethod
+  def get_workload(self):
+    return 'tpch'
+
+  @classmethod
+  def add_test_dimensions(cls):
+    super(TestTupleCacheTpchQuery, cls).add_test_dimensions()
+    if cls.exploration_strategy() != 'exhaustive':
+      cls.ImpalaTestMatrix.add_dimension(create_single_exec_option_dimension())
+      cls.ImpalaTestMatrix.add_constraint(lambda v:
+        v.get_value('table_format').file_format == 'parquet'
+        and v.get_value('table_format').compression_codec == 'none')
+
+  @pytest.mark.parametrize("query", load_tpc_queries_name_sorted('tpch'))
+  @pytest.mark.parametrize("mtdop", MT_DOP_VALUES)
+  def test_tpch(self, vector, query, mtdop):
+    run_tuple_cache_test(self, vector, query, mtdop)
+
+
[email protected]_tuple_cache
+class TestTupleCacheTpcdsQuery(ImpalaTestSuite):
+  @classmethod
+  def get_workload(self):
+    return 'tpcds'
+
+  @classmethod
+  def add_test_dimensions(cls):
+    super(TestTupleCacheTpcdsQuery, cls).add_test_dimensions()
+    if cls.exploration_strategy() != 'exhaustive':
+      cls.ImpalaTestMatrix.add_dimension(create_single_exec_option_dimension())
+      cls.ImpalaTestMatrix.add_constraint(lambda v:
+        v.get_value('table_format').file_format == 'parquet'
+        and v.get_value('table_format').compression_codec == 'none')
+
+  @pytest.mark.parametrize("query", load_tpc_queries_name_sorted('tpcds'))
+  @pytest.mark.parametrize("mtdop", MT_DOP_VALUES)
+  def test_tpcds(self, vector, query, mtdop):
+    run_tuple_cache_test(self, vector, query, mtdop)
diff --git a/tests/util/test_file_parser.py b/tests/util/test_file_parser.py
index 8bd2973d4..f9bb2afbc 100644
--- a/tests/util/test_file_parser.py
+++ b/tests/util/test_file_parser.py
@@ -439,3 +439,40 @@ def load_tpc_queries(workload, 
include_stress_queries=False, query_name_filters=
       raise Exception("Expected exactly 1 query to be in file %s but got %s"
           % (file_path, len(test_cases)))
   return queries
+
+
+def tpc_sort_key(s):
+    """
+    Sorting key for sorting strings in the format "tpch-qx-y" or "tpch-qx" or 
"tpch-qxX".
+    Args:
+        s (str): Input string to be sorted.
+    Returns:
+        tuple: Tuple of (int, int, str), where the first integer is the value 
of "x",
+               the second integer is the value of "y" if present, otherwise 0, 
and the
+               third element is the character of "X" of the string, if present 
and not
+               numeric, otherwise an empty string ('').
+    """
+    parts = s.split("-")
+    match = re.search(r"q(\d+)(\D)?", parts[1])
+    x = int(match.group(1)) if match else 0
+    x_char = match.group(2) if match else ''
+    y = int(parts[2]) if len(parts) > 2 else 0
+    return x, y, x_char
+
+
+def load_tpc_queries_name_sorted(workload):
+    """
+    Returns a list of queries for the given workload. Only tpch and tpcds are 
supported,
+    and only the name of the queries will be returned. The names are sorted 
and
+    converted to lowercase before being returned.
+    Args:
+        workload (str): tpch or tpcds.
+    """
+    queries = list(load_tpc_queries(workload).keys())
+    queries = [s.lower() for s in queries]
+    queries = sorted(queries, key=tpc_sort_key)
+    if workload == 'tpcds':
+      # TPCDS is assumed to always use decimal_v2, in alignment with 
load_tpc_queries()
+      substring = "tpcds-decimal_v2"
+      queries = [x.replace(workload, substring) for x in queries]
+    return queries

(impala) 02/02: IMPALA-12907: Add testcases for TPC-H/TPC-DS queries with tuple caching

Reply via email to