This is an automated email from the ASF dual-hosted git repository. csringhofer pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit f2a09b6ddafdca48b5edcc68fe6da9d1d14c7325 Author: Yida Wu <[email protected]> AuthorDate: Mon Apr 24 22:15:19 2023 -0700 IMPALA-12907: Add testcases for TPC-H/TPC-DS queries with tuple caching Added testcases to run TPC-H and TPC-DS queries twice with tuple caching to verify that Impala won't crash and ensure the correctness of the results. Testcases allow mt_dop to be 0 or 4. Also, added the environment variables of tuple cache to run-all-tests.sh and added skipif to test_tuple_cache_tpc_queries.py to skip if tuple cache is not enabled. Tests: Ran the tests in the build with tuple cache enabled. Change-Id: I967372744d8dda25cbe372aefec04faec5a76847 Reviewed-on: http://gerrit.cloudera.org:8080/21628 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- bin/run-all-tests.sh | 12 ++++ tests/common/environ.py | 6 ++ tests/common/skip.py | 5 +- tests/query_test/test_tuple_cache_tpc_queries.py | 77 ++++++++++++++++++++++++ tests/util/test_file_parser.py | 37 ++++++++++++ 5 files changed, 136 insertions(+), 1 deletion(-) diff --git a/bin/run-all-tests.sh b/bin/run-all-tests.sh index 564d79ed1..3e98a40e6 100755 --- a/bin/run-all-tests.sh +++ b/bin/run-all-tests.sh @@ -81,6 +81,10 @@ fi : ${DATA_CACHE_EVICTION_POLICY:=} # Number of data cache async write threads. : ${DATA_CACHE_NUM_ASYNC_WRITE_THREADS:=} +# Tuple cache root directory location. +: ${TUPLE_CACHE_DIR:=} +# Tuple cache capacity. +: ${TUPLE_CACHE_CAPACITY:=} if [[ "${TARGET_FILESYSTEM}" == "local" ]]; then # TODO: Remove abort_on_config_error flag from here and create-load-data.sh once # checkConfiguration() accepts the local filesystem (see IMPALA-1850). @@ -110,6 +114,14 @@ if [[ -n "${DATA_CACHE_DIR}" && -n "${DATA_CACHE_SIZE}" ]]; then fi fi +# Enable tuple cache if configured. 
+if [[ -n "${TUPLE_CACHE_DIR}" && -n "${TUPLE_CACHE_CAPACITY}" ]]; then + TEST_START_CLUSTER_ARGS="${TEST_START_CLUSTER_ARGS} "` + `"--tuple_cache_dir=${TUPLE_CACHE_DIR} " + TEST_START_CLUSTER_ARGS="${TEST_START_CLUSTER_ARGS} "` + `"--tuple_cache_capacity=${TUPLE_CACHE_CAPACITY} " +fi + if [[ "${ERASURE_CODING}" = true ]]; then # We do not run FE tests when erasure coding is enabled because planner tests # would fail. diff --git a/tests/common/environ.py b/tests/common/environ.py index b3e354c9d..c1de7e892 100644 --- a/tests/common/environ.py +++ b/tests/common/environ.py @@ -103,6 +103,12 @@ IS_BUGGY_EL6_KERNEL = 'el6' in kernel_release and kernel_version < [2, 6, 32, 67 IS_TEST_JDK = os.environ.get("TEST_JAVA_HOME_OVERRIDE", os.environ.get("TEST_JDK_VERSION", "")) != "" +# Detect if we are testing with tuple cache enabled. +IS_TUPLE_CACHE = ( + os.getenv("TUPLE_CACHE_DIR", "") != "" + and os.getenv("TUPLE_CACHE_CAPACITY", "") != "" +) + class ImpalaBuildFlavors: """ Represents the possible CMAKE_BUILD_TYPE values. 
These build flavors are needed diff --git a/tests/common/skip.py b/tests/common/skip.py index c26eb3e16..bde2e3628 100644 --- a/tests/common/skip.py +++ b/tests/common/skip.py @@ -27,7 +27,8 @@ from functools import partial from tests.common.environ import (ImpalaTestClusterProperties, IS_DOCKERIZED_TEST_CLUSTER, IS_BUGGY_EL6_KERNEL, HIVE_MAJOR_VERSION, IS_REDHAT_6_DERIVATIVE, - IS_APACHE_HIVE, IS_TEST_JDK) + IS_APACHE_HIVE, IS_TEST_JDK, + IS_TUPLE_CACHE) from tests.common.kudu_test_suite import get_kudu_master_flag from tests.util.filesystem_utils import ( IS_ABFS, @@ -129,6 +130,8 @@ class SkipIf: is_test_jdk = pytest.mark.skipif(IS_TEST_JDK, reason="Testing with different JDK") runs_slowly = pytest.mark.skipif(IMPALA_TEST_CLUSTER_PROPERTIES.runs_slowly(), reason="Test cluster runs slowly due to enablement of code coverage or sanitizer") + not_tuple_cache = pytest.mark.skipif(not IS_TUPLE_CACHE, + reason="Tuple Cache needed") class SkipIfLocal: # These are skipped due to product limitations. diff --git a/tests/query_test/test_tuple_cache_tpc_queries.py b/tests/query_test/test_tuple_cache_tpc_queries.py new file mode 100644 index 000000000..984121cca --- /dev/null +++ b/tests/query_test/test_tuple_cache_tpc_queries.py @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +# Functional tests running the TPCH and TPCDS workload twice to test tuple cache. +from __future__ import absolute_import, division, print_function +import pytest + +from tests.common.impala_test_suite import ImpalaTestSuite +from tests.common.skip import SkipIf +from tests.common.test_dimensions import create_single_exec_option_dimension +from tests.util.test_file_parser import load_tpc_queries_name_sorted + +MT_DOP_VALUES = [0, 4] + + +def run_tuple_cache_test(self, vector, query, mtdop): + vector.get_value('exec_option')['enable_tuple_cache'] = True + vector.get_value('exec_option')['mt_dop'] = mtdop + # Run twice to test write and read the tuple cache. + self.run_test_case(query, vector) + self.run_test_case(query, vector) + + [email protected]_tuple_cache +class TestTupleCacheTpchQuery(ImpalaTestSuite): + @classmethod + def get_workload(self): + return 'tpch' + + @classmethod + def add_test_dimensions(cls): + super(TestTupleCacheTpchQuery, cls).add_test_dimensions() + if cls.exploration_strategy() != 'exhaustive': + cls.ImpalaTestMatrix.add_dimension(create_single_exec_option_dimension()) + cls.ImpalaTestMatrix.add_constraint(lambda v: + v.get_value('table_format').file_format == 'parquet' + and v.get_value('table_format').compression_codec == 'none') + + @pytest.mark.parametrize("query", load_tpc_queries_name_sorted('tpch')) + @pytest.mark.parametrize("mtdop", MT_DOP_VALUES) + def test_tpch(self, vector, query, mtdop): + run_tuple_cache_test(self, vector, query, mtdop) + + [email protected]_tuple_cache +class TestTupleCacheTpcdsQuery(ImpalaTestSuite): + @classmethod + def get_workload(self): + return 'tpcds' + + @classmethod + def add_test_dimensions(cls): + super(TestTupleCacheTpcdsQuery, cls).add_test_dimensions() + if cls.exploration_strategy() != 'exhaustive': + cls.ImpalaTestMatrix.add_dimension(create_single_exec_option_dimension()) + 
cls.ImpalaTestMatrix.add_constraint(lambda v: + v.get_value('table_format').file_format == 'parquet' + and v.get_value('table_format').compression_codec == 'none') + + @pytest.mark.parametrize("query", load_tpc_queries_name_sorted('tpcds')) + @pytest.mark.parametrize("mtdop", MT_DOP_VALUES) + def test_tpcds(self, vector, query, mtdop): + run_tuple_cache_test(self, vector, query, mtdop) diff --git a/tests/util/test_file_parser.py b/tests/util/test_file_parser.py index 8bd2973d4..f9bb2afbc 100644 --- a/tests/util/test_file_parser.py +++ b/tests/util/test_file_parser.py @@ -439,3 +439,40 @@ def load_tpc_queries(workload, include_stress_queries=False, query_name_filters= raise Exception("Expected exactly 1 query to be in file %s but got %s" % (file_path, len(test_cases))) return queries + + +def tpc_sort_key(s): + """ + Sorting key for sorting strings in the format "tpch-qx-y" or "tpch-qx" or "tpch-qxX". + Args: + s (str): Input string to be sorted. + Returns: + tuple: Tuple of (int, int, str), where the first integer is the value of "x", + the second integer is the value of "y" if present, otherwise 0, and the + third element is the character of "X" of the string, if present and not + numeric, otherwise an empty string (''). + """ + parts = s.split("-") + match = re.search(r"q(\d+)(\D)?", parts[1]) + x = int(match.group(1)) if match else 0 + x_char = match.group(2) if match else '' + y = int(parts[2]) if len(parts) > 2 else 0 + return x, y, x_char + + +def load_tpc_queries_name_sorted(workload): + """ + Returns a list of queries for the given workload. Only tpch and tpcds are supported, + and only the name of the queries will be returned. The names are sorted and + converted to lowercase before being returned. + Args: + workload (str): tpch or tpcds. 
+ """ + queries = list(load_tpc_queries(workload).keys()) + queries = [s.lower() for s in queries] + queries = sorted(queries, key=tpc_sort_key) + if workload == 'tpcds': + # TPCDS is assumed to always use decimal_v2, in alignment with load_tpc_queries() + substring = "tpcds-decimal_v2" + queries = [x.replace(workload, substring) for x in queries] + return queries
