This is an automated email from the ASF dual-hosted git repository.
michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new b6ca6ffb9 IMPALA-13317: Enhance tpc_sort_key for wider name support
b6ca6ffb9 is described below
commit b6ca6ffb9cd9f69f6c903c0416cecbc60446097c
Author: Yida Wu <[email protected]>
AuthorDate: Wed Aug 21 14:58:58 2024 -0700
IMPALA-13317: Enhance tpc_sort_key for wider name support
The tpc_sort_key function is used for sorting TPCH or
TPCDS files while running the TPCH or TPCDS tests, and is currently
used only by test_tuple_cache_tpc_queries. It is designed to
handle filenames in formats like "tpch-qx-y," "tpch-qx," or
"tpch-qxX." However, it doesn't support filenames in the format
"tpch-qx-yY," and attempting to sort these files results in an error.
This patch improves the robustness of the tpc_sort_key function
by adding more checks to prevent errors and extending support
for filenames in the "tpch-qxX-yY" format.
Tests:
Reran the tests with file names in the "tpch-qxX-yY" format, and they passed.
There appear to be no existing tests for the test util functions, so I
tested the function locally with the following unit tests, which passed:
test_cases = {
'tpcds-q1': (1, 0, '', ''),
'tpcds-q1X': (1, 0, 'X', ''),
'tpcds-q1-2Y': (1, 2, '', 'Y'),
'tpcds-q1X-2Y': (1, 2, 'X', 'Y'),
'tpcds-q2-3': (2, 3, '', ''),
'tpcds-q10': (10, 0, '', ''),
'tpcds-q10-20': (10, 20, '', ''),
'tpcds-q10a-20': (10, 20, 'a', ''),
'tpcds-q10-20b': (10, 20, '', 'b'),
'tpcds-q10a-20b': (10, 20, 'a', 'b'),
'tpcds-q0': (0, 0, '', ''),
'tpcds-': (0, 0, '', ''),
'tpcds--': (0, 0, '', ''),
'tpcds-xx-xx': (0, 0, '', ''),
'tpcds-x1-x1': (0, 0, '', ''),
'tpcds-x1-x': (0, 0, '', ''),
'tpcds-x-x1': (0, 0, '', ''),
'tpcds': (0, 0, '', ''),
}
for input_str, expected in test_cases.items():
result = tpc_sort_key(input_str)
assert result == expected
Change-Id: Ib238ff09d5a2278c593f2759cf35f136b0ff1344
Reviewed-on: http://gerrit.cloudera.org:8080/21708
Reviewed-by: Impala Public Jenkins <[email protected]>
Tested-by: Impala Public Jenkins <[email protected]>
---
tests/util/test_file_parser.py | 36 ++++++++++++++++++++++++++----------
1 file changed, 26 insertions(+), 10 deletions(-)
diff --git a/tests/util/test_file_parser.py b/tests/util/test_file_parser.py
index f9bb2afbc..4faf9f427 100644
--- a/tests/util/test_file_parser.py
+++ b/tests/util/test_file_parser.py
@@ -443,21 +443,37 @@ def load_tpc_queries(workload,
include_stress_queries=False, query_name_filters=
def tpc_sort_key(s):
"""
- Sorting key for sorting strings in the format "tpch-qx-y" or "tpch-qx" or
"tpch-qxX".
+ Sorting key for sorting strings in the format "tpch-qxX-yY".
+ The valid format is expected to be split into parts by '-'.
+ If the format doesn't match, 0 or an empty string will be returned
+ as appropriate.
Args:
s (str): Input string to be sorted.
Returns:
- tuple: Tuple of (int, int, str), where the first integer is the value
of "x",
- the second integer is the value of "y" if present, otherwise 0,
and the
- third element is the character of "X" of the string, if present
and not
- numeric, otherwise an empty string ('').
+ tuple: A tuple of (int, int, str, str):
+ - The first integer is the value of "x", if present, otherwise 0.
+ - The second integer is the value of "y",
+ included only if "x" is present, otherwise returns 0.
+ - The third element is the character "X", not numeric,
+ included only if "x" is present, otherwise returns an empty
string ('').
+ - The fourth element is the character "Y", not numeric,
+ included only if "y" is present, otherwise returns an empty
string ('').
"""
+ x, y = 0, 0
+ x_char, y_char = '', ''
parts = s.split("-")
- match = re.search(r"q(\d+)(\D)?", parts[1])
- x = int(match.group(1)) if match else 0
- x_char = match.group(2) if match else ''
- y = int(parts[2]) if len(parts) > 2 else 0
- return x, y, x_char
+ if len(parts) < 2:
+ return x, y, x_char, y_char
+ match = re.search(r"^q(\d+)(\D)?", parts[1])
+ if match:
+ x = int(match.group(1)) if match.group(1) else 0
+ x_char = match.group(2) if match.group(2) else ''
+ if len(parts) == 3 and match:
+ match = re.search(r"^(\d+)(\D)?", parts[2])
+ if match:
+ y = int(match.group(1)) if match.group(1) else 0
+ y_char = match.group(2) if match.group(2) else ''
+ return x, y, x_char, y_char
def load_tpc_queries_name_sorted(workload):