This is an automated email from the ASF dual-hosted git repository.

michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new b6ca6ffb9 IMPALA-13317: Enhance tpc_sort_key for wider name support
b6ca6ffb9 is described below

commit b6ca6ffb9cd9f69f6c903c0416cecbc60446097c
Author: Yida Wu <[email protected]>
AuthorDate: Wed Aug 21 14:58:58 2024 -0700

    IMPALA-13317: Enhance tpc_sort_key for wider name support
    
    Currently, the tpc_sort_key function is used for sorting TPCH or
    TPCDS files while running the TPCH or TPCDS tests, and is only
    used by test_tuple_cache_tpc_queries now. It is designed to
    handle filenames in formats like "tpch-qx-y", "tpch-qx", or
    "tpch-qxX". However, it doesn't support filenames in the format
    "tpch-qx-yY", and attempting to sort these files results in an error.
    
    This patch improves the robustness of the tpc_sort_key function
    by adding more checks to prevent errors and extending support
    for filenames in the "tpch-qxX-yY" format.
    
    Tests:
    Reran the tests with file names in the "tpch-qxX-yY" format; they passed.
    No tests appear to exist for the test util functions, so I tested the
    function locally with the following unit tests, which passed:
    test_cases = {
        'tpcds-q1': (1, 0, '', ''),
        'tpcds-q1X': (1, 0, 'X', ''),
        'tpcds-q1-2Y': (1, 2, '', 'Y'),
        'tpcds-q1X-2Y': (1, 2, 'X', 'Y'),
        'tpcds-q2-3': (2, 3, '', ''),
        'tpcds-q10': (10, 0, '', ''),
        'tpcds-q10-20': (10, 20, '', ''),
        'tpcds-q10a-20': (10, 20, 'a', ''),
        'tpcds-q10-20b': (10, 20, '', 'b'),
        'tpcds-q10a-20b': (10, 20, 'a', 'b'),
        'tpcds-q0': (0, 0, '', ''),
        'tpcds-': (0, 0, '', ''),
        'tpcds--': (0, 0, '', ''),
        'tpcds-xx-xx': (0, 0, '', ''),
        'tpcds-x1-x1': (0, 0, '', ''),
        'tpcds-x1-x': (0, 0, '', ''),
        'tpcds-x-x1': (0, 0, '', ''),
        'tpcds': (0, 0, '', ''),
    }
    for input_str, expected in test_cases.items():
        result = tpc_sort_key(input_str)
        assert result == expected
    
    Change-Id: Ib238ff09d5a2278c593f2759cf35f136b0ff1344
    Reviewed-on: http://gerrit.cloudera.org:8080/21708
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 tests/util/test_file_parser.py | 36 ++++++++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 10 deletions(-)

diff --git a/tests/util/test_file_parser.py b/tests/util/test_file_parser.py
index f9bb2afbc..4faf9f427 100644
--- a/tests/util/test_file_parser.py
+++ b/tests/util/test_file_parser.py
@@ -443,21 +443,37 @@ def load_tpc_queries(workload, 
include_stress_queries=False, query_name_filters=
 
 def tpc_sort_key(s):
     """
-    Sorting key for sorting strings in the format "tpch-qx-y" or "tpch-qx" or 
"tpch-qxX".
+    Sorting key for sorting strings in the format "tpch-qxX-yY".
+    The valid format is expected to be split into parts by '-'.
+    If the format doesn't match, 0 or an empty string will be returned
+    as appropriate.
     Args:
         s (str): Input string to be sorted.
     Returns:
-        tuple: Tuple of (int, int, str), where the first integer is the value 
of "x",
-               the second integer is the value of "y" if present, otherwise 0, 
and the
-               third element is the character of "X" of the string, if present 
and not
-               numeric, otherwise an empty string ('').
+        tuple: A tuple of (int, int, str, str):
+            - The first integer is the value of "x", if present, otherwise 0.
+            - The second integer is the value of "y",
+              included only if "x" is present, otherwise returns an 0.
+            - The third element is the character "X", not numeric,
+              included only if "x" is present, otherwise returns an empty 
string ('').
+            - The fourth element is the character "Y", not numeric,
+              included only if "y" is present, otherwise returns an empty 
string ('').
     """
+    x, y = 0, 0
+    x_char, y_char = '', ''
     parts = s.split("-")
-    match = re.search(r"q(\d+)(\D)?", parts[1])
-    x = int(match.group(1)) if match else 0
-    x_char = match.group(2) if match else ''
-    y = int(parts[2]) if len(parts) > 2 else 0
-    return x, y, x_char
+    if len(parts) < 2:
+      return x, y, x_char, y_char
+    match = re.search(r"^q(\d+)(\D)?", parts[1])
+    if match:
+      x = int(match.group(1)) if match.group(1) else 0
+      x_char = match.group(2) if match.group(2) else ''
+    if len(parts) == 3 and match:
+      match = re.search(r"^(\d+)(\D)?", parts[2])
+      if match:
+        y = int(match.group(1)) if match.group(1) else 0
+        y_char = match.group(2) if match.group(2) else ''
+    return x, y, x_char, y_char
 
 
 def load_tpc_queries_name_sorted(workload):

Reply via email to