This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch branch-4.4.1
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 5d3d41e5c8986b53dd59bb1a635dd7e434663f94
Author: wzhou-code <[email protected]>
AuthorDate: Thu Jun 6 21:56:49 2024 -0700

    IMPALA-13143: Fix flaky test_catalogd_failover_with_sync_ddl
    
    The test_catalogd_failover_with_sync_ddl test, which was added to
    custom_cluster/test_catalogd_ha.py in IMPALA-13134, failed on s3.
    The test relies on specific timing with a sleep injected via a
    debug action so that the DDL query is still running when catalogd
    failover is triggered. The failures were caused by catalogd restarting
    slowly on s3, so the query finished before the catalogd failover was
    triggered.
    
    This patch fixes the issue by increasing the injected sleep time for
    s3 builds and other slow builds.
    
    Testing:
     - Ran the test 100 times in a loop on s3.
    
    Change-Id: I15bb6aae23a2f544067f993533e322969372ebd5
    Reviewed-on: http://gerrit.cloudera.org:8080/21491
    Reviewed-by: Riza Suminto <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 tests/custom_cluster/test_catalogd_ha.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/tests/custom_cluster/test_catalogd_ha.py 
b/tests/custom_cluster/test_catalogd_ha.py
index ae1f7d2f9..37dee5603 100644
--- a/tests/custom_cluster/test_catalogd_ha.py
+++ b/tests/custom_cluster/test_catalogd_ha.py
@@ -18,16 +18,24 @@
 from __future__ import absolute_import, division, print_function
 import logging
 import re
+import time
 
 from beeswaxd.BeeswaxService import QueryState
+from builtins import round
 from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
 from tests.common.environ import build_flavor_timeout
-from tests.util.filesystem_utils import get_fs_path
+from tests.util.filesystem_utils import IS_S3, get_fs_path
 from time import sleep
 
 LOG = logging.getLogger('catalogd_ha_test')
 DEFAULT_STATESTORE_SERVICE_PORT = 24000
 DEFAULT_CATALOG_SERVICE_PORT = 26000
+SLOW_BUILD_SYNC_DDL_DELAY_S = 20
+SYNC_DDL_DELAY_S = build_flavor_timeout(
+    10, slow_build_timeout=SLOW_BUILD_SYNC_DDL_DELAY_S)
+# s3 can behave as a slow build.
+if IS_S3:
+  SYNC_DDL_DELAY_S = SLOW_BUILD_SYNC_DDL_DELAY_S
 
 
 class TestCatalogdHA(CustomClusterTestSuite):
@@ -430,7 +438,8 @@ class TestCatalogdHA(CustomClusterTestSuite):
 
   @CustomClusterTestSuite.with_args(
     statestored_args="--use_subscriber_id_as_catalogd_priority=true",
-    
catalogd_args="--debug_actions='catalogd_wait_sync_ddl_version_delay:SLEEP@5000'",
+    
catalogd_args="--debug_actions='catalogd_wait_sync_ddl_version_delay:SLEEP@{0}'"
+                  .format(SYNC_DDL_DELAY_S * 1000),
     start_args="--enable_catalogd_ha")
   def test_catalogd_failover_with_sync_ddl(self, unique_database):
     """Tests for Catalog Service force fail-over when running DDL with SYNC_DDL
@@ -451,6 +460,7 @@ class TestCatalogdHA(CustomClusterTestSuite):
     handle = client.execute_async(ddl_query.format(database=unique_database))
 
     # Restart standby catalogd with force_catalogd_active as true.
+    start_s = time.time()
     catalogds[1].kill()
     catalogds[1].start(wait_until_ready=True,
                        additional_args="--force_catalogd_active=true")
@@ -459,9 +469,14 @@ class TestCatalogdHA(CustomClusterTestSuite):
     catalogd_service_1.wait_for_metric_value(
         "catalog-server.active-status", expected_value=False, timeout=15)
     assert(not 
catalogd_service_1.get_metric_value("catalog-server.active-status"))
+    elapsed_s = time.time() - start_s
+    assert elapsed_s < SYNC_DDL_DELAY_S, \
+        "Catalogd failover took %s seconds to complete" % (elapsed_s)
+    LOG.info("Catalogd failover took %s seconds to complete" % 
round(elapsed_s, 1))
 
     # Verify that the query is failed due to the Catalogd HA fail-over.
-    self.wait_for_state(handle, QueryState.EXCEPTION, 30, client=client)
+    self.wait_for_state(
+        handle, QueryState.EXCEPTION, SYNC_DDL_DELAY_S * 2 + 10, client=client)
     client.close()
 
   @CustomClusterTestSuite.with_args(

Reply via email to