[jira] [Commented] (FLINK-10842) Waiting loops are broken in e2e/common.sh

ASF GitHub Bot (JIRA) Fri, 23 Nov 2018 02:35:40 -0800


    [ 
https://issues.apache.org/jira/browse/FLINK-10842?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16696646#comment-16696646
 ]


ASF GitHub Bot commented on FLINK-10842:
----------------------------------------

asfgit closed pull request #7073: [FLINK-10842][E2E tests] fix broken waiting 
loops in common.sh
URL: https://github.com/apache/flink/pull/7073
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/flink-end-to-end-tests/test-scripts/common.sh 
b/flink-end-to-end-tests/test-scripts/common.sh
index 275d9c49f4b..645cf2a2f1e 100644
--- a/flink-end-to-end-tests/test-scripts/common.sh
+++ b/flink-end-to-end-tests/test-scripts/common.sh
@@ -212,19 +212,22 @@ function start_local_zk {
 function wait_dispatcher_running {
   # wait at most 10 seconds until the dispatcher is up
   local QUERY_URL="${REST_PROTOCOL}://${NODENAME}:8081/taskmanagers"
-  for i in {1..10}; do
+  local TIMEOUT=10
+  for i in $(seq 1 ${TIMEOUT}); do
     # without the || true this would exit our script if the JobManager is not 
yet up
     QUERY_RESULT=$(curl ${CURL_SSL_ARGS} "$QUERY_URL" 2> /dev/null || true)
 
     # ensure the taskmanagers field is there at all and is not empty
     if [[ ${QUERY_RESULT} =~ \{\"taskmanagers\":\[.+\]\} ]]; then
       echo "Dispatcher REST endpoint is up."
-      break
+      return
     fi
 
     echo "Waiting for dispatcher REST endpoint to come up..."
     sleep 1
   done
+  echo "Dispatcher REST endpoint has not started within a timeout of 
${TIMEOUT} sec"
+  exit 1
 }
 
 function start_cluster {
@@ -242,30 +245,45 @@ function start_taskmanagers {
 }
 
 function start_and_wait_for_tm {
-  local url="${REST_PROTOCOL}://${NODENAME}:8081/taskmanagers"
-
-  tm_query_result=$(curl ${CURL_SSL_ARGS} -s "${url}")
-
+  tm_query_result=`query_running_tms`
   # we assume that the cluster is running
   if ! [[ ${tm_query_result} =~ \{\"taskmanagers\":\[.*\]\} ]]; then
     echo "Your cluster seems to be unresponsive at the moment: 
${tm_query_result}" 1>&2
     exit 1
   fi
 
-  running_tms=`curl ${CURL_SSL_ARGS} -s "${url}" | grep -o "id" | wc -l`
-
+  running_tms=`query_number_of_running_tms`
   ${FLINK_DIR}/bin/taskmanager.sh start
+  wait_for_number_of_running_tms $((running_tms+1))
+}
 
-  for i in {1..10}; do
-    local new_running_tms=`curl ${CURL_SSL_ARGS} -s "${url}" | grep -o "id" | 
wc -l`
-    if [ $((new_running_tms-running_tms)) -eq 0 ]; then
-      echo "TaskManager is not yet up."
+function query_running_tms {
+  local url="${REST_PROTOCOL}://${NODENAME}:8081/taskmanagers"
+  curl ${CURL_SSL_ARGS} -s "${url}"
+}
+
+function query_number_of_running_tms {
+  query_running_tms | grep -o "id" | wc -l
+}
+
+function wait_for_number_of_running_tms {
+  local TM_NUM_TO_WAIT=${1}
+  local TIMEOUT_COUNTER=10
+  local TIMEOUT_INC=4
+  local TIMEOUT=$(( $TIMEOUT_COUNTER * $TIMEOUT_INC ))
+  local TM_NUM_TEXT="Number of running task managers"
+  for i in $(seq 1 ${TIMEOUT_COUNTER}); do
+    local TM_NUM=`query_number_of_running_tms`
+    if [ $((TM_NUM - TM_NUM_TO_WAIT)) -eq 0 ]; then
+      echo "${TM_NUM_TEXT} has reached ${TM_NUM_TO_WAIT}."
+      return
     else
-      echo "TaskManager is up."
-      break
+      echo "${TM_NUM_TEXT} ${TM_NUM} is not yet ${TM_NUM_TO_WAIT}."
     fi
-    sleep 4
+    sleep ${TIMEOUT_INC}
   done
+  echo "${TM_NUM_TEXT} has not reached ${TM_NUM_TO_WAIT} within a timeout of 
${TIMEOUT} sec"
+  exit 1
 }
 
 function check_logs_for_errors {
@@ -376,17 +394,20 @@ function wait_for_job_state_transition {
 }
 
 function wait_job_running {
-  for i in {1..10}; do
+  local TIMEOUT=10
+  for i in $(seq 1 ${TIMEOUT}); do
     JOB_LIST_RESULT=$("$FLINK_DIR"/bin/flink list -r | grep "$1")
 
     if [[ "$JOB_LIST_RESULT" == "" ]]; then
       echo "Job ($1) is not yet running."
     else
       echo "Job ($1) is running."
-      break
+      return
     fi
     sleep 1
   done
+  echo "Job ($1) has not started within a timeout of ${TIMEOUT} sec"
+  exit 1
 }
 
 function wait_job_terminal_state {
diff --git 
a/flink-end-to-end-tests/test-scripts/test_queryable_state_restart_tm.sh 
b/flink-end-to-end-tests/test-scripts/test_queryable_state_restart_tm.sh
index 56d811e14a1..db0174a2160 100755
--- a/flink-end-to-end-tests/test-scripts/test_queryable_state_restart_tm.sh
+++ b/flink-end-to-end-tests/test-scripts/test_queryable_state_restart_tm.sh
@@ -86,6 +86,7 @@ function run_test() {
     fi
 
     kill_random_taskmanager
+    wait_for_number_of_running_tms 0
 
     latest_snapshot_count=$(cat $FLINK_DIR/log/*out* | grep "on snapshot" | 
tail -n 1 | awk '{print $4}')
     echo "Latest snapshot count was ${latest_snapshot_count}"
diff --git 
a/flink-end-to-end-tests/test-scripts/test_resume_externalized_checkpoints.sh 
b/flink-end-to-end-tests/test-scripts/test_resume_externalized_checkpoints.sh
index 35fe30b6b25..48d68c98e9d 100755
--- 
a/flink-end-to-end-tests/test-scripts/test_resume_externalized_checkpoints.sh
+++ 
b/flink-end-to-end-tests/test-scripts/test_resume_externalized_checkpoints.sh
@@ -100,11 +100,10 @@ fi
 
 DATASTREAM_JOB=$($JOB_CMD | grep "Job has been submitted with JobID" | sed 
's/.* //g')
 
-wait_job_running $DATASTREAM_JOB
-
 if [[ $SIMULATE_FAILURE == "true" ]]; then
   wait_job_terminal_state $DATASTREAM_JOB FAILED
 else
+  wait_job_running $DATASTREAM_JOB
   wait_num_checkpoints $DATASTREAM_JOB 1
   wait_oper_metric_num_in_records SemanticsCheckMapper.0 200
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> Waiting loops are broken in e2e/common.sh
> -----------------------------------------
>
>                 Key: FLINK-10842
>                 URL: https://issues.apache.org/jira/browse/FLINK-10842
>             Project: Flink
>          Issue Type: Bug
>          Components: E2E Tests
>    Affects Versions: 1.7.0
>            Reporter: Andrey Zagrebin
>            Assignee: Andrey Zagrebin
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 1.8.0
>
>
> There are 3 loops in flink-end-to-end-tests/test-scripts/common.sh where the 
> script waits for some event to happen (for i in \{1..10}; do):
>  - wait_dispatcher_running
>  - start_and_wait_for_tm
>  - wait_job_running
> All loops have 10 iterations and the loop breaks if the awaited event 
> happens. If timeout occurs then the script does not fail and the function 
> just continues after 10 iterations ignoring that the awaited event did not 
> happen.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

[jira] [Commented] (FLINK-10842) Waiting loops are broken in e2e/common.sh

Reply via email to