XComp commented on a change in pull request #14033:
URL: https://github.com/apache/flink/pull/14033#discussion_r523122630



##########
File path: flink-end-to-end-tests/test-scripts/common.sh
##########
@@ -587,9 +588,25 @@ function tm_kill_all {
 
 # Kills all processes that match the given name.
 function kill_all {
-  local pid=`jps | grep -E "${1}" | cut -d " " -f 1 || true`
-  kill ${pid} 2> /dev/null || true
-  wait ${pid} 2> /dev/null || true
+  # using ps instead of jps to identify jvms in shutdown as well, so that we 
can wait for the shutdown to finish (jps doesn't show killed JVMs while 
shutting down)
+  # use awk to strip leading and trailing whitespaces
+  # use cut to get the pid
+  for pid in $(ps ax | grep "java" | grep -E "${1}" | awk '{$1=$1;print}' | 
cut -d " " -f 1)
+  do
+      echo "Waiting till process is stopped: pid = $pid pattern = '${1}'"
+      kill ${pid} 2> /dev/null || true
+      if [[ "$OS_TYPE" == "mac" ]]; then
+          # works on mac, but does seem to return before the process has 
finished on Linux
+          wait ${pid} 2> /dev/null || true
+      else
+          # use tail to wait for a process to finish: 
https://stackoverflow.com/questions/1058047/wait-for-a-process-to-finish/11719943
+          timeout 60 tail --pid=${pid} -f /dev/null
+          if [ "$?" -eq 124 ]; then
+            echo "Process (pid = $pid) didn't stop within 60 seconds. Killing 
it:"
+            kill -9 $pid
+          fi
+      fi

Review comment:
       Shouldn't we move that out into its own common utility function 
considering that we used almost the exact same code in PR #14062 
([FLINK-17470](https://issues.apache.org/jira/browse/FLINK-17470))?

##########
File path: flink-end-to-end-tests/test-scripts/test-runner-common.sh
##########
@@ -110,10 +114,40 @@ function post_test_validation {
     fi
 }
 
+# returns the number of allocated ports
+function get_num_ports {
+    # "ps --ppid 2 -p 2 --deselect" shows all non-kernel processes
+    # "ps --ppid $$" shows all children of this bash process
+    # "ps -o pid= -o comm=" removes the header line
+    echo $(sudo netstat -tulpn | wc -l)

Review comment:
       why do we have to run sudo here? Isn't the test executed by `root`? 🤔 

##########
File path: flink-end-to-end-tests/test-scripts/common.sh
##########
@@ -822,11 +839,20 @@ internal_run_with_timeout() {
 
   (
       command_pid=$BASHPID
-      (sleep "${timeout_in_seconds}" # set a timeout for this command
-      echo "${command_label:-"The command '${command}'"} (pid: $command_pid) 
did not finish after $timeout_in_seconds seconds."
-      eval "${on_failure}"
-      kill "$command_pid") & watchdog_pid=$!
+      (# this subshell contains the watchdog
+        local wakeup_time=$(( ${timeout_in_seconds} + $(date +%s) ))
+        while true; do
+          sleep 1
+          if [ $wakeup_time -le $(date +%s) ]; then
+            echo "${command_label:-"The command '${command}'"} (pid: 
$command_pid) did not finish after $timeout_in_seconds seconds."
+            eval "${on_failure}"
+            kill "$command_pid"
+            pkill -P "$command_pid"

Review comment:
       I couldn't verify it that's why I'm asking: Are you sure that this is 
what you want to do? AFAIK, `pkill` would need some `pattern` and `-P` just 
restricts the parent PID?! 🤔 




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to