mateczagany commented on code in PR #777: URL: https://github.com/apache/flink-kubernetes-operator/pull/777#discussion_r1489096666
########## flink-kubernetes-operator/src/main/java/org/apache/flink/kubernetes/operator/service/AbstractFlinkService.java: ########## @@ -898,58 +901,54 @@ public JobDetailsInfo getJobDetailsInfo(JobID jobID, Configuration conf) throws } } - /** Wait until the FLink cluster has completely shut down. */ - @VisibleForTesting - void waitForClusterShutdown(String namespace, String clusterId, long shutdownTimeout) { - LOG.info("Waiting for cluster shutdown..."); - - boolean jobManagerRunning = true; - boolean taskManagerRunning = true; - boolean serviceRunning = true; + /** Returns a list of Kubernetes Deployment names for given cluster. */ + protected abstract List<String> getDeploymentNames(String namespace, String clusterId); - for (int i = 0; i < shutdownTimeout; i++) { - if (jobManagerRunning) { - PodList jmPodList = getJmPodList(namespace, clusterId); + /** Wait until the FLink cluster has completely shut down. */ + protected void waitForClusterShutdown( + String namespace, String clusterId, long shutdownTimeout) { + long timeoutAt = System.currentTimeMillis() + shutdownTimeout * 1000; + LOG.info("Waiting {} seconds for cluster shutdown...", shutdownTimeout); - if (jmPodList == null || jmPodList.getItems().isEmpty()) { - jobManagerRunning = false; - } - } - if (taskManagerRunning) { - PodList tmPodList = getTmPodList(namespace, clusterId); + for (var deploymentName : getDeploymentNames(namespace, clusterId)) { + long deploymentTimeout = timeoutAt - System.currentTimeMillis(); - if (tmPodList.getItems().isEmpty()) { - taskManagerRunning = false; - } + if (!waitForDeploymentToBeRemoved(namespace, deploymentName, deploymentTimeout)) { + LOG.error( Review Comment: In case of a timeout, we just log the message as an error, but during an upgrade this means that the operator will simply proceed to create a new `Deployment` and then get an `AlreadyExists` error, which may result in a deadlock. I am not sure what else we can do in this case. 
-- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@flink.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org