Tobias Hofer created FLINK-31064: ------------------------------------ Summary: Error while retrieving the leader gateway Key: FLINK-31064 URL: https://issues.apache.org/jira/browse/FLINK-31064 Project: Flink Issue Type: Bug Components: Kubernetes Operator Affects Versions: kubernetes-operator-1.3.1 Environment:
Reporter: Tobias Hofer Attachments: jobmanager_log.txt JobManager enters a corrupt state after a restart (e.g. after increasing restartNonce). {code:java} WARN | flink-akka.actor.default-dispatcher-5 | RpcGatewayRetriever | Error while retrieving the leader gateway. Retrying to connect to akka.tcp://flink@my-session.flink:6123/user/rpc/resourcemanager_*. org.apache.flink.util.concurrent.FutureUtils$RetryException: Could not complete the operation. Number of retries has been exhausted. at org.apache.flink.util.concurrent.FutureUtils.lambda$retryOperationWithDelay$6(FutureUtils.java:293) at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(Unknown Source) at java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(Unknown Source) at java.base/java.util.concurrent.CompletableFuture.postComplete(Unknown Source) at java.base/java.util.concurrent.CompletableFuture.completeExceptionally(Unknown Source) at org.apache.flink.util.concurrent.FutureUtils.doForward(FutureUtils.java:1275) at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$null$1(ClassLoadingUtils.java:93) at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68) at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$guardCompletionWithContextClassLoader$2(ClassLoadingUtils.java:92) at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(Unknown Source) at java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(Unknown Source) at java.base/java.util.concurrent.CompletableFuture.postComplete(Unknown Source) at java.base/java.util.concurrent.CompletableFuture.completeExceptionally(Unknown Source) at scala.concurrent.java8.FuturesConvertersImpl$CF$$anon$1.accept(FutureConvertersImpl.scala:61) at scala.concurrent.java8.FuturesConvertersImpl$CF$$anon$1.accept(FutureConvertersImpl.scala:53) at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(Unknown Source) at 
java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(Unknown Source) at java.base/java.util.concurrent.CompletableFuture$Completion.run(Unknown Source) at java.base/java.lang.Thread.run(Unknown Source) Caused by: java.util.concurrent.CompletionException: org.apache.flink.runtime.rpc.exceptions.RpcConnectionException: Could not connect to rpc endpoint under address akka.tcp://flink@my-session.flink:6123/user/rpc/resourcemanager_*. at org.apache.flink.runtime.rpc.akka.AkkaRpcService.lambda$resolveActorAddress$11(AkkaRpcService.java:604) at scala.concurrent.java8.FuturesConvertersImpl$CF$$anon$1.accept(FutureConvertersImpl.scala:59) ... 5 more Caused by: org.apache.flink.runtime.rpc.exceptions.RpcConnectionException: Could not connect to rpc endpoint under address akka.tcp://flink@my-session.flink:6123/user/rpc/resourcemanager_*. ... 7 more Caused by: akka.actor.ActorNotFound: Actor not found for: ActorSelection[Anchor(akka://flink/), Path(/user/rpc/resourcemanager_*)] at akka.actor.ActorSelection.$anonfun$resolveOne$1(ActorSelection.scala:74) at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:60) at akka.dispatch.BatchingExecutor$AbstractBatch.processBatch(BatchingExecutor.scala:63) at akka.dispatch.BatchingExecutor$Batch.run(BatchingExecutor.scala:81) at akka.dispatch.internal.SameThreadExecutionContext$$anon$1.unbatchedExecute(SameThreadExecutionContext.scala:21) at akka.dispatch.BatchingExecutor.execute(BatchingExecutor.scala:130) at akka.dispatch.BatchingExecutor.execute$(BatchingExecutor.scala:124) at akka.dispatch.internal.SameThreadExecutionContext$$anon$1.execute(SameThreadExecutionContext.scala:20) at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:68) at scala.concurrent.impl.Promise$DefaultPromise.dispatchOrAddCallback(Promise.scala:312) at scala.concurrent.impl.Promise$DefaultPromise.onComplete(Promise.scala:303) at akka.actor.ActorSelection.resolveOne(ActorSelection.scala:72) at 
akka.actor.ActorSelection.resolveOne(ActorSelection.scala:89) at akka.actor.ActorSelection.resolveOne(ActorSelection.scala:130) at org.apache.flink.runtime.rpc.akka.AkkaRpcService.resolveActorAddress(AkkaRpcService.java:598) at org.apache.flink.runtime.rpc.akka.AkkaRpcService.connectInternal(AkkaRpcService.java:549) at org.apache.flink.runtime.rpc.akka.AkkaRpcService.connect(AkkaRpcService.java:232) at org.apache.flink.runtime.webmonitor.retriever.impl.RpcGatewayRetriever.lambda$null$0(RpcGatewayRetriever.java:66) at java.base/java.util.concurrent.CompletableFuture.uniComposeStage(Unknown Source) at java.base/java.util.concurrent.CompletableFuture.thenCompose(Unknown Source) at org.apache.flink.runtime.webmonitor.retriever.impl.RpcGatewayRetriever.lambda$createGateway$1(RpcGatewayRetriever.java:64) at org.apache.flink.util.concurrent.FutureUtils.retryOperationWithDelay(FutureUtils.java:259) at org.apache.flink.util.concurrent.FutureUtils.lambda$null$4(FutureUtils.java:279) at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Unknown Source) at java.base/java.util.concurrent.FutureTask.run(Unknown Source) at org.apache.flink.runtime.concurrent.akka.ActorSystemScheduledExecutorAdapter$ScheduledFutureTask.run(ActorSystemScheduledExecutorAdapter.java:171) at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68) at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$withContextClassLoader$0(ClassLoadingUtils.java:41) at akka.dispatch.TaskInvocation.run(AbstractDispatcher.scala:49) at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(ForkJoinExecutorConfigurator.scala:48) at java.base/java.util.concurrent.ForkJoinTask.doExec(Unknown Source) at java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(Unknown Source) at java.base/java.util.concurrent.ForkJoinPool.scan(Unknown Source) at java.base/java.util.concurrent.ForkJoinPool.runWorker(Unknown Source) at 
java.base/java.util.concurrent.ForkJoinWorkerThread.run(Unknown Source) {code} I am running Flink v1.16 on GKE v1.24. The hostname ‘my-session.flink’ can be resolved successfully in k8s. -- This message was sent by Atlassian Jira (v8.20.10#820010)