[ 
https://issues.apache.org/jira/browse/FLINK-36570?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Matthias Pohl updated FLINK-36570:
----------------------------------
    Description: 
I submit batch jobs to my session cluster via the REST API. The JobManager pod 
restarts when the error below occurs.

This seems to happen when a new leader is elected while a job is still running. 
The job is then sent to the new leader and fails with the error.
{code:java}

2024-10-18 03:07:22,107 INFO  
org.apache.flink.client.deployment.application.executors.EmbeddedExecutor [] - 
Submitting Job with JobId=a9d339b6ba26ab51746514cc7aea0537.
2024-10-18 03:07:22,546 INFO  
org.apache.flink.kubernetes.kubeclient.resources.KubernetesLeaderElector [] - 
New leader elected 57c4be1d-58f0-4c2c-89d8-11aefe1ec273 for 
flink-cluster-cluster-config-map.
2024-10-18 03:07:22,549 ERROR 
org.apache.flink.runtime.entrypoint.ClusterEntrypoint        [] - Fatal error 
occurred in the cluster entrypoint.
org.apache.flink.util.FlinkException: 
JobMaster for job 99c02051a54c77499f53f09cd4b7a0d9 failed.
    at 
org.apache.flink.runtime.dispatcher.Dispatcher.jobMasterFailed(Dispatcher.java:1360)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at 
org.apache.flink.runtime.dispatcher.Dispatcher.jobManagerRunnerFailed(Dispatcher.java:772)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at 
org.apache.flink.runtime.dispatcher.Dispatcher.lambda$runJob$6(Dispatcher.java:694)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at java.util.concurrent.CompletableFuture.uniHandle(Unknown Source) ~[?:?]
    at java.util.concurrent.CompletableFuture$UniHandle.tryFire(Unknown Source) 
~[?:?]
    at java.util.concurrent.CompletableFuture$Completion.run(Unknown Source) 
~[?:?]
    at 
org.apache.flink.runtime.rpc.akka.AkkaRpcActor.lambda$handleRunAsync$4(AkkaRpcActor.java:453)
 ~[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
    at 
org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68)
 ~[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
    at 
org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:453)
 ~[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
    at 
org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:218)
 ~[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
    at 
org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:84)
 ~[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
    at 
org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:168)
 ~[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
    at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:24) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
    at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:20) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
    at scala.PartialFunction.applyOrElse(PartialFunction.scala:127) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
    at scala.PartialFunction.applyOrElse$(PartialFunction.scala:126) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
    at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:20) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
    at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:175) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
    at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
    at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
    at akka.actor.Actor.aroundReceive(Actor.scala:537) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
    at akka.actor.Actor.aroundReceive$(Actor.scala:535) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
    at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:220) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
    at akka.actor.ActorCell.receiveMessage(ActorCell.scala:579) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
    at akka.actor.ActorCell.invoke(ActorCell.scala:547) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
    at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:270) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
    at akka.dispatch.Mailbox.run(Mailbox.scala:231) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
    at akka.dispatch.Mailbox.exec(Mailbox.scala:243) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
    at java.util.concurrent.ForkJoinTask.doExec(Unknown Source) [?:?]
    at java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(Unknown Source) 
[?:?]
    at java.util.concurrent.ForkJoinPool.scan(Unknown Source) [?:?]
    at java.util.concurrent.ForkJoinPool.runWorker(Unknown Source) [?:?]
    at java.util.concurrent.ForkJoinWorkerThread.run(Unknown Source) [?:?]
Caused by: org.apache.flink.util.FlinkException: Could not suspend the job 
manager.
    at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.lambda$null$13(JobMasterServiceLeadershipRunner.java:438)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.runIfStateRunning(JobMasterServiceLeadershipRunner.java:456)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.lambda$handleAsyncOperationError$14(JobMasterServiceLeadershipRunner.java:436)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at java.util.concurrent.CompletableFuture.uniWhenComplete(Unknown Source) 
~[?:?]
    at java.util.concurrent.CompletableFuture.uniWhenCompleteStage(Unknown 
Source) ~[?:?]
    at java.util.concurrent.CompletableFuture.whenComplete(Unknown Source) 
~[?:?]
    at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.handleAsyncOperationError(JobMasterServiceLeadershipRunner.java:433)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.stopJobMasterServiceProcessAsync(JobMasterServiceLeadershipRunner.java:405)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.runIfStateRunning(JobMasterServiceLeadershipRunner.java:456)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.revokeLeadership(JobMasterServiceLeadershipRunner.java:390)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at 
org.apache.flink.runtime.leaderelection.DefaultLeaderElectionService.onRevokeLeadership(DefaultLeaderElectionService.java:236)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at 
org.apache.flink.runtime.leaderelection.DefaultMultipleComponentLeaderElectionService.lambda$forEachLeaderElectionEventHandler$2(DefaultMultipleComponentLeaderElectionService.java:225)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source) ~[?:?]
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source) ~[?:?]
    at java.lang.Thread.run(Unknown Source) ~[?:?]
Caused by: java.util.concurrent.CompletionException: 
java.lang.UnsupportedOperationException: Still waiting for the leadership.
    at java.util.concurrent.CompletableFuture.encodeThrowable(Unknown Source) 
~[?:?]
    at java.util.concurrent.CompletableFuture.uniComposeStage(Unknown Source) 
~[?:?]
    at java.util.concurrent.CompletableFuture.thenCompose(Unknown Source) ~[?:?]
    at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.stopJobMasterServiceProcessAsync(JobMasterServiceLeadershipRunner.java:398)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.runIfStateRunning(JobMasterServiceLeadershipRunner.java:456)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.revokeLeadership(JobMasterServiceLeadershipRunner.java:390)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at 
org.apache.flink.runtime.leaderelection.DefaultLeaderElectionService.onRevokeLeadership(DefaultLeaderElectionService.java:236)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at 
org.apache.flink.runtime.leaderelection.DefaultMultipleComponentLeaderElectionService.lambda$forEachLeaderElectionEventHandler$2(DefaultMultipleComponentLeaderElectionService.java:225)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source) ~[?:?]
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source) ~[?:?]
    at java.lang.Thread.run(Unknown Source) ~[?:?]
Caused by: java.lang.UnsupportedOperationException: Still waiting for the 
leadership.
    at 
org.apache.flink.runtime.jobmaster.JobMasterServiceProcess$WaitingForLeadership.getLeaderSessionId(JobMasterServiceProcess.java:71)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.stopJobMasterServiceProcess(JobMasterServiceLeadershipRunner.java:414)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.callIfRunning(JobMasterServiceLeadershipRunner.java:469)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.lambda$stopJobMasterServiceProcessAsync$12(JobMasterServiceLeadershipRunner.java:400)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at java.util.concurrent.CompletableFuture.uniComposeStage(Unknown Source) 
~[?:?]
    at java.util.concurrent.CompletableFuture.thenCompose(Unknown Source) ~[?:?]
    at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.stopJobMasterServiceProcessAsync(JobMasterServiceLeadershipRunner.java:398)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.runIfStateRunning(JobMasterServiceLeadershipRunner.java:456)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.revokeLeadership(JobMasterServiceLeadershipRunner.java:390)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at 
org.apache.flink.runtime.leaderelection.DefaultLeaderElectionService.onRevokeLeadership(DefaultLeaderElectionService.java:236)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at 
org.apache.flink.runtime.leaderelection.DefaultMultipleComponentLeaderElectionService.lambda$forEachLeaderElectionEventHandler$2(DefaultMultipleComponentLeaderElectionService.java:225)
 ~[flink-dist-1.17.2.jar:1.17.2]
    at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source) ~[?:?]
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source) ~[?:?]
    at java.lang.Thread.run(Unknown Source) ~[?:?]    
INFO  org.apache.flink.runtime.entrypoint.ClusterEntrypoint        [] - 
Shutting KubernetesSessionClusterEntrypoint down with application status 
UNKNOWN. Diagnostics Cluster entrypoint has been closed externally..
INFO  org.apache.flink.runtime.blob.BlobServer                     [] - Stopped BLOB 
server at 0.0.0.0:6124
{code}

  was:
I commit batch jobs to my session cluster with rest api. The jobmanager pod 
would restart when occurred the error.

Seems to be because the new leader elected and exist running job at the same 
time. Then the job send to the new leader and error.
{code:java}

2024-10-18 03:07:22,107 INFO  
org.apache.flink.client.deployment.application.executors.EmbeddedExecutor [] - 
Submitting Job with JobId=a9d339b6ba26ab51746514cc7aea0537.2024-10-18 
03:07:22,546 INFO  
org.apache.flink.kubernetes.kubeclient.resources.KubernetesLeaderElector [] - 
New leader elected 57c4be1d-58f0-4c2c-89d8-11aefe1ec273 for 
flink-cluster-cluster-config-map.2024-10-18 03:07:22,549 ERROR 
org.apache.flink.runtime.entrypoint.ClusterEntrypoint        [] - Fatal error 
occurred in the cluster entrypoint.org.apache.flink.util.FlinkException: 
JobMaster for job 99c02051a54c77499f53f09cd4b7a0d9 failed.at 
org.apache.flink.runtime.dispatcher.Dispatcher.jobMasterFailed(Dispatcher.java:1360)
 ~[flink-dist-1.17.2.jar:1.17.2]


at 
org.apache.flink.runtime.dispatcher.Dispatcher.jobManagerRunnerFailed(Dispatcher.java:772)
 ~[flink-dist-1.17.2.jar:1.17.2]


at 
org.apache.flink.runtime.dispatcher.Dispatcher.lambda$runJob$6(Dispatcher.java:694)
 ~[flink-dist-1.17.2.jar:1.17.2]


at java.util.concurrent.CompletableFuture.uniHandle(Unknown Source) ~[?:?]


at java.util.concurrent.CompletableFuture$UniHandle.tryFire(Unknown Source) 
~[?:?]


at java.util.concurrent.CompletableFuture$Completion.run(Unknown Source) ~[?:?]


at 
org.apache.flink.runtime.rpc.akka.AkkaRpcActor.lambda$handleRunAsync$4(AkkaRpcActor.java:453)
 ~[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]


at 
org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68)
 ~[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]


at 
org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:453)
 ~[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]


at 
org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:218)
 ~[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]


at 
org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:84)
 ~[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]


at 
org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:168)
 ~[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]


at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:24) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]


at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:20) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]


at scala.PartialFunction.applyOrElse(PartialFunction.scala:127) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]


at scala.PartialFunction.applyOrElse$(PartialFunction.scala:126) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]


at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:20) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]


at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:175) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]


at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]


at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]


at akka.actor.Actor.aroundReceive(Actor.scala:537) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]


at akka.actor.Actor.aroundReceive$(Actor.scala:535) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]


at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:220) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]


at akka.actor.ActorCell.receiveMessage(ActorCell.scala:579) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]


at akka.actor.ActorCell.invoke(ActorCell.scala:547) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]


at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:270) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]


at akka.dispatch.Mailbox.run(Mailbox.scala:231) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]


at akka.dispatch.Mailbox.exec(Mailbox.scala:243) 
[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]


at java.util.concurrent.ForkJoinTask.doExec(Unknown Source) [?:?]


at java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(Unknown Source) 
[?:?]


at java.util.concurrent.ForkJoinPool.scan(Unknown Source) [?:?]


at java.util.concurrent.ForkJoinPool.runWorker(Unknown Source) [?:?]


at java.util.concurrent.ForkJoinWorkerThread.run(Unknown Source) [?:?]


Caused by: org.apache.flink.util.FlinkException: Could not suspend the job manager.


at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.lambda$null$13(JobMasterServiceLeadershipRunner.java:438)
 ~[flink-dist-1.17.2.jar:1.17.2]


at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.runIfStateRunning(JobMasterServiceLeadershipRunner.java:456)
 ~[flink-dist-1.17.2.jar:1.17.2]


at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.lambda$handleAsyncOperationError$14(JobMasterServiceLeadershipRunner.java:436)
 ~[flink-dist-1.17.2.jar:1.17.2]


at java.util.concurrent.CompletableFuture.uniWhenComplete(Unknown Source) ~[?:?]


at java.util.concurrent.CompletableFuture.uniWhenCompleteStage(Unknown Source) 
~[?:?]


at java.util.concurrent.CompletableFuture.whenComplete(Unknown Source) ~[?:?]


at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.handleAsyncOperationError(JobMasterServiceLeadershipRunner.java:433)
 ~[flink-dist-1.17.2.jar:1.17.2]


at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.stopJobMasterServiceProcessAsync(JobMasterServiceLeadershipRunner.java:405)
 ~[flink-dist-1.17.2.jar:1.17.2]


at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.runIfStateRunning(JobMasterServiceLeadershipRunner.java:456)
 ~[flink-dist-1.17.2.jar:1.17.2]


at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.revokeLeadership(JobMasterServiceLeadershipRunner.java:390)
 ~[flink-dist-1.17.2.jar:1.17.2]


at 
org.apache.flink.runtime.leaderelection.DefaultLeaderElectionService.onRevokeLeadership(DefaultLeaderElectionService.java:236)
 ~[flink-dist-1.17.2.jar:1.17.2]


at 
org.apache.flink.runtime.leaderelection.DefaultMultipleComponentLeaderElectionService.lambda$forEachLeaderElectionEventHandler$2(DefaultMultipleComponentLeaderElectionService.java:225)
 ~[flink-dist-1.17.2.jar:1.17.2]


at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source) ~[?:?]


at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source) ~[?:?]


at java.lang.Thread.run(Unknown Source) ~[?:?]


Caused by: java.util.concurrent.CompletionException: 
java.lang.UnsupportedOperationException: Still waiting for the leadership.


at java.util.concurrent.CompletableFuture.encodeThrowable(Unknown Source) ~[?:?]


at java.util.concurrent.CompletableFuture.uniComposeStage(Unknown Source) ~[?:?]


at java.util.concurrent.CompletableFuture.thenCompose(Unknown Source) ~[?:?]


at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.stopJobMasterServiceProcessAsync(JobMasterServiceLeadershipRunner.java:398)
 ~[flink-dist-1.17.2.jar:1.17.2]


at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.runIfStateRunning(JobMasterServiceLeadershipRunner.java:456)
 ~[flink-dist-1.17.2.jar:1.17.2]


at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.revokeLeadership(JobMasterServiceLeadershipRunner.java:390)
 ~[flink-dist-1.17.2.jar:1.17.2]


at 
org.apache.flink.runtime.leaderelection.DefaultLeaderElectionService.onRevokeLeadership(DefaultLeaderElectionService.java:236)
 ~[flink-dist-1.17.2.jar:1.17.2]


at 
org.apache.flink.runtime.leaderelection.DefaultMultipleComponentLeaderElectionService.lambda$forEachLeaderElectionEventHandler$2(DefaultMultipleComponentLeaderElectionService.java:225)
 ~[flink-dist-1.17.2.jar:1.17.2]


at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source) ~[?:?]


at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source) ~[?:?]


at java.lang.Thread.run(Unknown Source) ~[?:?]


Caused by: java.lang.UnsupportedOperationException: Still waiting for the 
leadership.


at 
org.apache.flink.runtime.jobmaster.JobMasterServiceProcess$WaitingForLeadership.getLeaderSessionId(JobMasterServiceProcess.java:71)
 ~[flink-dist-1.17.2.jar:1.17.2]


at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.stopJobMasterServiceProcess(JobMasterServiceLeadershipRunner.java:414)
 ~[flink-dist-1.17.2.jar:1.17.2]


at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.callIfRunning(JobMasterServiceLeadershipRunner.java:469)
 ~[flink-dist-1.17.2.jar:1.17.2]


at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.lambda$stopJobMasterServiceProcessAsync$12(JobMasterServiceLeadershipRunner.java:400)
 ~[flink-dist-1.17.2.jar:1.17.2]


at java.util.concurrent.CompletableFuture.uniComposeStage(Unknown Source) ~[?:?]


at java.util.concurrent.CompletableFuture.thenCompose(Unknown Source) ~[?:?]


at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.stopJobMasterServiceProcessAsync(JobMasterServiceLeadershipRunner.java:398)
 ~[flink-dist-1.17.2.jar:1.17.2]


at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.runIfStateRunning(JobMasterServiceLeadershipRunner.java:456)
 ~[flink-dist-1.17.2.jar:1.17.2]


at 
org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.revokeLeadership(JobMasterServiceLeadershipRunner.java:390)
 ~[flink-dist-1.17.2.jar:1.17.2]


at 
org.apache.flink.runtime.leaderelection.DefaultLeaderElectionService.onRevokeLeadership(DefaultLeaderElectionService.java:236)
 ~[flink-dist-1.17.2.jar:1.17.2]


at 
org.apache.flink.runtime.leaderelection.DefaultMultipleComponentLeaderElectionService.lambda$forEachLeaderElectionEventHandler$2(DefaultMultipleComponentLeaderElectionService.java:225)
 ~[flink-dist-1.17.2.jar:1.17.2]


at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source) ~[?:?]


at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source) ~[?:?]


at java.lang.Thread.run(Unknown Source) ~[?:?]    
INFO  org.apache.flink.runtime.entrypoint.ClusterEntrypoint        [] - 
Shutting KubernetesSessionClusterEntrypoint down with application status 
UNKNOWN. Diagnostics Cluster entrypoint has been closed externally..INFO  
org.apache.flink.runtime.blob.BlobServer                     [] - Stopped BLOB 
server at 0.0.0.0:6124

{code}
 

 


> Fatal error occurred in the cluster entrypoint
> ----------------------------------------------
>
>                 Key: FLINK-36570
>                 URL: https://issues.apache.org/jira/browse/FLINK-36570
>             Project: Flink
>          Issue Type: Bug
>          Components: Deployment / Kubernetes
>    Affects Versions: 1.17.2
>         Environment: * flink 1.17.2
>  * native kubernetes session cluster HA(3 jobmanager replica)
>            Reporter: zyh
>            Priority: Critical
>
> I submit batch jobs to my session cluster via the REST API. The JobManager pod 
> restarts when the error below occurs.
> This seems to happen when a new leader is elected while a job is still running. 
> The job is then sent to the new leader and fails with the error.
> {code:java}
> 2024-10-18 03:07:22,107 INFO  
> org.apache.flink.client.deployment.application.executors.EmbeddedExecutor [] 
> - Submitting Job with JobId=a9d339b6ba26ab51746514cc7aea0537.
> 2024-10-18 03:07:22,546 INFO  
> org.apache.flink.kubernetes.kubeclient.resources.KubernetesLeaderElector [] - 
> New leader elected 57c4be1d-58f0-4c2c-89d8-11aefe1ec273 for 
> flink-cluster-cluster-config-map.
> 2024-10-18 03:07:22,549 ERROR 
> org.apache.flink.runtime.entrypoint.ClusterEntrypoint        [] - Fatal error 
> occurred in the cluster entrypoint.
> org.apache.flink.util.FlinkException: 
> JobMaster for job 99c02051a54c77499f53f09cd4b7a0d9 failed.
>     at 
> org.apache.flink.runtime.dispatcher.Dispatcher.jobMasterFailed(Dispatcher.java:1360)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at 
> org.apache.flink.runtime.dispatcher.Dispatcher.jobManagerRunnerFailed(Dispatcher.java:772)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at 
> org.apache.flink.runtime.dispatcher.Dispatcher.lambda$runJob$6(Dispatcher.java:694)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at java.util.concurrent.CompletableFuture.uniHandle(Unknown Source) ~[?:?]
>     at java.util.concurrent.CompletableFuture$UniHandle.tryFire(Unknown 
> Source) ~[?:?]
>     at java.util.concurrent.CompletableFuture$Completion.run(Unknown Source) 
> ~[?:?]
>     at 
> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.lambda$handleRunAsync$4(AkkaRpcActor.java:453)
>  ~[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
>     at 
> org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68)
>  ~[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
>     at 
> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:453)
>  ~[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
>     at 
> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:218)
>  ~[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
>     at 
> org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:84)
>  ~[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
>     at 
> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:168)
>  ~[flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
>     at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:24) 
> [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
>     at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:20) 
> [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
>     at scala.PartialFunction.applyOrElse(PartialFunction.scala:127) 
> [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
>     at scala.PartialFunction.applyOrElse$(PartialFunction.scala:126) 
> [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
>     at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:20) 
> [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
>     at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:175) 
> [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
>     at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) 
> [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
>     at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) 
> [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
>     at akka.actor.Actor.aroundReceive(Actor.scala:537) 
> [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
>     at akka.actor.Actor.aroundReceive$(Actor.scala:535) 
> [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
>     at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:220) 
> [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
>     at akka.actor.ActorCell.receiveMessage(ActorCell.scala:579) 
> [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
>     at akka.actor.ActorCell.invoke(ActorCell.scala:547) 
> [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
>     at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:270) 
> [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
>     at akka.dispatch.Mailbox.run(Mailbox.scala:231) 
> [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
>     at akka.dispatch.Mailbox.exec(Mailbox.scala:243) 
> [flink-rpc-akka_27725420-d3ff-407e-864b-d8e6936565db.jar:1.17.2]
>     at java.util.concurrent.ForkJoinTask.doExec(Unknown Source) [?:?]
>     at java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(Unknown 
> Source) [?:?]
>     at java.util.concurrent.ForkJoinPool.scan(Unknown Source) [?:?]
>     at java.util.concurrent.ForkJoinPool.runWorker(Unknown Source) [?:?]
>     at java.util.concurrent.ForkJoinWorkerThread.run(Unknown Source) [?:?]
> Caused by: org.apache.flink.util.FlinkException: Could not suspend the job 
> manager.
>     at 
> org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.lambda$null$13(JobMasterServiceLeadershipRunner.java:438)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at 
> org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.runIfStateRunning(JobMasterServiceLeadershipRunner.java:456)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at 
> org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.lambda$handleAsyncOperationError$14(JobMasterServiceLeadershipRunner.java:436)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at java.util.concurrent.CompletableFuture.uniWhenComplete(Unknown Source) 
> ~[?:?]
>     at java.util.concurrent.CompletableFuture.uniWhenCompleteStage(Unknown 
> Source) ~[?:?]
>     at java.util.concurrent.CompletableFuture.whenComplete(Unknown Source) 
> ~[?:?]
>     at 
> org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.handleAsyncOperationError(JobMasterServiceLeadershipRunner.java:433)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at 
> org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.stopJobMasterServiceProcessAsync(JobMasterServiceLeadershipRunner.java:405)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at 
> org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.runIfStateRunning(JobMasterServiceLeadershipRunner.java:456)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at 
> org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.revokeLeadership(JobMasterServiceLeadershipRunner.java:390)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at 
> org.apache.flink.runtime.leaderelection.DefaultLeaderElectionService.onRevokeLeadership(DefaultLeaderElectionService.java:236)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at 
> org.apache.flink.runtime.leaderelection.DefaultMultipleComponentLeaderElectionService.lambda$forEachLeaderElectionEventHandler$2(DefaultMultipleComponentLeaderElectionService.java:225)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source) 
> ~[?:?]
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source) 
> ~[?:?]
>     at java.lang.Thread.run(Unknown Source) ~[?:?]
> Caused by: java.util.concurrent.CompletionException: 
> java.lang.UnsupportedOperationException: Still waiting for the leadership.
>     at java.util.concurrent.CompletableFuture.encodeThrowable(Unknown Source) 
> ~[?:?]
>     at java.util.concurrent.CompletableFuture.uniComposeStage(Unknown Source) 
> ~[?:?]
>     at java.util.concurrent.CompletableFuture.thenCompose(Unknown Source) 
> ~[?:?]
>     at 
> org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.stopJobMasterServiceProcessAsync(JobMasterServiceLeadershipRunner.java:398)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at 
> org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.runIfStateRunning(JobMasterServiceLeadershipRunner.java:456)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at 
> org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.revokeLeadership(JobMasterServiceLeadershipRunner.java:390)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at 
> org.apache.flink.runtime.leaderelection.DefaultLeaderElectionService.onRevokeLeadership(DefaultLeaderElectionService.java:236)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at 
> org.apache.flink.runtime.leaderelection.DefaultMultipleComponentLeaderElectionService.lambda$forEachLeaderElectionEventHandler$2(DefaultMultipleComponentLeaderElectionService.java:225)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source) 
> ~[?:?]
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source) 
> ~[?:?]
>     at java.lang.Thread.run(Unknown Source) ~[?:?]
> Caused by: java.lang.UnsupportedOperationException: Still waiting for the 
> leadership.
>     at 
> org.apache.flink.runtime.jobmaster.JobMasterServiceProcess$WaitingForLeadership.getLeaderSessionId(JobMasterServiceProcess.java:71)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at 
> org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.stopJobMasterServiceProcess(JobMasterServiceLeadershipRunner.java:414)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at 
> org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.callIfRunning(JobMasterServiceLeadershipRunner.java:469)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at 
> org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.lambda$stopJobMasterServiceProcessAsync$12(JobMasterServiceLeadershipRunner.java:400)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at java.util.concurrent.CompletableFuture.uniComposeStage(Unknown Source) 
> ~[?:?]
>     at java.util.concurrent.CompletableFuture.thenCompose(Unknown Source) 
> ~[?:?]
>     at 
> org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.stopJobMasterServiceProcessAsync(JobMasterServiceLeadershipRunner.java:398)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at 
> org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.runIfStateRunning(JobMasterServiceLeadershipRunner.java:456)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at 
> org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.revokeLeadership(JobMasterServiceLeadershipRunner.java:390)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at 
> org.apache.flink.runtime.leaderelection.DefaultLeaderElectionService.onRevokeLeadership(DefaultLeaderElectionService.java:236)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at 
> org.apache.flink.runtime.leaderelection.DefaultMultipleComponentLeaderElectionService.lambda$forEachLeaderElectionEventHandler$2(DefaultMultipleComponentLeaderElectionService.java:225)
>  ~[flink-dist-1.17.2.jar:1.17.2]
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source) 
> ~[?:?]
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source) 
> ~[?:?]
>     at java.lang.Thread.run(Unknown Source) ~[?:?]    
> INFO  org.apache.flink.runtime.entrypoint.ClusterEntrypoint        [] - 
> Shutting KubernetesSessionClusterEntrypoint down with application status 
> UNKNOWN. Diagnostics Cluster entrypoint has been closed externally..
> INFO  org.apache.flink.runtime.blob.BlobServer                     [] - Stopped 
> BLOB server at 0.0.0.0:6124
> {code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to