[ https://issues.apache.org/jira/browse/FLINK-29234?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17627692#comment-17627692 ]
Fabian Paul commented on FLINK-29234: ------------------------------------- [~Weijie Guo] can you give an estimate of when this will be fixed? I want to start the release process of 1.15.3 and include this change. > Dead lock in DefaultLeaderElectionService > ----------------------------------------- > > Key: FLINK-29234 > URL: https://issues.apache.org/jira/browse/FLINK-29234 > Project: Flink > Issue Type: Bug > Components: Runtime / Coordination > Affects Versions: 1.13.5, 1.14.5, 1.15.2 > Reporter: Yu Wang > Assignee: Weijie Guo > Priority: Critical > Labels: pull-request-available > > The JobManager stops working because of a deadlock in DefaultLeaderElectionService. > The log stopped at > {code:java} > org.apache.flink.runtime.leaderelection.DefaultLeaderElectionService [] - > Stopping DefaultLeaderElectionService. {code} > This may be similar to this ticket > https://issues.apache.org/jira/browse/FLINK-20008 > Here is the jstack info > {code:java} > Found one Java-level deadlock: > ============================= > "flink-akka.actor.default-dispatcher-18": waiting to lock monitor > 0x00007f15c7eae3a8 (object 0x0000000678d395e8, a java.lang.Object), which is > held by "main-EventThread" "main-EventThread": waiting to lock monitor > 0x00007f15a3811258 (object 0x0000000678cf1be0, a java.lang.Object), which is > held by "flink-akka.actor.default-dispatcher-18" Java stack information for > the threads listed above: > =================================================== > "flink-akka.actor.default-dispatcher-18": > at > org.apache.flink.runtime.leaderelection.DefaultLeaderElectionService.stop(DefaultLeaderElectionService.java:104) > - waiting to lock <0x0000000678d395e8> (a java.lang.Object) > at > org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.lambda$closeAsync$0(JobMasterServiceLeadershipRunner.java:147) > at > org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner$$Lambda$735/1742012752.run(Unknown > Source) > at > 
org.apache.flink.runtime.concurrent.FutureUtils.lambda$runAfterwardsAsync$18(FutureUtils.java:687) > at > org.apache.flink.runtime.concurrent.FutureUtils$$Lambda$736/6716561.accept(Unknown > Source) > at > java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774) > at > java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750) > at > java.util.concurrent.CompletableFuture$Completion.run(CompletableFuture.java:456) > at > org.apache.flink.runtime.concurrent.DirectExecutorService.execute(DirectExecutorService.java:217) > at > java.util.concurrent.CompletableFuture$UniCompletion.claim(CompletableFuture.java:543) > at > java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:765) > at > java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750) > at > java.util.concurrent.CompletableFuture.uniWhenCompleteStage(CompletableFuture.java:795) > at > java.util.concurrent.CompletableFuture.whenCompleteAsync(CompletableFuture.java:2163) > at > org.apache.flink.runtime.concurrent.FutureUtils.runAfterwardsAsync(FutureUtils.java:684) > at > org.apache.flink.runtime.concurrent.FutureUtils.runAfterwards(FutureUtils.java:651) > at > org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.closeAsync(JobMasterServiceLeadershipRunner.java:143) > - locked <0x0000000678cf1be0> (a java.lang.Object) > at > org.apache.flink.runtime.dispatcher.Dispatcher.terminateJob(Dispatcher.java:807) > at > org.apache.flink.runtime.dispatcher.Dispatcher.terminateRunningJobs(Dispatcher.java:799) > at > org.apache.flink.runtime.dispatcher.Dispatcher.terminateRunningJobsAndGetTerminationFuture(Dispatcher.java:812) > at org.apache.flink.runtime.dispatcher.Dispatcher.onStop(Dispatcher.java:268) > at > org.apache.flink.runtime.rpc.RpcEndpoint.internalCallOnStop(RpcEndpoint.java:214) > at > org.apache.flink.runtime.rpc.akka.AkkaRpcActor$StartedState.terminate(AkkaRpcActor.java:563) > at > 
org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleControlMessage(AkkaRpcActor.java:186) > at > org.apache.flink.runtime.rpc.akka.AkkaRpcActor$$Lambda$444/1289054037.apply(Unknown > Source) > at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26) > at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21) > at scala.PartialFunction.applyOrElse(PartialFunction.scala:123) > at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122) > at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21) > at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) > at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) > at akka.actor.Actor.aroundReceive(Actor.scala:517) > at akka.actor.Actor.aroundReceive$(Actor.scala:515) > at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225) > at akka.actor.ActorCell.receiveMessage(ActorCell.scala:592) > at akka.actor.ActorCell.invoke(ActorCell.scala:561) > at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258) > at akka.dispatch.Mailbox.run(Mailbox.scala:225) > at akka.dispatch.Mailbox.exec(Mailbox.scala:235) > at akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) > at > akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) > at akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) > at > akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) > "main-EventThread": > at > org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.runIfStateRunning(JobMasterServiceLeadershipRunner.java:468) > - waiting to lock <0x0000000678cf1be0> (a java.lang.Object) > at > org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner.grantLeadership(JobMasterServiceLeadershipRunner.java:248) > at > org.apache.flink.runtime.leaderelection.DefaultLeaderElectionService.onGrantLeadership(DefaultLeaderElectionService.java:211) > - locked <0x0000000678d395e8> (a java.lang.Object) > at > 
org.apache.flink.runtime.leaderelection.ZooKeeperLeaderElectionDriver.isLeader(ZooKeeperLeaderElectionDriver.java:166) > at > org.apache.flink.shaded.curator4.org.apache.curator.framework.recipes.leader.LeaderLatch$9.apply(LeaderLatch.java:693) > at > org.apache.flink.shaded.curator4.org.apache.curator.framework.recipes.leader.LeaderLatch$9.apply(LeaderLatch.java:689) > at > org.apache.flink.shaded.curator4.org.apache.curator.framework.listen.ListenerContainer$1.run(ListenerContainer.java:100) > at > org.apache.flink.shaded.curator4.org.apache.curator.shaded.com.google.common.util.concurrent.DirectExecutor.execute(DirectExecutor.java:30) > at > org.apache.flink.shaded.curator4.org.apache.curator.framework.listen.ListenerContainer.forEach(ListenerContainer.java:92) > at > org.apache.flink.shaded.curator4.org.apache.curator.framework.recipes.leader.LeaderLatch.setLeadership(LeaderLatch.java:688) > - locked <0x0000000678d39788> (a > org.apache.flink.shaded.curator4.org.apache.curator.framework.recipes.leader.LeaderLatch) > at > org.apache.flink.shaded.curator4.org.apache.curator.framework.recipes.leader.LeaderLatch.checkLeadership(LeaderLatch.java:567) > at > org.apache.flink.shaded.curator4.org.apache.curator.framework.recipes.leader.LeaderLatch.access$700(LeaderLatch.java:65) > at > org.apache.flink.shaded.curator4.org.apache.curator.framework.recipes.leader.LeaderLatch$7.processResult(LeaderLatch.java:618) > at > org.apache.flink.shaded.curator4.org.apache.curator.framework.imps.CuratorFrameworkImpl.sendToBackgroundCallback(CuratorFrameworkImpl.java:883) > at > org.apache.flink.shaded.curator4.org.apache.curator.framework.imps.CuratorFrameworkImpl.processBackgroundOperation(CuratorFrameworkImpl.java:653) > at > org.apache.flink.shaded.curator4.org.apache.curator.framework.imps.WatcherRemovalFacade.processBackgroundOperation(WatcherRemovalFacade.java:152) > at > 
org.apache.flink.shaded.curator4.org.apache.curator.framework.imps.GetChildrenBuilderImpl$2.processResult(GetChildrenBuilderImpl.java:187) > at > org.apache.flink.shaded.zookeeper3.org.apache.zookeeper.ClientCnxn$EventThread.processEvent(ClientCnxn.java:601) > at > org.apache.flink.shaded.zookeeper3.org.apache.zookeeper.ClientCnxn$EventThread.run(ClientCnxn.java:508) > {code} -- This message was sent by Atlassian Jira (v8.20.10#820010)