[ https://issues.apache.org/jira/browse/FLINK-32876?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Junrui Li updated FLINK-32876: ------------------------------ Description: When we enable speculative execution and configure the job with the following configuration: {code:java} execution.batch.speculative.enabled: true slow-task-detector.execution-time.baseline-ratio: 0.0 slow-task-detector.execution-time.baseline-lower-bound: 0s{code} The ExecutionTimeBasedSlowTaskDetector will identify ExecutionJobVertex instances that have not yet been scheduled as slow tasks and report them to the SpeculativeScheduler. However, the SpeculativeScheduler requires that the corresponding ExecutionVertex has entered the scheduled state before it schedules backup tasks. If this requirement is not met, speculative execution fails. The exception stack trace is as follows: {code:java} java.lang.IllegalStateException: Execution vertex b3f44e8b1dc132ff2a47f7955c75ef7d_0 does not have a recorded version at org.apache.flink.util.Preconditions.checkState(Preconditions.java:215) ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT] at org.apache.flink.runtime.scheduler.ExecutionVertexVersioner.getCurrentVersion(ExecutionVertexVersioner.java:71) ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT] at org.apache.flink.runtime.scheduler.ExecutionVertexVersioner.lambda$getExecutionVertexVersions$1(ExecutionVertexVersioner.java:89) ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT] at java.util.stream.ReferencePipeline$3$1.accept(ReferencePipeline.java:193) ~[?:1.8.0_333] at java.util.HashMap$KeySpliterator.forEachRemaining(HashMap.java:1580) ~[?:1.8.0_333] at java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:482) ~[?:1.8.0_333] at java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:472) ~[?:1.8.0_333] at java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:708) ~[?:1.8.0_333] at java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234) ~[?:1.8.0_333] at java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:499) ~[?:1.8.0_333] at 
org.apache.flink.runtime.scheduler.ExecutionVertexVersioner.getExecutionVertexVersions(ExecutionVertexVersioner.java:90) ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT] at org.apache.flink.runtime.scheduler.adaptivebatch.SpeculativeScheduler.notifySlowTasks(SpeculativeScheduler.java:377) ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT] at org.apache.flink.runtime.scheduler.slowtaskdetector.ExecutionTimeBasedSlowTaskDetector.lambda$scheduleTask$1(ExecutionTimeBasedSlowTaskDetector.java:129) ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT] at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) ~[?:1.8.0_333] at java.util.concurrent.FutureTask.run(FutureTask.java:266) ~[?:1.8.0_333] at org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.lambda$handleRunAsync$4(PekkoRpcActor.java:451) ~[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at org.apache.flink.runtime.concurrent.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68) ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT] at org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleRunAsync(PekkoRpcActor.java:451) ~[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleRpcMessage(PekkoRpcActor.java:218) ~[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at org.apache.flink.runtime.rpc.pekko.FencedPekkoRpcActor.handleRpcMessage(FencedPekkoRpcActor.java:85) ~[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleMessage(PekkoRpcActor.java:168) ~[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at org.apache.pekko.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:33) [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at org.apache.pekko.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:29) [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at 
scala.PartialFunction.applyOrElse(PartialFunction.scala:127) [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at scala.PartialFunction.applyOrElse$(PartialFunction.scala:126) [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at org.apache.pekko.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:29) [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:175) [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at org.apache.pekko.actor.Actor.aroundReceive(Actor.scala:547) [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at org.apache.pekko.actor.Actor.aroundReceive$(Actor.scala:545) [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at org.apache.pekko.actor.AbstractActor.aroundReceive(AbstractActor.scala:229) [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at org.apache.pekko.actor.ActorCell.receiveMessage(ActorCell.scala:590) [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at org.apache.pekko.actor.ActorCell.invoke(ActorCell.scala:557) [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at org.apache.pekko.dispatch.Mailbox.processMailbox(Mailbox.scala:280) [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at org.apache.pekko.dispatch.Mailbox.run(Mailbox.scala:241) [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at org.apache.pekko.dispatch.Mailbox.exec(Mailbox.scala:253) [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at 
java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289) [?:1.8.0_333] at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1067) [?:1.8.0_333] at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1703) [?:1.8.0_333] at java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:172) [?:1.8.0_333] {code} was:When the ExecutionTimeBasedSlowTaskDetector determines slow tasks, it only requires that the ExecutionJobVertex is initialized and has not finished. However, when the SpeculativeScheduler schedules slow tasks, it requires that the corresponding ExecutionVertex has an ExecutionVertexVersion, which means that the ExecutionVertex has entered the scheduled state. This inconsistency can result in speculative execution failure. > ExecutionTimeBasedSlowTaskDetector treats unscheduled tasks as slow tasks and > causes speculative execution to fail. > ------------------------------------------------------------------------------------------------------------------- > > Key: FLINK-32876 > URL: https://issues.apache.org/jira/browse/FLINK-32876 > Project: Flink > Issue Type: Bug > Components: Runtime / Coordination > Affects Versions: 1.18.0 > Reporter: Junrui Li > Priority: Major > Fix For: 1.18.0 > > > When we enable speculative execution and configure job with the following > configuration: > > {code:java} > execution.batch.speculative.enabled: true > slow-task-detector.execution-time.baseline-ratio: 0.0 > slow-task-detector.execution-time.baseline-lower-bound: 0s{code} > The ExecutionTimeBasedSlowTaskDetector will identify ExecutionJobVertex that > has not yet been scheduled as slow tasks and notify them to the > SpeculativeScheduler. However, the SpeculativeScheduler requires that the > corresponding ExecutionVertex has entered the scheduled state before > scheduling backup tasks. If this requirement is not met, it will result in > speculative execution failure. 
> The exception stack trace is as follows: > {code:java} > java.lang.IllegalStateException: Execution vertex > b3f44e8b1dc132ff2a47f7955c75ef7d_0 does not have a recorded version at > org.apache.flink.util.Preconditions.checkState(Preconditions.java:215) > ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT] at > org.apache.flink.runtime.scheduler.ExecutionVertexVersioner.getCurrentVersion(ExecutionVertexVersioner.java:71) > ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT] at > org.apache.flink.runtime.scheduler.ExecutionVertexVersioner.lambda$getExecutionVertexVersions$1(ExecutionVertexVersioner.java:89) > ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT] at > java.util.stream.ReferencePipeline$3$1.accept(ReferencePipeline.java:193) > ~[?:1.8.0_333] at > java.util.HashMap$KeySpliterator.forEachRemaining(HashMap.java:1580) > ~[?:1.8.0_333] at > java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:482) > ~[?:1.8.0_333] at > java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:472) > ~[?:1.8.0_333] at > java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:708) > ~[?:1.8.0_333] at > java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234) > ~[?:1.8.0_333] at > java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:499) > ~[?:1.8.0_333] at > org.apache.flink.runtime.scheduler.ExecutionVertexVersioner.getExecutionVertexVersions(ExecutionVertexVersioner.java:90) > ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT] at > org.apache.flink.runtime.scheduler.adaptivebatch.SpeculativeScheduler.notifySlowTasks(SpeculativeScheduler.java:377) > ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT] at > org.apache.flink.runtime.scheduler.slowtaskdetector.ExecutionTimeBasedSlowTaskDetector.lambda$scheduleTask$1(ExecutionTimeBasedSlowTaskDetector.java:129) > ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT] at > java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) > ~[?:1.8.0_333] at 
java.util.concurrent.FutureTask.run(FutureTask.java:266) > ~[?:1.8.0_333] at > org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.lambda$handleRunAsync$4(PekkoRpcActor.java:451) > ~[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at > org.apache.flink.runtime.concurrent.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68) > ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT] at > org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleRunAsync(PekkoRpcActor.java:451) > ~[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at > org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleRpcMessage(PekkoRpcActor.java:218) > ~[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at > org.apache.flink.runtime.rpc.pekko.FencedPekkoRpcActor.handleRpcMessage(FencedPekkoRpcActor.java:85) > ~[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at > org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleMessage(PekkoRpcActor.java:168) > ~[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at > org.apache.pekko.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:33) > [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at > org.apache.pekko.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:29) > [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at > scala.PartialFunction.applyOrElse(PartialFunction.scala:127) > [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at > scala.PartialFunction.applyOrElse$(PartialFunction.scala:126) > [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at > org.apache.pekko.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:29) > [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at > scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:175) > [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at > 
scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) > [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at > scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) > [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at > org.apache.pekko.actor.Actor.aroundReceive(Actor.scala:547) > [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at > org.apache.pekko.actor.Actor.aroundReceive$(Actor.scala:545) > [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at > org.apache.pekko.actor.AbstractActor.aroundReceive(AbstractActor.scala:229) > [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at > org.apache.pekko.actor.ActorCell.receiveMessage(ActorCell.scala:590) > [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at > org.apache.pekko.actor.ActorCell.invoke(ActorCell.scala:557) > [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at > org.apache.pekko.dispatch.Mailbox.processMailbox(Mailbox.scala:280) > [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at > org.apache.pekko.dispatch.Mailbox.run(Mailbox.scala:241) > [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at > org.apache.pekko.dispatch.Mailbox.exec(Mailbox.scala:253) > [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT] at > java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289) [?:1.8.0_333] > at > java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1067) > [?:1.8.0_333] at > java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1703) > [?:1.8.0_333] at > java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:172) > [?:1.8.0_333] {code} > -- This message was sent by Atlassian Jira (v8.20.10#820010)