[ 
https://issues.apache.org/jira/browse/FLINK-32876?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Junrui Li updated FLINK-32876:
------------------------------
    Description: 
When we enable speculative execution and configure the job with the following 
configuration:

 
{code:java}
execution.batch.speculative.enabled: true
slow-task-detector.execution-time.baseline-ratio: 0.0
slow-task-detector.execution-time.baseline-lower-bound: 0s{code}
The ExecutionTimeBasedSlowTaskDetector will identify tasks of an 
ExecutionJobVertex that has not yet been scheduled as slow tasks and report 
them to the SpeculativeScheduler. However, the SpeculativeScheduler requires 
that the corresponding ExecutionVertex has entered the scheduled state before 
it schedules backup tasks. If this requirement is not met, speculative 
execution fails.

The exception stack trace is as follows:
{code:java}
java.lang.IllegalStateException: Execution vertex 
b3f44e8b1dc132ff2a47f7955c75ef7d_0 does not have a recorded version  at 
org.apache.flink.util.Preconditions.checkState(Preconditions.java:215) 
~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT]  at 
org.apache.flink.runtime.scheduler.ExecutionVertexVersioner.getCurrentVersion(ExecutionVertexVersioner.java:71)
 ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT]  at 
org.apache.flink.runtime.scheduler.ExecutionVertexVersioner.lambda$getExecutionVertexVersions$1(ExecutionVertexVersioner.java:89)
 ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT]  at 
java.util.stream.ReferencePipeline$3$1.accept(ReferencePipeline.java:193) 
~[?:1.8.0_333]  at 
java.util.HashMap$KeySpliterator.forEachRemaining(HashMap.java:1580) 
~[?:1.8.0_333]  at 
java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:482) 
~[?:1.8.0_333]  at 
java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:472) 
~[?:1.8.0_333]  at 
java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:708) 
~[?:1.8.0_333]  at 
java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234) 
~[?:1.8.0_333]  at 
java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:499) 
~[?:1.8.0_333]  at 
org.apache.flink.runtime.scheduler.ExecutionVertexVersioner.getExecutionVertexVersions(ExecutionVertexVersioner.java:90)
 ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT]  at 
org.apache.flink.runtime.scheduler.adaptivebatch.SpeculativeScheduler.notifySlowTasks(SpeculativeScheduler.java:377)
 ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT]  at 
org.apache.flink.runtime.scheduler.slowtaskdetector.ExecutionTimeBasedSlowTaskDetector.lambda$scheduleTask$1(ExecutionTimeBasedSlowTaskDetector.java:129)
 ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT]  at 
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) 
~[?:1.8.0_333]  at java.util.concurrent.FutureTask.run(FutureTask.java:266) 
~[?:1.8.0_333]  at 
org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.lambda$handleRunAsync$4(PekkoRpcActor.java:451)
 ~[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
org.apache.flink.runtime.concurrent.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68)
 ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT]  at 
org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleRunAsync(PekkoRpcActor.java:451)
 ~[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleRpcMessage(PekkoRpcActor.java:218)
 ~[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
org.apache.flink.runtime.rpc.pekko.FencedPekkoRpcActor.handleRpcMessage(FencedPekkoRpcActor.java:85)
 ~[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleMessage(PekkoRpcActor.java:168)
 ~[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
org.apache.pekko.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:33) 
[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
org.apache.pekko.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:29) 
[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
scala.PartialFunction.applyOrElse(PartialFunction.scala:127) 
[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
scala.PartialFunction.applyOrElse$(PartialFunction.scala:126) 
[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
org.apache.pekko.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:29) 
[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:175) 
[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) 
[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) 
[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
org.apache.pekko.actor.Actor.aroundReceive(Actor.scala:547) 
[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
org.apache.pekko.actor.Actor.aroundReceive$(Actor.scala:545) 
[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
org.apache.pekko.actor.AbstractActor.aroundReceive(AbstractActor.scala:229) 
[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
org.apache.pekko.actor.ActorCell.receiveMessage(ActorCell.scala:590) 
[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
org.apache.pekko.actor.ActorCell.invoke(ActorCell.scala:557) 
[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
org.apache.pekko.dispatch.Mailbox.processMailbox(Mailbox.scala:280) 
[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
org.apache.pekko.dispatch.Mailbox.run(Mailbox.scala:241) 
[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
org.apache.pekko.dispatch.Mailbox.exec(Mailbox.scala:253) 
[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289) [?:1.8.0_333]  
at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1067) 
[?:1.8.0_333]  at 
java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1703) 
[?:1.8.0_333]  at 
java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:172) 
[?:1.8.0_333] {code}
 

  was:When the ExecutionTimeBasedSlowTaskDetector determines slow tasks, it 
only requires that the ExecutionJobVertex is initialized and has not finished. 
However, when the SpeculativeScheduler schedules slow tasks, it requires that 
the corresponding ExecutionVertex has an ExecutionVertexVersion, which means 
that the ExecutionVertex has entered the scheduled state. This inconsistency 
can result in speculative execution failure.


> ExecutionTimeBasedSlowTaskDetector treats unscheduled tasks as slow tasks and 
> causes speculative execution to fail.
> -------------------------------------------------------------------------------------------------------------------
>
>                 Key: FLINK-32876
>                 URL: https://issues.apache.org/jira/browse/FLINK-32876
>             Project: Flink
>          Issue Type: Bug
>          Components: Runtime / Coordination
>    Affects Versions: 1.18.0
>            Reporter: Junrui Li
>            Priority: Major
>             Fix For: 1.18.0
>
>
> When we enable speculative execution and configure the job with the 
> following configuration:
>  
> {code:java}
> execution.batch.speculative.enabled: true
> slow-task-detector.execution-time.baseline-ratio: 0.0
> slow-task-detector.execution-time.baseline-lower-bound: 0s{code}
> The ExecutionTimeBasedSlowTaskDetector will identify tasks of an 
> ExecutionJobVertex that has not yet been scheduled as slow tasks and report 
> them to the SpeculativeScheduler. However, the SpeculativeScheduler requires 
> that the corresponding ExecutionVertex has entered the scheduled state 
> before it schedules backup tasks. If this requirement is not met, 
> speculative execution fails.
> The exception stack trace is as follows:
> {code:java}
> java.lang.IllegalStateException: Execution vertex 
> b3f44e8b1dc132ff2a47f7955c75ef7d_0 does not have a recorded version  at 
> org.apache.flink.util.Preconditions.checkState(Preconditions.java:215) 
> ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT]  at 
> org.apache.flink.runtime.scheduler.ExecutionVertexVersioner.getCurrentVersion(ExecutionVertexVersioner.java:71)
>  ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT]  at 
> org.apache.flink.runtime.scheduler.ExecutionVertexVersioner.lambda$getExecutionVertexVersions$1(ExecutionVertexVersioner.java:89)
>  ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT]  at 
> java.util.stream.ReferencePipeline$3$1.accept(ReferencePipeline.java:193) 
> ~[?:1.8.0_333]  at 
> java.util.HashMap$KeySpliterator.forEachRemaining(HashMap.java:1580) 
> ~[?:1.8.0_333]  at 
> java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:482) 
> ~[?:1.8.0_333]  at 
> java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:472) 
> ~[?:1.8.0_333]  at 
> java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:708) 
> ~[?:1.8.0_333]  at 
> java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234) 
> ~[?:1.8.0_333]  at 
> java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:499) 
> ~[?:1.8.0_333]  at 
> org.apache.flink.runtime.scheduler.ExecutionVertexVersioner.getExecutionVertexVersions(ExecutionVertexVersioner.java:90)
>  ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT]  at 
> org.apache.flink.runtime.scheduler.adaptivebatch.SpeculativeScheduler.notifySlowTasks(SpeculativeScheduler.java:377)
>  ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT]  at 
> org.apache.flink.runtime.scheduler.slowtaskdetector.ExecutionTimeBasedSlowTaskDetector.lambda$scheduleTask$1(ExecutionTimeBasedSlowTaskDetector.java:129)
>  ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT]  at 
> java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) 
> ~[?:1.8.0_333]  at java.util.concurrent.FutureTask.run(FutureTask.java:266) 
> ~[?:1.8.0_333]  at 
> org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.lambda$handleRunAsync$4(PekkoRpcActor.java:451)
>  ~[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
> org.apache.flink.runtime.concurrent.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68)
>  ~[flink-dist-1.18-SNAPSHOT.jar:1.18-SNAPSHOT]  at 
> org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleRunAsync(PekkoRpcActor.java:451)
>  ~[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
> org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleRpcMessage(PekkoRpcActor.java:218)
>  ~[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
> org.apache.flink.runtime.rpc.pekko.FencedPekkoRpcActor.handleRpcMessage(FencedPekkoRpcActor.java:85)
>  ~[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
> org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleMessage(PekkoRpcActor.java:168)
>  ~[flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
> org.apache.pekko.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:33) 
> [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
> org.apache.pekko.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:29) 
> [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
> scala.PartialFunction.applyOrElse(PartialFunction.scala:127) 
> [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
> scala.PartialFunction.applyOrElse$(PartialFunction.scala:126) 
> [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
> org.apache.pekko.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:29)
>  [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:175) 
> [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) 
> [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) 
> [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
> org.apache.pekko.actor.Actor.aroundReceive(Actor.scala:547) 
> [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
> org.apache.pekko.actor.Actor.aroundReceive$(Actor.scala:545) 
> [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
> org.apache.pekko.actor.AbstractActor.aroundReceive(AbstractActor.scala:229) 
> [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
> org.apache.pekko.actor.ActorCell.receiveMessage(ActorCell.scala:590) 
> [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
> org.apache.pekko.actor.ActorCell.invoke(ActorCell.scala:557) 
> [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
> org.apache.pekko.dispatch.Mailbox.processMailbox(Mailbox.scala:280) 
> [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
> org.apache.pekko.dispatch.Mailbox.run(Mailbox.scala:241) 
> [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
> org.apache.pekko.dispatch.Mailbox.exec(Mailbox.scala:253) 
> [flink-rpc-akka48a43f0a-d73c-494a-a57b-ded9f5d82a84.jar:1.18-SNAPSHOT]  at 
> java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289) [?:1.8.0_333] 
>  at 
> java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1067) 
> [?:1.8.0_333]  at 
> java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1703) 
> [?:1.8.0_333]  at 
> java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:172) 
> [?:1.8.0_333] {code}
>  



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to