[ https://issues.apache.org/jira/browse/FLINK-22173?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17338400#comment-17338400 ]
Roman Khachatryan commented on FLINK-22173: ------------------------------------------- A recent failure (with commits in master up to 89c6c03660a88a648bbd13b4e6696124fe46d013): [https://dev.azure.com/apache-flink/apache-flink/_build/results?buildId=17468&view=logs&j=34f41360-6c0d-54d3-11a1-0292a2def1d9&t=2d56e022-1ace-542f-bf1a-b37dd63243f2&l=9772] {code} Apr 30 14:29:59 Caused by: org.apache.flink.shaded.netty4.io.netty.util.IllegalReferenceCountException: refCnt: 0, increment: 1 Apr 30 14:29:59 at org.apache.flink.shaded.netty4.io.netty.util.internal.ReferenceCountUpdater.retain0(ReferenceCountUpdater.java:123) Apr 30 14:29:59 at org.apache.flink.shaded.netty4.io.netty.util.internal.ReferenceCountUpdater.retain(ReferenceCountUpdater.java:110) Apr 30 14:29:59 at org.apache.flink.shaded.netty4.io.netty.buffer.AbstractReferenceCountedByteBuf.retain(AbstractReferenceCountedByteBuf.java:80) Apr 30 14:29:59 at org.apache.flink.runtime.io.network.buffer.NetworkBuffer.retainBuffer(NetworkBuffer.java:166) Apr 30 14:29:59 at org.apache.flink.runtime.io.network.buffer.NetworkBuffer.retainBuffer(NetworkBuffer.java:47) Apr 30 14:29:59 at org.apache.flink.runtime.io.network.buffer.BufferConsumer.copy(BufferConsumer.java:143) Apr 30 14:29:59 at org.apache.flink.runtime.io.network.buffer.BufferConsumer.toDebugString(BufferConsumer.java:202) Apr 30 14:29:59 at org.apache.flink.runtime.io.network.logger.NetworkActionsLogger.traceRecover(NetworkActionsLogger.java:94) Apr 30 14:29:59 at org.apache.flink.runtime.io.network.partition.PipelinedSubpartition.addRecovered(PipelinedSubpartition.java:142) Apr 30 14:29:59 at org.apache.flink.runtime.checkpoint.channel.ResultSubpartitionRecoveredStateHandler.recover(RecoveredChannelStateHandler.java:195) Apr 30 14:29:59 at org.apache.flink.runtime.checkpoint.channel.ResultSubpartitionRecoveredStateHandler.recover(RecoveredChannelStateHandler.java:144) Apr 30 14:29:59 at 
org.apache.flink.runtime.checkpoint.channel.ChannelStateChunkReader.readChunk(SequentialChannelStateReaderImpl.java:207) Apr 30 14:29:59 at org.apache.flink.runtime.checkpoint.channel.SequentialChannelStateReaderImpl.readSequentially(SequentialChannelStateReaderImpl.java:107) Apr 30 14:29:59 at org.apache.flink.runtime.checkpoint.channel.SequentialChannelStateReaderImpl.read(SequentialChannelStateReaderImpl.java:93) Apr 30 14:29:59 at org.apache.flink.runtime.checkpoint.channel.SequentialChannelStateReaderImpl.readOutputData(SequentialChannelStateReaderImpl.java:79) Apr 30 14:29:59 at org.apache.flink.streaming.runtime.tasks.StreamTask.restoreGates(StreamTask.java:571) Apr 30 14:29:59 at org.apache.flink.streaming.runtime.tasks.StreamTaskActionExecutor$1.call(StreamTaskActionExecutor.java:55) Apr 30 14:29:59 at org.apache.flink.streaming.runtime.tasks.StreamTask.restore(StreamTask.java:554) Apr 30 14:29:59 at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:757) Apr 30 14:29:59 at org.apache.flink.runtime.taskmanager.Task.run(Task.java:564) Apr 30 14:29:59 at java.lang.Thread.run(Thread.java:748) {code} > UnalignedCheckpointRescaleITCase fails on azure > ----------------------------------------------- > > Key: FLINK-22173 > URL: https://issues.apache.org/jira/browse/FLINK-22173 > Project: Flink > Issue Type: Bug > Components: Runtime / Checkpointing > Affects Versions: 1.13.0 > Reporter: Dawid Wysakowicz > Assignee: Arvid Heise > Priority: Critical > Labels: test-stability > Fix For: 1.13.0 > > > https://dev.azure.com/apache-flink/apache-flink/_build/results?buildId=16232&view=logs&j=d8d26c26-7ec2-5ed2-772e-7a1a1eb8317c&t=be5fb08e-1ad7-563c-4f1a-a97ad4ce4865&l=9628 > {code} > 2021-04-08T23:25:56.3131361Z [ERROR] Tests run: 31, Failures: 0, Errors: 1, > Skipped: 0, Time elapsed: 839.623 s <<< FAILURE! 
- in > org.apache.flink.test.checkpointing.UnalignedCheckpointRescaleITCase > 2021-04-08T23:25:56.3132784Z [ERROR] shouldRescaleUnalignedCheckpoint[no > scale union from 7 to > 7](org.apache.flink.test.checkpointing.UnalignedCheckpointRescaleITCase) > Time elapsed: 607.467 s <<< ERROR! > 2021-04-08T23:25:56.3133586Z > org.apache.flink.runtime.client.JobExecutionException: Job execution failed. > 2021-04-08T23:25:56.3134070Z at > org.apache.flink.runtime.jobmaster.JobResult.toJobExecutionResult(JobResult.java:144) > 2021-04-08T23:25:56.3134643Z at > org.apache.flink.test.checkpointing.UnalignedCheckpointTestBase.execute(UnalignedCheckpointTestBase.java:168) > 2021-04-08T23:25:56.3135577Z at > org.apache.flink.test.checkpointing.UnalignedCheckpointRescaleITCase.shouldRescaleUnalignedCheckpoint(UnalignedCheckpointRescaleITCase.java:368) > 2021-04-08T23:25:56.3138843Z at > sun.reflect.GeneratedMethodAccessor93.invoke(Unknown Source) > 2021-04-08T23:25:56.3139402Z at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > 2021-04-08T23:25:56.3139880Z at > java.lang.reflect.Method.invoke(Method.java:498) > 2021-04-08T23:25:56.3140328Z at > org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:50) > 2021-04-08T23:25:56.3140844Z at > org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12) > 2021-04-08T23:25:56.3141768Z at > org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:47) > 2021-04-08T23:25:56.3142272Z at > org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17) > 2021-04-08T23:25:56.3142706Z at > org.junit.rules.Verifier$1.evaluate(Verifier.java:35) > 2021-04-08T23:25:56.3143142Z at > org.junit.rules.ExternalResource$1.evaluate(ExternalResource.java:48) > 2021-04-08T23:25:56.3143608Z at > org.apache.flink.util.TestNameProvider$1.evaluate(TestNameProvider.java:45) > 2021-04-08T23:25:56.3144039Z at > 
org.junit.rules.TestWatcher$1.evaluate(TestWatcher.java:55) > 2021-04-08T23:25:56.3144434Z at > org.junit.rules.RunRules.evaluate(RunRules.java:20) > 2021-04-08T23:25:56.3145027Z at > org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:325) > 2021-04-08T23:25:56.3145484Z at > org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:78) > 2021-04-08T23:25:56.3145981Z at > org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:57) > 2021-04-08T23:25:56.3146421Z at > org.junit.runners.ParentRunner$3.run(ParentRunner.java:290) > 2021-04-08T23:25:56.3146843Z at > org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:71) > 2021-04-08T23:25:56.3147274Z at > org.junit.runners.ParentRunner.runChildren(ParentRunner.java:288) > 2021-04-08T23:25:56.3147692Z at > org.junit.runners.ParentRunner.access$000(ParentRunner.java:58) > 2021-04-08T23:25:56.3148116Z at > org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:268) > 2021-04-08T23:25:56.3148543Z at > org.junit.runners.ParentRunner.run(ParentRunner.java:363) > 2021-04-08T23:25:56.3148930Z at > org.junit.runners.Suite.runChild(Suite.java:128) > 2021-04-08T23:25:56.3149298Z at > org.junit.runners.Suite.runChild(Suite.java:27) > 2021-04-08T23:25:56.3149663Z at > org.junit.runners.ParentRunner$3.run(ParentRunner.java:290) > 2021-04-08T23:25:56.3150075Z at > org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:71) > 2021-04-08T23:25:56.3150488Z at > org.junit.runners.ParentRunner.runChildren(ParentRunner.java:288) > 2021-04-08T23:25:56.3151148Z at > org.junit.runners.ParentRunner.access$000(ParentRunner.java:58) > 2021-04-08T23:25:56.3151691Z at > org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:268) > 2021-04-08T23:25:56.3152115Z at > org.junit.rules.ExternalResource$1.evaluate(ExternalResource.java:48) > 2021-04-08T23:25:56.3152534Z at > org.junit.rules.RunRules.evaluate(RunRules.java:20) > 2021-04-08T23:25:56.3152919Z at > 
org.junit.runners.ParentRunner.run(ParentRunner.java:363) > 2021-04-08T23:25:56.3153349Z at > org.apache.maven.surefire.junit4.JUnit4Provider.execute(JUnit4Provider.java:365) > 2021-04-08T23:25:56.3154029Z at > org.apache.maven.surefire.junit4.JUnit4Provider.executeWithRerun(JUnit4Provider.java:273) > 2021-04-08T23:25:56.3154670Z at > org.apache.maven.surefire.junit4.JUnit4Provider.executeTestSet(JUnit4Provider.java:238) > 2021-04-08T23:25:56.3155183Z at > org.apache.maven.surefire.junit4.JUnit4Provider.invoke(JUnit4Provider.java:159) > 2021-04-08T23:25:56.3155715Z at > org.apache.maven.surefire.booter.ForkedBooter.invokeProviderInSameClassLoader(ForkedBooter.java:384) > 2021-04-08T23:25:56.3156250Z at > org.apache.maven.surefire.booter.ForkedBooter.runSuitesInProcess(ForkedBooter.java:345) > 2021-04-08T23:25:56.3156749Z at > org.apache.maven.surefire.booter.ForkedBooter.execute(ForkedBooter.java:126) > 2021-04-08T23:25:56.3157343Z at > org.apache.maven.surefire.booter.ForkedBooter.main(ForkedBooter.java:418) > 2021-04-08T23:25:56.3157928Z Caused by: > org.apache.flink.runtime.JobException: Recovery is suppressed by > FixedDelayRestartBackoffTimeStrategy(maxNumberRestartAttempts=1, > backoffTimeMS=100) > 2021-04-08T23:25:56.3158627Z at > org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.handleFailure(ExecutionFailureHandler.java:138) > 2021-04-08T23:25:56.3159356Z at > org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.getGlobalFailureHandlingResult(ExecutionFailureHandler.java:101) > 2021-04-08T23:25:56.3160053Z at > org.apache.flink.runtime.scheduler.DefaultScheduler.handleGlobalFailure(DefaultScheduler.java:227) > 2021-04-08T23:25:56.3160721Z at > org.apache.flink.runtime.scheduler.UpdateSchedulerNgOnInternalFailuresListener.notifyGlobalFailure(UpdateSchedulerNgOnInternalFailuresListener.java:57) > 2021-04-08T23:25:56.3161721Z at > 
org.apache.flink.runtime.executiongraph.DefaultExecutionGraph.failGlobal(DefaultExecutionGraph.java:973) > 2021-04-08T23:25:56.3162331Z at > org.apache.flink.runtime.executiongraph.DefaultExecutionGraph$1.lambda$failJob$0(DefaultExecutionGraph.java:412) > 2021-04-08T23:25:56.3162910Z at > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:440) > 2021-04-08T23:25:56.3163435Z at > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:208) > 2021-04-08T23:25:56.3164057Z at > org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:77) > 2021-04-08T23:25:56.3164599Z at > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:158) > 2021-04-08T23:25:56.3165052Z at > akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26) > 2021-04-08T23:25:56.3165471Z at > akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21) > 2021-04-08T23:25:56.3165895Z at > scala.PartialFunction$class.applyOrElse(PartialFunction.scala:123) > 2021-04-08T23:25:56.3166322Z at > akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21) > 2021-04-08T23:25:56.3166767Z at > scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:170) > 2021-04-08T23:25:56.3167194Z at > scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) > 2021-04-08T23:25:56.3167636Z at > scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) > 2021-04-08T23:25:56.3168056Z at > akka.actor.Actor$class.aroundReceive(Actor.scala:517) > 2021-04-08T23:25:56.3168447Z at > akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225) > 2021-04-08T23:25:56.3168867Z at > akka.actor.ActorCell.receiveMessage(ActorCell.scala:592) > 2021-04-08T23:25:56.3169254Z at > akka.actor.ActorCell.invoke(ActorCell.scala:561) > 2021-04-08T23:25:56.3169616Z at > akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258) > 2021-04-08T23:25:56.3169991Z at 
akka.dispatch.Mailbox.run(Mailbox.scala:225) > 2021-04-08T23:25:56.3170328Z at akka.dispatch.Mailbox.exec(Mailbox.scala:235) > 2021-04-08T23:25:56.3170716Z at > akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) > 2021-04-08T23:25:56.3171420Z at > akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) > 2021-04-08T23:25:56.3171886Z at > akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) > 2021-04-08T23:25:56.3172349Z at > akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) > 2021-04-08T23:25:56.3172851Z Caused by: > org.apache.flink.util.FlinkRuntimeException: Exceeded checkpoint tolerable > failure threshold. > 2021-04-08T23:25:56.3173453Z at > org.apache.flink.runtime.checkpoint.CheckpointFailureManager.handleCheckpointException(CheckpointFailureManager.java:98) > 2021-04-08T23:25:56.3174129Z at > org.apache.flink.runtime.checkpoint.CheckpointFailureManager.handleJobLevelCheckpointException(CheckpointFailureManager.java:67) > 2021-04-08T23:25:56.3174771Z at > org.apache.flink.runtime.checkpoint.CheckpointCoordinator.abortPendingCheckpoint(CheckpointCoordinator.java:1935) > 2021-04-08T23:25:56.3175481Z at > org.apache.flink.runtime.checkpoint.CheckpointCoordinator.abortPendingCheckpoint(CheckpointCoordinator.java:1907) > 2021-04-08T23:25:56.3176078Z at > org.apache.flink.runtime.checkpoint.CheckpointCoordinator.access$600(CheckpointCoordinator.java:95) > 2021-04-08T23:25:56.3176658Z at > org.apache.flink.runtime.checkpoint.CheckpointCoordinator$CheckpointCanceller.run(CheckpointCoordinator.java:1991) > 2021-04-08T23:25:56.3177209Z at > java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) > 2021-04-08T23:25:56.3177627Z at > java.util.concurrent.FutureTask.run(FutureTask.java:266) > 2021-04-08T23:25:56.3178141Z at > java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180) > 2021-04-08T23:25:56.3178766Z at 
> java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293) > 2021-04-08T23:25:56.3179308Z at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > 2021-04-08T23:25:56.3179799Z at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > 2021-04-08T23:25:56.3180204Z at java.lang.Thread.run(Thread.java:748) > {code} -- This message was sent by Atlassian Jira (v8.3.4#803005)