[ https://issues.apache.org/jira/browse/FLINK-26239?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Yun Gao updated FLINK-26239: ---------------------------- Priority: Critical (was: Major) > EventTimeWindowCheckpointingITCase.testSlidingTimeWindow failed on azure > ------------------------------------------------------------------------ > > Key: FLINK-26239 > URL: https://issues.apache.org/jira/browse/FLINK-26239 > Project: Flink > Issue Type: Bug > Components: Runtime / Checkpointing > Affects Versions: 1.15.0 > Reporter: Yun Gao > Priority: Critical > Labels: test-stability > > {code:java} > 2022-02-17T11:46:39.1850375Z Feb 17 11:46:39 Starting > org.apache.flink.test.checkpointing.EventTimeWindowCheckpointingITCase#testSlidingTimeWindow[statebackend > type =ROCKSDB_INCREMENTAL, buffersPerChannel = 2]. > 2022-02-17T11:46:39.1854584Z > org.apache.flink.runtime.client.JobExecutionException: Job execution failed. > 2022-02-17T11:46:39.1855470Z at > org.apache.flink.runtime.jobmaster.JobResult.toJobExecutionResult(JobResult.java:144) > 2022-02-17T11:46:39.1856444Z at > org.apache.flink.runtime.minicluster.MiniClusterJobClient.lambda$getJobExecutionResult$3(MiniClusterJobClient.java:141) > 2022-02-17T11:46:39.1857393Z at > java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:616) > 2022-02-17T11:46:39.1858400Z at > java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591) > 2022-02-17T11:46:39.1865249Z at > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) > 2022-02-17T11:46:39.1866299Z at > java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975) > 2022-02-17T11:46:39.1867590Z at > org.apache.flink.runtime.rpc.akka.AkkaInvocationHandler.lambda$invokeRpc$1(AkkaInvocationHandler.java:259) > 2022-02-17T11:46:39.1868546Z at > java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774) > 2022-02-17T11:46:39.1869254Z at > java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750) > 2022-02-17T11:46:39.1869828Z at > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) > 2022-02-17T11:46:39.1870367Z at > java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975) > 2022-02-17T11:46:39.1871131Z at > org.apache.flink.util.concurrent.FutureUtils.doForward(FutureUtils.java:1389) > 2022-02-17T11:46:39.1872123Z at > org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$null$1(ClassLoadingUtils.java:93) > 2022-02-17T11:46:39.1875765Z at > org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68) > 2022-02-17T11:46:39.1877055Z at > org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$guardCompletionWithContextClassLoader$2(ClassLoadingUtils.java:92) > 2022-02-17T11:46:39.1878032Z at > java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774) > 2022-02-17T11:46:39.1879084Z at > java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750) > 2022-02-17T11:46:39.1879697Z at > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) > 2022-02-17T11:46:39.1880252Z at > java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975) > 2022-02-17T11:46:39.1880840Z at > org.apache.flink.runtime.concurrent.akka.AkkaFutureUtils$1.onComplete(AkkaFutureUtils.java:47) > 2022-02-17T11:46:39.1881357Z at > akka.dispatch.OnComplete.internal(Future.scala:300) > 2022-02-17T11:46:39.1881788Z at > akka.dispatch.OnComplete.internal(Future.scala:297) > ... > 2022-02-17T11:46:39.1915003Z Caused by: > org.apache.flink.util.FlinkRuntimeException: Exceeded checkpoint tolerable > failure threshold. > 2022-02-17T11:46:39.1915653Z at > org.apache.flink.runtime.checkpoint.CheckpointFailureManager.checkFailureAgainstCounter(CheckpointFailureManager.java:160) > 2022-02-17T11:46:39.1916393Z at > org.apache.flink.runtime.checkpoint.CheckpointFailureManager.handleJobLevelCheckpointException(CheckpointFailureManager.java:123) > 2022-02-17T11:46:39.1917125Z at > org.apache.flink.runtime.checkpoint.CheckpointFailureManager.handleCheckpointException(CheckpointFailureManager.java:90) > 2022-02-17T11:46:39.1917819Z at > org.apache.flink.runtime.checkpoint.CheckpointCoordinator.abortPendingCheckpoint(CheckpointCoordinator.java:2046) > 2022-02-17T11:46:39.1918594Z at > org.apache.flink.runtime.checkpoint.CheckpointCoordinator.abortPendingCheckpoint(CheckpointCoordinator.java:2025) > 2022-02-17T11:46:39.1919268Z at > org.apache.flink.runtime.checkpoint.CheckpointCoordinator.access$600(CheckpointCoordinator.java:98) > 2022-02-17T11:46:39.1920052Z at > org.apache.flink.runtime.checkpoint.CheckpointCoordinator$CheckpointCanceller.run(CheckpointCoordinator.java:2104) > 2022-02-17T11:46:39.1920852Z at > java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) > 2022-02-17T11:46:39.1921390Z at > java.util.concurrent.FutureTask.run(FutureTask.java:266) > 2022-02-17T11:46:39.1922079Z at > java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180) > 2022-02-17T11:46:39.1922785Z at > java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293) > 2022-02-17T11:46:39.1923541Z at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > 2022-02-17T11:46:39.1924108Z at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > 2022-02-17T11:46:39.1924585Z at java.lang.Thread.run(Thread.java:748) {code} > From the log, the checkpoint seems to fail due to > > {code:java} > java.lang.IllegalStateException: Attempt to reference unknown state: > f1a3e68c-3bac-4bd9-b68f-7968a1411a06-KeyGroupRange{startKeyGroup=0, > endKeyGroup=1}-000058.sst > at org.apache.flink.util.Preconditions.checkState(Preconditions.java:193) > ~[flink-core-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.state.SharedStateRegistryImpl.registerReference(SharedStateRegistryImpl.java:82) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.state.IncrementalRemoteKeyedStateHandle.registerSharedStates(IncrementalRemoteKeyedStateHandle.java:317) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.state.SharedStateRegistryImpl.registerAll(SharedStateRegistryImpl.java:172) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.state.changelog.ChangelogStateBackendHandle$ChangelogStateBackendHandleImpl.registerSharedStates(ChangelogStateBackendHandle.java:124) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.checkpoint.OperatorSubtaskState.registerSharedState(OperatorSubtaskState.java:229) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.checkpoint.OperatorSubtaskState.registerSharedStates(OperatorSubtaskState.java:219) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.checkpoint.TaskStateSnapshot.registerSharedStates(TaskStateSnapshot.java:189) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.checkpoint.CheckpointCoordinator.receiveAcknowledgeMessage(CheckpointCoordinator.java:1114) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.scheduler.ExecutionGraphHandler.lambda$acknowledgeCheckpoint$1(ExecutionGraphHandler.java:89) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.scheduler.ExecutionGraphHandler.lambda$processCheckpointCoordinatorMessage$3(ExecutionGraphHandler.java:119) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > [?:1.8.0_292] > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > [?:1.8.0_292] > at java.lang.Thread.run(Thread.java:748) [?:1.8.0_292] > 11:36:48,770 [jobmanager-io-thread-14] WARN > org.apache.flink.runtime.jobmaster.JobMaster [] - Error while > processing AcknowledgeCheckpoint message > java.lang.IllegalStateException: Attempt to reference unknown state: > e0c386e6-8fdd-4277-8f5c-d4eb942c97e0-KeyGroupRange{startKeyGroup=6, > endKeyGroup=7}-000057.sst > at org.apache.flink.util.Preconditions.checkState(Preconditions.java:193) > ~[flink-core-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.state.SharedStateRegistryImpl.registerReference(SharedStateRegistryImpl.java:82) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.state.IncrementalRemoteKeyedStateHandle.registerSharedStates(IncrementalRemoteKeyedStateHandle.java:317) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.state.SharedStateRegistryImpl.registerAll(SharedStateRegistryImpl.java:172) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.state.changelog.ChangelogStateBackendHandle$ChangelogStateBackendHandleImpl.registerSharedStates(ChangelogStateBackendHandle.java:124) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.checkpoint.OperatorSubtaskState.registerSharedState(OperatorSubtaskState.java:229) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.checkpoint.OperatorSubtaskState.registerSharedStates(OperatorSubtaskState.java:219) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.checkpoint.TaskStateSnapshot.registerSharedStates(TaskStateSnapshot.java:189) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.checkpoint.CheckpointCoordinator.receiveAcknowledgeMessage(CheckpointCoordinator.java:1114) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.scheduler.ExecutionGraphHandler.lambda$acknowledgeCheckpoint$1(ExecutionGraphHandler.java:89) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.scheduler.ExecutionGraphHandler.lambda$processCheckpointCoordinatorMessage$3(ExecutionGraphHandler.java:119) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > [?:1.8.0_292] > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > [?:1.8.0_292] > at java.lang.Thread.run(Thread.java:748) [?:1.8.0_292] {code} > https://dev.azure.com/apache-flink/apache-flink/_build/results?buildId=31736&view=logs&j=5c8e7682-d68f-54d1-16a2-a09310218a49&t=86f654fa-ab48-5c1a-25f4-7e7f6afb9bba&l=6325 -- This message was sent by Atlassian Jira (v8.20.1#820001)