Yun Gao created FLINK-26239: ------------------------------- Summary: EventTimeWindowCheckpointingITCase.testSlidingTimeWindow failed on azure Key: FLINK-26239 URL: https://issues.apache.org/jira/browse/FLINK-26239 Project: Flink Issue Type: Bug Components: Runtime / Checkpointing Affects Versions: 1.15.0 Reporter: Yun Gao
{code:java} 2022-02-17T11:46:39.1850375Z Feb 17 11:46:39 Starting org.apache.flink.test.checkpointing.EventTimeWindowCheckpointingITCase#testSlidingTimeWindow[statebackend type =ROCKSDB_INCREMENTAL, buffersPerChannel = 2]. 2022-02-17T11:46:39.1854584Z org.apache.flink.runtime.client.JobExecutionException: Job execution failed. 2022-02-17T11:46:39.1855470Z at org.apache.flink.runtime.jobmaster.JobResult.toJobExecutionResult(JobResult.java:144) 2022-02-17T11:46:39.1856444Z at org.apache.flink.runtime.minicluster.MiniClusterJobClient.lambda$getJobExecutionResult$3(MiniClusterJobClient.java:141) 2022-02-17T11:46:39.1857393Z at java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:616) 2022-02-17T11:46:39.1858400Z at java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591) 2022-02-17T11:46:39.1865249Z at java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) 2022-02-17T11:46:39.1866299Z at java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975) 2022-02-17T11:46:39.1867590Z at org.apache.flink.runtime.rpc.akka.AkkaInvocationHandler.lambda$invokeRpc$1(AkkaInvocationHandler.java:259) 2022-02-17T11:46:39.1868546Z at java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774) 2022-02-17T11:46:39.1869254Z at java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750) 2022-02-17T11:46:39.1869828Z at java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) 2022-02-17T11:46:39.1870367Z at java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975) 2022-02-17T11:46:39.1871131Z at org.apache.flink.util.concurrent.FutureUtils.doForward(FutureUtils.java:1389) 2022-02-17T11:46:39.1872123Z at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$null$1(ClassLoadingUtils.java:93) 2022-02-17T11:46:39.1875765Z at 
org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68) 2022-02-17T11:46:39.1877055Z at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$guardCompletionWithContextClassLoader$2(ClassLoadingUtils.java:92) 2022-02-17T11:46:39.1878032Z at java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774) 2022-02-17T11:46:39.1879084Z at java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750) 2022-02-17T11:46:39.1879697Z at java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) 2022-02-17T11:46:39.1880252Z at java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975) 2022-02-17T11:46:39.1880840Z at org.apache.flink.runtime.concurrent.akka.AkkaFutureUtils$1.onComplete(AkkaFutureUtils.java:47) 2022-02-17T11:46:39.1881357Z at akka.dispatch.OnComplete.internal(Future.scala:300) 2022-02-17T11:46:39.1881788Z at akka.dispatch.OnComplete.internal(Future.scala:297) ... 2022-02-17T11:46:39.1915003Z Caused by: org.apache.flink.util.FlinkRuntimeException: Exceeded checkpoint tolerable failure threshold. 
2022-02-17T11:46:39.1915653Z at org.apache.flink.runtime.checkpoint.CheckpointFailureManager.checkFailureAgainstCounter(CheckpointFailureManager.java:160) 2022-02-17T11:46:39.1916393Z at org.apache.flink.runtime.checkpoint.CheckpointFailureManager.handleJobLevelCheckpointException(CheckpointFailureManager.java:123) 2022-02-17T11:46:39.1917125Z at org.apache.flink.runtime.checkpoint.CheckpointFailureManager.handleCheckpointException(CheckpointFailureManager.java:90) 2022-02-17T11:46:39.1917819Z at org.apache.flink.runtime.checkpoint.CheckpointCoordinator.abortPendingCheckpoint(CheckpointCoordinator.java:2046) 2022-02-17T11:46:39.1918594Z at org.apache.flink.runtime.checkpoint.CheckpointCoordinator.abortPendingCheckpoint(CheckpointCoordinator.java:2025) 2022-02-17T11:46:39.1919268Z at org.apache.flink.runtime.checkpoint.CheckpointCoordinator.access$600(CheckpointCoordinator.java:98) 2022-02-17T11:46:39.1920052Z at org.apache.flink.runtime.checkpoint.CheckpointCoordinator$CheckpointCanceller.run(CheckpointCoordinator.java:2104) 2022-02-17T11:46:39.1920852Z at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) 2022-02-17T11:46:39.1921390Z at java.util.concurrent.FutureTask.run(FutureTask.java:266) 2022-02-17T11:46:39.1922079Z at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180) 2022-02-17T11:46:39.1922785Z at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293) 2022-02-17T11:46:39.1923541Z at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) 2022-02-17T11:46:39.1924108Z at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) 2022-02-17T11:46:39.1924585Z at java.lang.Thread.run(Thread.java:748) {code} From the log, the checkpoint seems to fail due to {code:java} java.lang.IllegalStateException: Attempt to reference unknown state: 
f1a3e68c-3bac-4bd9-b68f-7968a1411a06-KeyGroupRange{startKeyGroup=0, endKeyGroup=1}-000058.sst at org.apache.flink.util.Preconditions.checkState(Preconditions.java:193) ~[flink-core-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] at org.apache.flink.runtime.state.SharedStateRegistryImpl.registerReference(SharedStateRegistryImpl.java:82) ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] at org.apache.flink.runtime.state.IncrementalRemoteKeyedStateHandle.registerSharedStates(IncrementalRemoteKeyedStateHandle.java:317) ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] at org.apache.flink.runtime.state.SharedStateRegistryImpl.registerAll(SharedStateRegistryImpl.java:172) ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] at org.apache.flink.runtime.state.changelog.ChangelogStateBackendHandle$ChangelogStateBackendHandleImpl.registerSharedStates(ChangelogStateBackendHandle.java:124) ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] at org.apache.flink.runtime.checkpoint.OperatorSubtaskState.registerSharedState(OperatorSubtaskState.java:229) ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] at org.apache.flink.runtime.checkpoint.OperatorSubtaskState.registerSharedStates(OperatorSubtaskState.java:219) ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] at org.apache.flink.runtime.checkpoint.TaskStateSnapshot.registerSharedStates(TaskStateSnapshot.java:189) ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] at org.apache.flink.runtime.checkpoint.CheckpointCoordinator.receiveAcknowledgeMessage(CheckpointCoordinator.java:1114) ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] at org.apache.flink.runtime.scheduler.ExecutionGraphHandler.lambda$acknowledgeCheckpoint$1(ExecutionGraphHandler.java:89) ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] at org.apache.flink.runtime.scheduler.ExecutionGraphHandler.lambda$processCheckpointCoordinatorMessage$3(ExecutionGraphHandler.java:119) ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_292] at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_292] at java.lang.Thread.run(Thread.java:748) [?:1.8.0_292] 11:36:48,770 [jobmanager-io-thread-14] WARN org.apache.flink.runtime.jobmaster.JobMaster [] - Error while processing AcknowledgeCheckpoint message java.lang.IllegalStateException: Attempt to reference unknown state: e0c386e6-8fdd-4277-8f5c-d4eb942c97e0-KeyGroupRange{startKeyGroup=6, endKeyGroup=7}-000057.sst at org.apache.flink.util.Preconditions.checkState(Preconditions.java:193) ~[flink-core-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] at org.apache.flink.runtime.state.SharedStateRegistryImpl.registerReference(SharedStateRegistryImpl.java:82) ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] at org.apache.flink.runtime.state.IncrementalRemoteKeyedStateHandle.registerSharedStates(IncrementalRemoteKeyedStateHandle.java:317) ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] at org.apache.flink.runtime.state.SharedStateRegistryImpl.registerAll(SharedStateRegistryImpl.java:172) ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] at org.apache.flink.runtime.state.changelog.ChangelogStateBackendHandle$ChangelogStateBackendHandleImpl.registerSharedStates(ChangelogStateBackendHandle.java:124) ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] at org.apache.flink.runtime.checkpoint.OperatorSubtaskState.registerSharedState(OperatorSubtaskState.java:229) ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] at org.apache.flink.runtime.checkpoint.OperatorSubtaskState.registerSharedStates(OperatorSubtaskState.java:219) ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] at org.apache.flink.runtime.checkpoint.TaskStateSnapshot.registerSharedStates(TaskStateSnapshot.java:189) ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] at org.apache.flink.runtime.checkpoint.CheckpointCoordinator.receiveAcknowledgeMessage(CheckpointCoordinator.java:1114) 
~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] at org.apache.flink.runtime.scheduler.ExecutionGraphHandler.lambda$acknowledgeCheckpoint$1(ExecutionGraphHandler.java:89) ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] at org.apache.flink.runtime.scheduler.ExecutionGraphHandler.lambda$processCheckpointCoordinatorMessage$3(ExecutionGraphHandler.java:119) ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_292] at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_292] at java.lang.Thread.run(Thread.java:748) [?:1.8.0_292] {code} https://dev.azure.com/apache-flink/apache-flink/_build/results?buildId=31736&view=logs&j=5c8e7682-d68f-54d1-16a2-a09310218a49&t=86f654fa-ab48-5c1a-25f4-7e7f6afb9bba&l=6325 -- This message was sent by Atlassian Jira (v8.20.1#820001)