[ https://issues.apache.org/jira/browse/FLINK-36990?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
zjjiang updated FLINK-36990:
----------------------------
    Description: Currently, SchemaRegistry encounters the following NullPointerException (NPE) during failover, which then triggers another failure:
{code:java}
...
2024-12-30 19:10:43,813 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Source: Flink CDC Event Source: mysql (1/2) (7670379b51da6bcf1592d89a4f671520_bc764cd8ddf7a0cff126f51c16239658_0_0) switched from RUNNING to CANCELING.
2024-12-30 19:10:43,813 INFO org.apache.flink.runtime.source.coordinator.SourceCoordinator [] - Removing registered reader after failure for subtask 0 (#0) of source Source: Flink CDC Event Source: mysql.
2024-12-30 19:10:43,817 INFO org.apache.flink.runtime.source.coordinator.SourceCoordinator [] - Removing registered reader after failure for subtask 1 (#0) of source Source: Flink CDC Event Source: mysql.
2024-12-30 19:10:43,817 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Source: Flink CDC Event Source: mysql (2/2) (7670379b51da6bcf1592d89a4f671520_bc764cd8ddf7a0cff126f51c16239658_1_0) switched from RUNNING to CANCELING.
2024-12-30 19:10:43,818 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - PostPartition (2/2) (7670379b51da6bcf1592d89a4f671520_fffb571f3c6daa1956cc08d8e61e216a_1_0) switched from RUNNING to CANCELING.
2024-12-30 19:10:43,818 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - PrePartition (2/2) (7670379b51da6bcf1592d89a4f671520_78be0dd8677bc2711e2a56947a5ea048_1_0) switched from RUNNING to CANCELING.
2024-12-30 19:10:43,818 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - PostPartition (1/2) (7670379b51da6bcf1592d89a4f671520_fffb571f3c6daa1956cc08d8e61e216a_0_0) switched from RUNNING to CANCELING.
2024-12-30 19:10:43,819 ERROR org.apache.flink.runtime.rpc.pekko.FencedPekkoRpcActor [] - Caught exception while executing runnable in main thread.
java.lang.NullPointerException: null
    at java.util.concurrent.ConcurrentHashMap.putVal(ConcurrentHashMap.java:1011) ~[?:1.8.0_352]
    at java.util.concurrent.ConcurrentHashMap.put(ConcurrentHashMap.java:1006) ~[?:1.8.0_352]
    at org.apache.flink.cdc.runtime.operators.schema.common.SchemaRegistry.executionAttemptFailed(SchemaRegistry.java:283) ~[?:?]
    at org.apache.flink.runtime.operators.coordination.OperatorCoordinatorHolder.executionAttemptFailed(OperatorCoordinatorHolder.java:228) ~[flink-dist-1.20.0.jar:1.20.0]
    at org.apache.flink.runtime.scheduler.DefaultScheduler.lambda$notifyCoordinatorsAboutTaskFailure$1(DefaultScheduler.java:314) ~[flink-dist-1.20.0.jar:1.20.0]
    at java.util.ArrayList.forEach(ArrayList.java:1259) ~[?:1.8.0_352]
    at java.util.Collections$UnmodifiableCollection.forEach(Collections.java:1082) ~[?:1.8.0_352]
    at org.apache.flink.runtime.scheduler.DefaultScheduler.notifyCoordinatorsAboutTaskFailure(DefaultScheduler.java:314) ~[flink-dist-1.20.0.jar:1.20.0]
    at org.apache.flink.runtime.scheduler.DefaultScheduler.notifyCoordinatorOfCancellation(DefaultScheduler.java:509) ~[flink-dist-1.20.0.jar:1.20.0]
    at org.apache.flink.runtime.scheduler.DefaultScheduler.cancelExecution(DefaultScheduler.java:441) ~[flink-dist-1.20.0.jar:1.20.0]
    at java.util.stream.ReferencePipeline$3$1.accept(ReferencePipeline.java:193) ~[?:1.8.0_352]
    at java.util.Collections$2.tryAdvance(Collections.java:4719) ~[?:1.8.0_352]
    at java.util.Collections$2.forEachRemaining(Collections.java:4727) ~[?:1.8.0_352]
    at java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:482) ~[?:1.8.0_352]
    at java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:472) ~[?:1.8.0_352]
    at java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:708) ~[?:1.8.0_352]
    at java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234) ~[?:1.8.0_352]
    at java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:566) ~[?:1.8.0_352]
    at org.apache.flink.runtime.scheduler.DefaultScheduler.cancelExecutionVertex(DefaultScheduler.java:437) ~[flink-dist-1.20.0.jar:1.20.0]
    at java.util.stream.ReferencePipeline$3$1.accept(ReferencePipeline.java:193) ~[?:1.8.0_352]
    at java.util.HashMap$KeySpliterator.forEachRemaining(HashMap.java:1580) ~[?:1.8.0_352]
    at java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:482) ~[?:1.8.0_352]
    at java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:472) ~[?:1.8.0_352]
    at java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:708) ~[?:1.8.0_352]
    at java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234) ~[?:1.8.0_352]
    at java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:566) ~[?:1.8.0_352]
    at org.apache.flink.runtime.scheduler.DefaultScheduler.cancelTasksAsync(DefaultScheduler.java:428) ~[flink-dist-1.20.0.jar:1.20.0]
    at org.apache.flink.runtime.scheduler.DefaultScheduler.restartTasksWithDelay(DefaultScheduler.java:364) ~[flink-dist-1.20.0.jar:1.20.0]
    at org.apache.flink.runtime.scheduler.DefaultScheduler.maybeRestartTasks(DefaultScheduler.java:330) ~[flink-dist-1.20.0.jar:1.20.0]
    at org.apache.flink.runtime.scheduler.DefaultScheduler.handleGlobalFailure(DefaultScheduler.java:325) ~[flink-dist-1.20.0.jar:1.20.0]
    at org.apache.flink.runtime.operators.coordination.OperatorCoordinatorHolder.checkpointCoordinatorInternal(OperatorCoordinatorHolder.java:339) ~[flink-dist-1.20.0.jar:1.20.0]
    at org.apache.flink.runtime.operators.coordination.OperatorCoordinatorHolder.lambda$checkpointCoordinator$0(OperatorCoordinatorHolder.java:248) ~[flink-dist-1.20.0.jar:1.20.0]
    at org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.lambda$handleRunAsync$4(PekkoRpcActor.java:460) ~[flink-rpc-akka00d41e2b-3940-4ffd-9308-83099bf3a317.jar:1.20.0]
    at org.apache.flink.runtime.concurrent.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68) ~[flink-dist-1.20.0.jar:1.20.0]
    at org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleRunAsync(PekkoRpcActor.java:460) ~[flink-rpc-akka00d41e2b-3940-4ffd-9308-83099bf3a317.jar:1.20.0]
    at org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleRpcMessage(PekkoRpcActor.java:225) ~[flink-rpc-akka00d41e2b-3940-4ffd-9308-83099bf3a317.jar:1.20.0]
    at org.apache.flink.runtime.rpc.pekko.FencedPekkoRpcActor.handleRpcMessage(FencedPekkoRpcActor.java:88) ~[flink-rpc-akka00d41e2b-3940-4ffd-9308-83099bf3a317.jar:1.20.0]
    at org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleMessage(PekkoRpcActor.java:174) ~[flink-rpc-akka00d41e2b-3940-4ffd-9308-83099bf3a317.jar:1.20.0]
    at org.apache.pekko.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:33) [flink-rpc-akka00d41e2b-3940-4ffd-9308-83099bf3a317.jar:1.20.0]
    at org.apache.pekko.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:29) [flink-rpc-akka00d41e2b-3940-4ffd-9308-83099bf3a317.jar:1.20.0]
    at scala.PartialFunction.applyOrElse(PartialFunction.scala:127) [flink-rpc-akka00d41e2b-3940-4ffd-9308-83099bf3a317.jar:1.20.0]
    at scala.PartialFunction.applyOrElse$(PartialFunction.scala:126) [flink-rpc-akka00d41e2b-3940-4ffd-9308-83099bf3a317.jar:1.20.0]
    at org.apache.pekko.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:29) [flink-rpc-akka00d41e2b-3940-4ffd-9308-83099bf3a317.jar:1.20.0]
    at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:175) [flink-rpc-akka00d41e2b-3940-4ffd-9308-83099bf3a317.jar:1.20.0]
    at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) [flink-rpc-akka00d41e2b-3940-4ffd-9308-83099bf3a317.jar:1.20.0]
    at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) [flink-rpc-akka00d41e2b-3940-4ffd-9308-83099bf3a317.jar:1.20.0]
    at org.apache.pekko.actor.Actor.aroundReceive(Actor.scala:547) [flink-rpc-akka00d41e2b-3940-4ffd-9308-83099bf3a317.jar:1.20.0]
    at org.apache.pekko.actor.Actor.aroundReceive$(Actor.scala:545) [flink-rpc-akka00d41e2b-3940-4ffd-9308-83099bf3a317.jar:1.20.0]
    at org.apache.pekko.actor.AbstractActor.aroundReceive(AbstractActor.scala:229) [flink-rpc-akka00d41e2b-3940-4ffd-9308-83099bf3a317.jar:1.20.0]
    at org.apache.pekko.actor.ActorCell.receiveMessage(ActorCell.scala:590) [flink-rpc-akka00d41e2b-3940-4ffd-9308-83099bf3a317.jar:1.20.0]
    at org.apache.pekko.actor.ActorCell.invoke(ActorCell.scala:557) [flink-rpc-akka00d41e2b-3940-4ffd-9308-83099bf3a317.jar:1.20.0]
    at org.apache.pekko.dispatch.Mailbox.processMailbox(Mailbox.scala:280) [flink-rpc-akka00d41e2b-3940-4ffd-9308-83099bf3a317.jar:1.20.0]
    at org.apache.pekko.dispatch.Mailbox.run(Mailbox.scala:241) [flink-rpc-akka00d41e2b-3940-4ffd-9308-83099bf3a317.jar:1.20.0]
    at org.apache.pekko.dispatch.Mailbox.exec(Mailbox.scala:253) [flink-rpc-akka00d41e2b-3940-4ffd-9308-83099bf3a317.jar:1.20.0]
    at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289) [?:1.8.0_352]
    at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1056) [?:1.8.0_352]
    at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1692) [?:1.8.0_352]
    at java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:175) [?:1.8.0_352]
2024-12-30 19:10:43,824 INFO org.apache.flink.runtime.source.coordinator.SourceCoordinator [] - Marking checkpoint 5 as aborted for source Source: Flink CDC Event Source: mysql.
2024-12-30 19:10:43,854 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Source: Flink CDC Event Source: mysql (2/2) (7670379b51da6bcf1592d89a4f671520_bc764cd8ddf7a0cff126f51c16239658_1_0) switched from CANCELING to CANCELED.
2024-12-30 19:10:43,856 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - PostPartition (1/2) (7670379b51da6bcf1592d89a4f671520_fffb571f3c6daa1956cc08d8e61e216a_0_0) switched from CANCELING to CANCELED.
2024-12-30 19:10:43,857 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - PrePartition (2/2) (7670379b51da6bcf1592d89a4f671520_78be0dd8677bc2711e2a56947a5ea048_1_0) switched from CANCELING to CANCELED.
...
{code}
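The immediate trigger is visible at the top of the trace: ConcurrentHashMap rejects null keys and values, so the put() performed inside SchemaRegistry.executionAttemptFailed (SchemaRegistry.java:283) must be receiving a null argument. The coordinator callback executionAttemptFailed(int subtask, int attemptNumber, Throwable reason) can be invoked with a null reason when tasks are cancelled during failover, and storing that reason directly in a ConcurrentHashMap then fails. The snippet below is a minimal, self-contained sketch (the map and method names are illustrative placeholders, not the actual SchemaRegistry implementation) that reproduces the failure mode and shows a null-safe variant:
{code:java}
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

public class SchemaRegistryNpeSketch {

    // Illustrative stand-in for the coordinator's per-subtask failure bookkeeping
    // (hypothetical field name, not the actual SchemaRegistry field).
    private static final Map<Integer, Throwable> failedReasons = new ConcurrentHashMap<>();

    // Mirrors the shape of OperatorCoordinator#executionAttemptFailed, whose
    // 'reason' argument may be null when a task is cancelled during failover.
    static void executionAttemptFailedUnsafe(int subtask, int attemptNumber, Throwable reason) {
        // ConcurrentHashMap.put throws NullPointerException for null values,
        // matching the putVal/put frames at the top of the stack trace above.
        failedReasons.put(subtask, reason);
    }

    // Null-safe sketch: record a placeholder cause instead of the null reason.
    static void executionAttemptFailedSafe(int subtask, int attemptNumber, Throwable reason) {
        Throwable recorded = (reason != null)
                ? reason
                : new RuntimeException("Execution attempt " + attemptNumber
                        + " of subtask " + subtask + " failed without a reported cause");
        failedReasons.put(subtask, recorded);
    }

    public static void main(String[] args) {
        executionAttemptFailedSafe(0, 0, null);   // ok: placeholder cause is stored
        executionAttemptFailedUnsafe(1, 0, null); // throws NullPointerException, as in the log
    }
}
{code}
A fix along these lines (substituting a placeholder cause, or simply skipping the bookkeeping when the reason is null) would keep the coordinator from failing again while it is already handling a task failure.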
> SchemaRegistry fails on failover due to NPE exception caught while executing runnable in main thread
> ----------------------------------------------------------------------------------------------------
>
>                 Key: FLINK-36990
>                 URL: https://issues.apache.org/jira/browse/FLINK-36990
>             Project: Flink
>          Issue Type: Bug
>          Components: Flink CDC
>    Affects Versions: cdc-3.3.0
>            Reporter: zjjiang
>            Priority: Major
>
--
This message was sent by Atlassian Jira
(v8.20.10#820010)