[ https://issues.apache.org/jira/browse/IGNITE-24742?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Kirill Sizov updated IGNITE-24742: ----------------------------------- Description: *Preconditions* Randomly start and stop nodes of the cluster and immediately perform manual reset. *Expected behavior* Manual reset completes successfully *Actual behavior* Sometimes manual reset fails. {noformat} org.apache.ignite.compute.ComputeException: Job execution failed: org.apache.ignite.internal.table.distributed.disaster.exceptions.DisasterRecoveryException: IGN-RECOVERY-3 TraceId:3ccaf9ac-39ab-4c51-88df-org.apache.ignite.internal.table.distributed.disaster.exceptions.DisasterRecoveryException: IGN-RECOVERY-3 TraceId:3ccaf9ac-39ab-4c51-88df-1fe2f38e2e0d io.netty.channel.AbstractChannel$AnnotatedConnectException: Connection refused: /192.168.210.9:3344 at java.lang.invoke.MethodHandle.invokeWithArguments(MethodHandle.java:733) ~[?:?] at org.apache.ignite.internal.util.ExceptionUtils$1.copy(ExceptionUtils.java:877) ~[ignite-core-3.1.0-SNAPSHOT.jar:?] at org.apache.ignite.internal.util.ExceptionUtils$ExceptionFactory.createCopy(ExceptionUtils.java:811) ~[ignite-core-3.1.0-SNAPSHOT.jar:?] at org.apache.ignite.internal.util.ExceptionUtils.copyExceptionWithCause(ExceptionUtils.java:613) ~[ignite-core-3.1.0-SNAPSHOT.jar:?] at org.apache.ignite.internal.client.TcpClientChannel.readError(TcpClientChannel.java:555) ~[ignite-client-3.1.0-SNAPSHOT.jar:?] at org.apache.ignite.internal.client.TcpClientChannel.processNextMessage(TcpClientChannel.java:449) ~[ignite-client-3.1.0-SNAPSHOT.jar:?] at org.apache.ignite.internal.client.TcpClientChannel.onMessage(TcpClientChannel.java:272) ~[ignite-client-3.1.0-SNAPSHOT.jar:?] at org.apache.ignite.internal.client.io.netty.NettyClientConnection.onMessage(NettyClientConnection.java:117) ~[ignite-client-3.1.0-SNAPSHOT.jar:?] at org.apache.ignite.internal.client.io.netty.NettyClientMessageHandler.channelRead(NettyClientMessageHandler.java:33) ~[ignite-client-3.1.0-SNAPSHOT.jar:?] 
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:444) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:346) ~[netty-codec-4.1.119.Final.jar:4.1.119.Final] at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:318) ~[netty-codec-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:444) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1357) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:440) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:868) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:166) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at 
io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:796) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:732) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:658) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:562) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:998) ~[netty-common-4.1.119.Final.jar:4.1.119.Final] at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) ~[netty-common-4.1.119.Final.jar:4.1.119.Final] at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) ~[netty-common-4.1.119.Final.jar:4.1.119.Final] ... 1 more {noformat} The exception on server is: {noformat} 2025-03-10 08:10:22:071 +0000 [ERROR][%poc-tester-SERVER-192.168.210.164-id-0%partition-operations-8][GroupUpdateRequest] Failed to reset partition java.util.concurrent.CompletionException: org.apache.ignite.internal.table.distributed.disaster.exceptions.DisasterRecoveryException: IGN-RECOVERY-3 TraceId:3ccaf9ac-39ab-4c51-88df-1fe2f38e2e0d io.netty.channel.AbstractChannel$AnnotatedConnectException: Connection refused: /192.168.210.9:3344 at java.base/java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:315) at java.base/java.util.concurrent.CompletableFuture.completeThrowable(CompletableFuture.java:320) at java.base/java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:936) at java.base/java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:911) at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:510) at java.base/java.util.concurrent.CompletableFuture.postFire(CompletableFuture.java:614) at 
java.base/java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:914) at java.base/java.util.concurrent.CompletableFuture$Completion.run(CompletableFuture.java:482) at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642) at java.base/java.lang.Thread.run(Thread.java:1583) Caused by: org.apache.ignite.internal.table.distributed.disaster.exceptions.DisasterRecoveryException: IGN-RECOVERY-3 TraceId:3ccaf9ac-39ab-4c51-88df-1fe2f38e2e0d io.netty.channel.AbstractChannel$AnnotatedConnectException: Connection refused: /192.168.210.9:3344 at org.apache.ignite.internal.table.distributed.disaster.DisasterRecoveryManager.lambda$localPartitionStatesInternal$9(DisasterRecoveryManager.java:550) at java.base/java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:934) ... 8 more Caused by: java.util.concurrent.CompletionException: io.netty.channel.AbstractChannel$AnnotatedConnectException: Connection refused: /192.168.210.9:3344 at java.base/java.util.concurrent.CompletableFuture.encodeRelay(CompletableFuture.java:368) at java.base/java.util.concurrent.CompletableFuture.completeRelay(CompletableFuture.java:377) at java.base/java.util.concurrent.CompletableFuture$UniCompose.tryFire(CompletableFuture.java:1152) at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:510) at java.base/java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:2194) at org.apache.ignite.internal.network.netty.NettyUtils.lambda$toCompletableFuture$0(NettyUtils.java:74) at io.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:590) at io.netty.util.concurrent.DefaultPromise.notifyListeners0(DefaultPromise.java:583) at io.netty.util.concurrent.DefaultPromise.notifyListenersNow(DefaultPromise.java:559) at 
io.netty.util.concurrent.DefaultPromise.notifyListeners(DefaultPromise.java:492) at io.netty.util.concurrent.DefaultPromise.setValue0(DefaultPromise.java:636) at io.netty.util.concurrent.DefaultPromise.setFailure0(DefaultPromise.java:629) at io.netty.util.concurrent.DefaultPromise.tryFailure(DefaultPromise.java:118) at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.fulfillConnectPromise(AbstractNioChannel.java:326) at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:342) at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:784) at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:732) at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:658) at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:562) at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:998) at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) ... 
1 more Caused by: io.netty.channel.AbstractChannel$AnnotatedConnectException: Connection refused: /192.168.210.9:3344 Caused by: java.net.ConnectException: Connection refused at java.base/sun.nio.ch.Net.pollConnect(Native Method) at java.base/sun.nio.ch.Net.pollConnectNow(Net.java:682) at java.base/sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:973) at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:336) at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:339) at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:784) at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:732) at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:658) at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:562) at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:998) at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) at java.base/java.lang.Thread.run(Thread.java:1583){noformat} Likely the logical topology has not been updated at the time of the request, so the disaster recovery manager sends messages to all known nodes, receives a connection exception and fails the whole reset. *Implementation details* The most natural solution to this issue is to add a reasonable retry policy to the {{localPartitionStates}} method. was: *Preconditions* Randomly start and stop nodes of the cluster and immediately perform manual reset. *Expected behavior* Manual reset completes successfully *Actual behavior* Sometimes manual reset fails. 
{noformat} org.apache.ignite.compute.ComputeException: Job execution failed: org.apache.ignite.internal.table.distributed.disaster.exceptions.DisasterRecoveryException: IGN-RECOVERY-3 TraceId:3ccaf9ac-39ab-4c51-88df-org.apache.ignite.internal.table.distributed.disaster.exceptions.DisasterRecoveryException: IGN-RECOVERY-3 TraceId:3ccaf9ac-39ab-4c51-88df-1fe2f38e2e0d io.netty.channel.AbstractChannel$AnnotatedConnectException: Connection refused: /192.168.210.9:3344 at java.lang.invoke.MethodHandle.invokeWithArguments(MethodHandle.java:733) ~[?:?] at org.apache.ignite.internal.util.ExceptionUtils$1.copy(ExceptionUtils.java:877) ~[ignite-core-3.1.0-SNAPSHOT.jar:?] at org.apache.ignite.internal.util.ExceptionUtils$ExceptionFactory.createCopy(ExceptionUtils.java:811) ~[ignite-core-3.1.0-SNAPSHOT.jar:?] at org.apache.ignite.internal.util.ExceptionUtils.copyExceptionWithCause(ExceptionUtils.java:613) ~[ignite-core-3.1.0-SNAPSHOT.jar:?] at org.apache.ignite.internal.client.TcpClientChannel.readError(TcpClientChannel.java:555) ~[ignite-client-3.1.0-SNAPSHOT.jar:?] at org.apache.ignite.internal.client.TcpClientChannel.processNextMessage(TcpClientChannel.java:449) ~[ignite-client-3.1.0-SNAPSHOT.jar:?] at org.apache.ignite.internal.client.TcpClientChannel.onMessage(TcpClientChannel.java:272) ~[ignite-client-3.1.0-SNAPSHOT.jar:?] at org.apache.ignite.internal.client.io.netty.NettyClientConnection.onMessage(NettyClientConnection.java:117) ~[ignite-client-3.1.0-SNAPSHOT.jar:?] at org.apache.ignite.internal.client.io.netty.NettyClientMessageHandler.channelRead(NettyClientMessageHandler.java:33) ~[ignite-client-3.1.0-SNAPSHOT.jar:?] 
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:444) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:346) ~[netty-codec-4.1.119.Final.jar:4.1.119.Final] at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:318) ~[netty-codec-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:444) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1357) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:440) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:868) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:166) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at 
io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:796) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:732) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:658) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:562) ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:998) ~[netty-common-4.1.119.Final.jar:4.1.119.Final] at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) ~[netty-common-4.1.119.Final.jar:4.1.119.Final] at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) ~[netty-common-4.1.119.Final.jar:4.1.119.Final] ... 1 more {noformat} The exception on server is: {noformat} 2025-03-10 08:10:22:071 +0000 [ERROR][%poc-tester-SERVER-192.168.210.164-id-0%partition-operations-8][GroupUpdateRequest] Failed to reset partition java.util.concurrent.CompletionException: org.apache.ignite.internal.table.distributed.disaster.exceptions.DisasterRecoveryException: IGN-RECOVERY-3 TraceId:3ccaf9ac-39ab-4c51-88df-1fe2f38e2e0d io.netty.channel.AbstractChannel$AnnotatedConnectException: Connection refused: /192.168.210.9:3344 at java.base/java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:315) at java.base/java.util.concurrent.CompletableFuture.completeThrowable(CompletableFuture.java:320) at java.base/java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:936) at java.base/java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:911) at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:510) at java.base/java.util.concurrent.CompletableFuture.postFire(CompletableFuture.java:614) at 
java.base/java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:914) at java.base/java.util.concurrent.CompletableFuture$Completion.run(CompletableFuture.java:482) at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642) at java.base/java.lang.Thread.run(Thread.java:1583) Caused by: org.apache.ignite.internal.table.distributed.disaster.exceptions.DisasterRecoveryException: IGN-RECOVERY-3 TraceId:3ccaf9ac-39ab-4c51-88df-1fe2f38e2e0d io.netty.channel.AbstractChannel$AnnotatedConnectException: Connection refused: /192.168.210.9:3344 at org.apache.ignite.internal.table.distributed.disaster.DisasterRecoveryManager.lambda$localPartitionStatesInternal$9(DisasterRecoveryManager.java:550) at java.base/java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:934) ... 8 more Caused by: java.util.concurrent.CompletionException: io.netty.channel.AbstractChannel$AnnotatedConnectException: Connection refused: /192.168.210.9:3344 at java.base/java.util.concurrent.CompletableFuture.encodeRelay(CompletableFuture.java:368) at java.base/java.util.concurrent.CompletableFuture.completeRelay(CompletableFuture.java:377) at java.base/java.util.concurrent.CompletableFuture$UniCompose.tryFire(CompletableFuture.java:1152) at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:510) at java.base/java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:2194) at org.apache.ignite.internal.network.netty.NettyUtils.lambda$toCompletableFuture$0(NettyUtils.java:74) at io.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:590) at io.netty.util.concurrent.DefaultPromise.notifyListeners0(DefaultPromise.java:583) at io.netty.util.concurrent.DefaultPromise.notifyListenersNow(DefaultPromise.java:559) at 
io.netty.util.concurrent.DefaultPromise.notifyListeners(DefaultPromise.java:492) at io.netty.util.concurrent.DefaultPromise.setValue0(DefaultPromise.java:636) at io.netty.util.concurrent.DefaultPromise.setFailure0(DefaultPromise.java:629) at io.netty.util.concurrent.DefaultPromise.tryFailure(DefaultPromise.java:118) at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.fulfillConnectPromise(AbstractNioChannel.java:326) at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:342) at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:784) at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:732) at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:658) at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:562) at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:998) at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) ... 
1 more Caused by: io.netty.channel.AbstractChannel$AnnotatedConnectException: Connection refused: /192.168.210.9:3344 Caused by: java.net.ConnectException: Connection refused at java.base/sun.nio.ch.Net.pollConnect(Native Method) at java.base/sun.nio.ch.Net.pollConnectNow(Net.java:682) at java.base/sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:973) at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:336) at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:339) at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:784) at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:732) at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:658) at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:562) at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:998) at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) at java.base/java.lang.Thread.run(Thread.java:1583){noformat} Likely the logical topology has not been updated at the time of the request, disaster recovery manager sends messages to all known nodes, receives a connection exception and fails the whole reset. > Disaster recovery fails on unstable topology > -------------------------------------------- > > Key: IGNITE-24742 > URL: https://issues.apache.org/jira/browse/IGNITE-24742 > Project: Ignite > Issue Type: Bug > Reporter: Kirill Sizov > Priority: Major > Labels: ignite-3 > > *Preconditions* > Randomly start and stop nodes of the cluster and immediately perform manual > reset. > *Expected behavior* > Manual reset completes successfully > *Actual behavior* > Sometimes manual reset fails. 
> {noformat} > org.apache.ignite.compute.ComputeException: Job execution failed: > org.apache.ignite.internal.table.distributed.disaster.exceptions.DisasterRecoveryException: > IGN-RECOVERY-3 > TraceId:3ccaf9ac-39ab-4c51-88df-org.apache.ignite.internal.table.distributed.disaster.exceptions.DisasterRecoveryException: > IGN-RECOVERY-3 TraceId:3ccaf9ac-39ab-4c51-88df-1fe2f38e2e0d > io.netty.channel.AbstractChannel$AnnotatedConnectException: Connection > refused: /192.168.210.9:3344 > at > java.lang.invoke.MethodHandle.invokeWithArguments(MethodHandle.java:733) > ~[?:?] > at > org.apache.ignite.internal.util.ExceptionUtils$1.copy(ExceptionUtils.java:877) > ~[ignite-core-3.1.0-SNAPSHOT.jar:?] > at > org.apache.ignite.internal.util.ExceptionUtils$ExceptionFactory.createCopy(ExceptionUtils.java:811) > ~[ignite-core-3.1.0-SNAPSHOT.jar:?] > at > org.apache.ignite.internal.util.ExceptionUtils.copyExceptionWithCause(ExceptionUtils.java:613) > ~[ignite-core-3.1.0-SNAPSHOT.jar:?] > at > org.apache.ignite.internal.client.TcpClientChannel.readError(TcpClientChannel.java:555) > ~[ignite-client-3.1.0-SNAPSHOT.jar:?] > at > org.apache.ignite.internal.client.TcpClientChannel.processNextMessage(TcpClientChannel.java:449) > ~[ignite-client-3.1.0-SNAPSHOT.jar:?] > at > org.apache.ignite.internal.client.TcpClientChannel.onMessage(TcpClientChannel.java:272) > ~[ignite-client-3.1.0-SNAPSHOT.jar:?] > at > org.apache.ignite.internal.client.io.netty.NettyClientConnection.onMessage(NettyClientConnection.java:117) > ~[ignite-client-3.1.0-SNAPSHOT.jar:?] > at > org.apache.ignite.internal.client.io.netty.NettyClientMessageHandler.channelRead(NettyClientMessageHandler.java:33) > ~[ignite-client-3.1.0-SNAPSHOT.jar:?] 
> at > io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:444) > ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] > at > io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420) > ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] > at > io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412) > ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] > at > io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:346) > ~[netty-codec-4.1.119.Final.jar:4.1.119.Final] > at > io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:318) > ~[netty-codec-4.1.119.Final.jar:4.1.119.Final] > at > io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:444) > ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] > at > io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420) > ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] > at > io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412) > ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] > at > io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1357) > ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] > at > io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:440) > ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] > at > io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420) > ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] > at > io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:868) > ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] > at > 
io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:166) > ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] > at > io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:796) > ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] > at > io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:732) > ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] > at > io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:658) > ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] > at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:562) > ~[netty-transport-4.1.119.Final.jar:4.1.119.Final] > at > io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:998) > ~[netty-common-4.1.119.Final.jar:4.1.119.Final] > at > io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) > ~[netty-common-4.1.119.Final.jar:4.1.119.Final] > at > io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) > ~[netty-common-4.1.119.Final.jar:4.1.119.Final] > ... 
1 more {noformat} > The exception on server is: > {noformat} > 2025-03-10 08:10:22:071 +0000 > [ERROR][%poc-tester-SERVER-192.168.210.164-id-0%partition-operations-8][GroupUpdateRequest] > Failed to reset partition > java.util.concurrent.CompletionException: > org.apache.ignite.internal.table.distributed.disaster.exceptions.DisasterRecoveryException: > IGN-RECOVERY-3 TraceId:3ccaf9ac-39ab-4c51-88df-1fe2f38e2e0d > io.netty.channel.AbstractChannel$AnnotatedConnectException: Connection > refused: /192.168.210.9:3344 > at > java.base/java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:315) > at > java.base/java.util.concurrent.CompletableFuture.completeThrowable(CompletableFuture.java:320) > at > java.base/java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:936) > at > java.base/java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:911) > at > java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:510) > at > java.base/java.util.concurrent.CompletableFuture.postFire(CompletableFuture.java:614) > at > java.base/java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:914) > at > java.base/java.util.concurrent.CompletableFuture$Completion.run(CompletableFuture.java:482) > at > java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144) > at > java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642) > at java.base/java.lang.Thread.run(Thread.java:1583) > Caused by: > org.apache.ignite.internal.table.distributed.disaster.exceptions.DisasterRecoveryException: > IGN-RECOVERY-3 TraceId:3ccaf9ac-39ab-4c51-88df-1fe2f38e2e0d > io.netty.channel.AbstractChannel$AnnotatedConnectException: Connection > refused: /192.168.210.9:3344 > at > org.apache.ignite.internal.table.distributed.disaster.DisasterRecoveryManager.lambda$localPartitionStatesInternal$9(DisasterRecoveryManager.java:550) > at > 
java.base/java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:934) > ... 8 more > Caused by: java.util.concurrent.CompletionException: > io.netty.channel.AbstractChannel$AnnotatedConnectException: Connection > refused: /192.168.210.9:3344 > at > java.base/java.util.concurrent.CompletableFuture.encodeRelay(CompletableFuture.java:368) > at > java.base/java.util.concurrent.CompletableFuture.completeRelay(CompletableFuture.java:377) > at > java.base/java.util.concurrent.CompletableFuture$UniCompose.tryFire(CompletableFuture.java:1152) > at > java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:510) > at > java.base/java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:2194) > at > org.apache.ignite.internal.network.netty.NettyUtils.lambda$toCompletableFuture$0(NettyUtils.java:74) > at > io.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:590) > at > io.netty.util.concurrent.DefaultPromise.notifyListeners0(DefaultPromise.java:583) > at > io.netty.util.concurrent.DefaultPromise.notifyListenersNow(DefaultPromise.java:559) > at > io.netty.util.concurrent.DefaultPromise.notifyListeners(DefaultPromise.java:492) > at > io.netty.util.concurrent.DefaultPromise.setValue0(DefaultPromise.java:636) > at > io.netty.util.concurrent.DefaultPromise.setFailure0(DefaultPromise.java:629) > at > io.netty.util.concurrent.DefaultPromise.tryFailure(DefaultPromise.java:118) > at > io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.fulfillConnectPromise(AbstractNioChannel.java:326) > at > io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:342) > at > io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:784) > at > io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:732) > at > io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:658) > at 
io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:562) > at > io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:998) > at > io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) > at > io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) > ... 1 more > Caused by: io.netty.channel.AbstractChannel$AnnotatedConnectException: > Connection refused: /192.168.210.9:3344 > Caused by: java.net.ConnectException: Connection refused > at java.base/sun.nio.ch.Net.pollConnect(Native Method) > at java.base/sun.nio.ch.Net.pollConnectNow(Net.java:682) > at > java.base/sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:973) > at > io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:336) > at > io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:339) > at > io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:784) > at > io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:732) > at > io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:658) > at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:562) > at > io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:998) > at > io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) > at > io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) > at java.base/java.lang.Thread.run(Thread.java:1583){noformat} > Likely the logical topology has not been updated at the time of the request, > disaster recovery manager sends messages to all known nodes, receives a > connection exception and fails the whole reset. > *Implementation details* > The most natural solution to this issue is to add a reasonable retry policy > to the {{localPartitionStates}} method. 
-- This message was sent by Atlassian Jira (v8.20.10#820010)