alievmirza commented on code in PR #4686:
URL: https://github.com/apache/ignite-3/pull/4686#discussion_r1833184562

##########
modules/table/src/integrationTest/java/org/apache/ignite/internal/disaster/ItDisasterRecoveryReconfigurationTest.java:
##########
@@ -434,6 +443,164 @@ public void testIncompleteRebalanceAfterResetPartitions() throws Exception {
         });
     }
 
+    /**
+     * Tests that, in the situation from {@link #testInsertFailsIfMajorityIsLost()}, it is possible to recover the partition
+     * using the disaster recovery API, but with the manual flag set to {@code false}. We expect that in this case the
+     * replica factor won't be restored. In this test, assignments will be (1, 3, 4), according to
+     * {@link RendezvousDistributionFunction}.
+     */
+    @Test
+    @ZoneParams(nodes = 5, replicas = 3, partitions = 1)
+    void testAutomaticRebalanceIfMajorityIsLost() throws Exception {
+        int partId = 0;
+
+        IgniteImpl node0 = unwrapIgniteImpl(cluster.node(0));
+        Table table = node0.tables().table(TABLE_NAME);
+
+        awaitPrimaryReplica(node0, partId);
+
+        assertRealAssignments(node0, partId, 1, 3, 4);
+
+        stopNodesInParallel(3, 4);
+
+        waitForScale(node0, 3);
+
+        CompletableFuture<?> updateFuture = node0.disasterRecoveryManager().resetPartitions(
+                zoneName,
+                QUALIFIED_TABLE_NAME,
+                Set.of(),
+                false
+        );
+
+        assertThat(updateFuture, willSucceedIn(60, SECONDS));
+
+        awaitPrimaryReplica(node0, partId);
+
+        assertRealAssignments(node0, partId, 1);
+
+        List<Throwable> errors = insertValues(table, partId, 0);
+        assertThat(errors, is(empty()));
+
+        // Check that there is no ongoing or planned rebalance.
+        assertNull(getPendingAssignments(node0, partId));
+
+        assertRealAssignments(node0, partId, 1);
+    }
+
+    /**
+     * Tests a scenario where there's a single partition on node 1, and the node that hosts it is lost. A non-manual reset
+     * should do nothing in that case, so no new pending or planned assignments appear.
+     */
+    @Test
+    @ZoneParams(nodes = 2, replicas = 1, partitions = 1)
+    void testAutomaticRebalanceIfPartitionIsLost() throws Exception {
+        int partId = 0;
+
+        IgniteImpl node0 = unwrapIgniteImpl(cluster.node(0));
+
+        IgniteImpl node1 = unwrapIgniteImpl(cluster.node(1));
+
+        executeSql(format("ALTER ZONE %s SET data_nodes_auto_adjust_scale_down=%d", zoneName, INFINITE_TIMER_VALUE));
+
+        awaitPrimaryReplica(node0, partId);
+
+        assertRealAssignments(node0, partId, 1);
+
+        stopNodesInParallel(1);
+
+        CompletableFuture<?> updateFuture = node0.disasterRecoveryManager().resetPartitions(
+                zoneName,
+                QUALIFIED_TABLE_NAME,
+                Set.of(),
+                false
+        );
+
+        assertThat(updateFuture, willCompleteSuccessfully());
+
+        // This is needed to be sure that all previous meta storage events are handled.
+        executeSql(format("CREATE ZONE %s with storage_profiles='%s'",
+                "FAKE_ZONE", DEFAULT_STORAGE_PROFILE
+        ));
+
+        // Check that there is no ongoing or planned rebalance.

Review Comment:
   done
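
The behavioral difference under review is carried entirely by the last argument of `resetPartitions`: `true` requests a manual reset, `false` the automatic one exercised by these tests. A minimal sketch of the two call shapes, reusing the signature from the diff above; the comments paraphrase the test's Javadoc, not the API documentation:

    // Manual reset: per the Javadoc above, the variant that may restore
    // the replica factor after a majority loss.
    CompletableFuture<?> manualReset = node0.disasterRecoveryManager().resetPartitions(
            zoneName,
            QUALIFIED_TABLE_NAME,
            Set.of(),   // no specific partitions listed, as in the tests above
            true
    );

    // Non-manual (automatic) reset: the replica factor is not expected to be restored.
    CompletableFuture<?> automaticReset = node0.disasterRecoveryManager().resetPartitions(
            zoneName,
            QUALIFIED_TABLE_NAME,
            Set.of(),
            false
    );
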
##########
modules/table/src/integrationTest/java/org/apache/ignite/internal/disaster/ItDisasterRecoveryReconfigurationTest.java:
##########
@@ -434,6 +443,164 @@ public void testIncompleteRebalanceAfterResetPartitions() throws Exception {
         });
     }
 
+    /**
+     * Tests that, in the situation from {@link #testInsertFailsIfMajorityIsLost()}, it is possible to recover the partition
+     * using the disaster recovery API, but with the manual flag set to {@code false}. We expect that in this case the
+     * replica factor won't be restored. In this test, assignments will be (1, 3, 4), according to
+     * {@link RendezvousDistributionFunction}.
+     */
+    @Test
+    @ZoneParams(nodes = 5, replicas = 3, partitions = 1)
+    void testAutomaticRebalanceIfMajorityIsLost() throws Exception {
+        int partId = 0;
+
+        IgniteImpl node0 = unwrapIgniteImpl(cluster.node(0));
+        Table table = node0.tables().table(TABLE_NAME);
+
+        awaitPrimaryReplica(node0, partId);
+
+        assertRealAssignments(node0, partId, 1, 3, 4);
+
+        stopNodesInParallel(3, 4);
+
+        waitForScale(node0, 3);
+
+        CompletableFuture<?> updateFuture = node0.disasterRecoveryManager().resetPartitions(
+                zoneName,
+                QUALIFIED_TABLE_NAME,
+                Set.of(),
+                false
+        );
+
+        assertThat(updateFuture, willSucceedIn(60, SECONDS));
+
+        awaitPrimaryReplica(node0, partId);
+
+        assertRealAssignments(node0, partId, 1);
+
+        List<Throwable> errors = insertValues(table, partId, 0);
+        assertThat(errors, is(empty()));
+
+        // Check that there is no ongoing or planned rebalance.
+        assertNull(getPendingAssignments(node0, partId));
+
+        assertRealAssignments(node0, partId, 1);
+    }
+
+    /**
+     * Tests a scenario where there's a single partition on node 1, and the node that hosts it is lost. A non-manual reset
+     * should do nothing in that case, so no new pending or planned assignments appear.
+     */
+    @Test
+    @ZoneParams(nodes = 2, replicas = 1, partitions = 1)
+    void testAutomaticRebalanceIfPartitionIsLost() throws Exception {
+        int partId = 0;
+
+        IgniteImpl node0 = unwrapIgniteImpl(cluster.node(0));
+
+        IgniteImpl node1 = unwrapIgniteImpl(cluster.node(1));
+
+        executeSql(format("ALTER ZONE %s SET data_nodes_auto_adjust_scale_down=%d", zoneName, INFINITE_TIMER_VALUE));
+
+        awaitPrimaryReplica(node0, partId);
+
+        assertRealAssignments(node0, partId, 1);
+
+        stopNodesInParallel(1);
+
+        CompletableFuture<?> updateFuture = node0.disasterRecoveryManager().resetPartitions(
+                zoneName,
+                QUALIFIED_TABLE_NAME,
+                Set.of(),
+                false
+        );
+
+        assertThat(updateFuture, willCompleteSuccessfully());
+
+        // This is needed to be sure that all previous meta storage events are handled.
+        executeSql(format("CREATE ZONE %s with storage_profiles='%s'",
+                "FAKE_ZONE", DEFAULT_STORAGE_PROFILE
+        ));
+
+        // Check that there is no ongoing or planned rebalance.
+        assertNull(getPendingAssignments(node0, partId));
+
+        assertEquals(1, getStableAssignments(node0, partId).nodes().size());
+
+        assertEquals(node1.name(), getStableAssignments(node0, partId).nodes().stream().findFirst().get().consistentId());
+    }
+
+    /**
+     * Tests a scenario where only one node from the stable assignments is left, but there is a node in the pending
+     * assignments, and we perform the reset partition operation. We expect the node from pending to be present after
+     * the reset, i.e. the non-manual reset logic takes pending nodes into account.
+     *
+     * <p>It goes like this:
+     * <ul>
+     *     <li>We have 6 nodes and a partition on nodes 1, 4 and 5.</li>
+     *     <li>We stop node 5, so a rebalance to (1, 3, 4) is triggered, but it is blocked and cannot finish.</li>
+     *     <li>The zone's scale down is set to an infinite value; we stop node 4, so no new rebalance is triggered and
+     *     the majority is lost.</li>
+     *     <li>We execute "resetPartitions" and expect the pending assignments to be (1, 3), so node 3 from pending is
+     *     present.</li>
+     * </ul>
+     */
+    @Test
+    @ZoneParams(nodes = 6, replicas = 3, partitions = 1)
+    public void testIncompleteRebalanceBeforeAutomaticResetPartitions() throws Exception {
+        int partId = 0;
+
+        IgniteImpl node0 = unwrapIgniteImpl(cluster.node(0));
+
+        int catalogVersion = node0.catalogManager().latestCatalogVersion();
+        long timestamp = node0.catalogManager().catalog(catalogVersion).time();
+
+        Table table = node0.tables().table(TABLE_NAME);
+
+        awaitPrimaryReplica(node0, partId);
+        assertRealAssignments(node0, partId, 1, 4, 5);
+
+        insertValues(table, partId, 0);
+
+        Assignments assignment134 = Assignments.of(timestamp,
+                Assignment.forPeer(node(1).name()),
+                Assignment.forPeer(node(3).name()),
+                Assignment.forPeer(node(4).name())
+        );
+
+        cluster.runningNodes().map(TestWrappers::unwrapIgniteImpl).forEach(node -> {
+            BiPredicate<String, NetworkMessage> newPredicate = (nodeName, msg) -> stableKeySwitchMessage(msg, partId, assignment134);
+            BiPredicate<String, NetworkMessage> oldPredicate = node.dropMessagesPredicate();

Review Comment:
   refactored
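
The refactoring itself is not visible here, because the hunk is truncated at the commented line. One plausible shape for it, chaining the new predicate onto whatever filter a node already has, is sketched below; `dropMessages` is assumed to be the setter counterpart of the `dropMessagesPredicate()` getter shown above and is not confirmed by this diff:

    cluster.runningNodes().map(TestWrappers::unwrapIgniteImpl).forEach(node -> {
        BiPredicate<String, NetworkMessage> newPredicate =
                (nodeName, msg) -> stableKeySwitchMessage(msg, partId, assignment134);

        // Preserve a previously installed drop filter, if any, instead of overwriting it.
        BiPredicate<String, NetworkMessage> oldPredicate = node.dropMessagesPredicate();

        node.dropMessages(oldPredicate == null ? newPredicate : oldPredicate.or(newPredicate));
    });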