alievmirza commented on code in PR #4686:
URL: https://github.com/apache/ignite-3/pull/4686#discussion_r1833184562

##########
modules/table/src/integrationTest/java/org/apache/ignite/internal/disaster/ItDisasterRecoveryReconfigurationTest.java:
##########
@@ -434,6 +443,164 @@ public void testIncompleteRebalanceAfterResetPartitions() throws Exception {
         });
     }
 
+    /**
+     * Tests that, in the situation from {@link #testInsertFailsIfMajorityIsLost()}, it is possible to recover the partition
+     * using the disaster recovery API, but with the manual flag set to {@code false}. We expect that in this case the
+     * replica factor won't be restored. In this test, assignments will be (1, 3, 4), according to
+     * {@link RendezvousDistributionFunction}.
+     */
+    @Test
+    @ZoneParams(nodes = 5, replicas = 3, partitions = 1)
+    void testAutomaticRebalanceIfMajorityIsLost() throws Exception {
+        int partId = 0;
+
+        IgniteImpl node0 = unwrapIgniteImpl(cluster.node(0));
+        Table table = node0.tables().table(TABLE_NAME);
+
+        awaitPrimaryReplica(node0, partId);
+
+        assertRealAssignments(node0, partId, 1, 3, 4);
+
+        stopNodesInParallel(3, 4);
+
+        waitForScale(node0, 3);
+
+        CompletableFuture<?> updateFuture = node0.disasterRecoveryManager().resetPartitions(
+                zoneName,
+                QUALIFIED_TABLE_NAME,
+                Set.of(),
+                false
+        );
+
+        assertThat(updateFuture, willSucceedIn(60, SECONDS));
+
+        awaitPrimaryReplica(node0, partId);
+
+        assertRealAssignments(node0, partId, 1);
+
+        List<Throwable> errors = insertValues(table, partId, 0);
+        assertThat(errors, is(empty()));
+
+        // Check that there is no ongoing or planned rebalance.
+        assertNull(getPendingAssignments(node0, partId));
+
+        assertRealAssignments(node0, partId, 1);
+    }
+
+    /**
+     * Tests a scenario where there's a single partition on node 1, and the node that hosts it is lost. A non-manual reset
+     * should do nothing in that case, so no new pending or planned assignments appear.
+     */
+    @Test
+    @ZoneParams(nodes = 2, replicas = 1, partitions = 1)
+    void testAutomaticRebalanceIfPartitionIsLost() throws Exception {
+        int partId = 0;
+
+        IgniteImpl node0 = unwrapIgniteImpl(cluster.node(0));
+
+        IgniteImpl node1 = unwrapIgniteImpl(cluster.node(1));
+
+        executeSql(format("ALTER ZONE %s SET data_nodes_auto_adjust_scale_down=%d", zoneName, INFINITE_TIMER_VALUE));
+
+        awaitPrimaryReplica(node0, partId);
+
+        assertRealAssignments(node0, partId, 1);
+
+        stopNodesInParallel(1);
+
+        CompletableFuture<?> updateFuture = node0.disasterRecoveryManager().resetPartitions(
+                zoneName,
+                QUALIFIED_TABLE_NAME,
+                Set.of(),
+                false
+        );
+
+        assertThat(updateFuture, willCompleteSuccessfully());
+
+        // This is needed to be sure that all previous meta storage events are handled.
+        executeSql(format("CREATE ZONE %s with storage_profiles='%s'",
+                "FAKE_ZONE", DEFAULT_STORAGE_PROFILE
+        ));
+
+        // Check that there is no ongoing or planned rebalance.

Review Comment:
   done
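
The behavioral difference under review is carried entirely by the last argument of `resetPartitions`: `true` requests a manual reset, `false` the automatic one exercised by these tests. A minimal sketch of the two call shapes, reusing the signature from the diff above; the comments paraphrase the test's Javadoc, not the API documentation:

    // Manual reset: per the Javadoc above, the variant that may restore
    // the replica factor after a majority loss.
    CompletableFuture<?> manualReset = node0.disasterRecoveryManager().resetPartitions(
            zoneName,
            QUALIFIED_TABLE_NAME,
            Set.of(),   // no specific partitions listed, as in the tests above
            true
    );

    // Non-manual (automatic) reset: the replica factor is not expected to be restored.
    CompletableFuture<?> automaticReset = node0.disasterRecoveryManager().resetPartitions(
            zoneName,
            QUALIFIED_TABLE_NAME,
            Set.of(),
            false
    );
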
##########
modules/table/src/integrationTest/java/org/apache/ignite/internal/disaster/ItDisasterRecoveryReconfigurationTest.java:
##########
@@ -434,6 +443,164 @@ public void testIncompleteRebalanceAfterResetPartitions() throws Exception {
         });
     }
 
+    /**
+     * Tests that, in the situation from {@link #testInsertFailsIfMajorityIsLost()}, it is possible to recover the partition
+     * using the disaster recovery API, but with the manual flag set to {@code false}. We expect that in this case the
+     * replica factor won't be restored. In this test, assignments will be (1, 3, 4), according to
+     * {@link RendezvousDistributionFunction}.
+     */
+    @Test
+    @ZoneParams(nodes = 5, replicas = 3, partitions = 1)
+    void testAutomaticRebalanceIfMajorityIsLost() throws Exception {
+        int partId = 0;
+
+        IgniteImpl node0 = unwrapIgniteImpl(cluster.node(0));
+        Table table = node0.tables().table(TABLE_NAME);
+
+        awaitPrimaryReplica(node0, partId);
+
+        assertRealAssignments(node0, partId, 1, 3, 4);
+
+        stopNodesInParallel(3, 4);
+
+        waitForScale(node0, 3);
+
+        CompletableFuture<?> updateFuture = node0.disasterRecoveryManager().resetPartitions(
+                zoneName,
+                QUALIFIED_TABLE_NAME,
+                Set.of(),
+                false
+        );
+
+        assertThat(updateFuture, willSucceedIn(60, SECONDS));
+
+        awaitPrimaryReplica(node0, partId);
+
+        assertRealAssignments(node0, partId, 1);
+
+        List<Throwable> errors = insertValues(table, partId, 0);
+        assertThat(errors, is(empty()));
+
+        // Check that there is no ongoing or planned rebalance.
+        assertNull(getPendingAssignments(node0, partId));
+
+        assertRealAssignments(node0, partId, 1);
+    }
+
+    /**
+     * Tests a scenario where there's a single partition on node 1, and the node that hosts it is lost. A non-manual reset
+     * should do nothing in that case, so no new pending or planned assignments appear.
+     */
+    @Test
+    @ZoneParams(nodes = 2, replicas = 1, partitions = 1)
+    void testAutomaticRebalanceIfPartitionIsLost() throws Exception {
+        int partId = 0;
+
+        IgniteImpl node0 = unwrapIgniteImpl(cluster.node(0));
+
+        IgniteImpl node1 = unwrapIgniteImpl(cluster.node(1));
+
+        executeSql(format("ALTER ZONE %s SET data_nodes_auto_adjust_scale_down=%d", zoneName, INFINITE_TIMER_VALUE));
+
+        awaitPrimaryReplica(node0, partId);
+
+        assertRealAssignments(node0, partId, 1);
+
+        stopNodesInParallel(1);
+
+        CompletableFuture<?> updateFuture = node0.disasterRecoveryManager().resetPartitions(
+                zoneName,
+                QUALIFIED_TABLE_NAME,
+                Set.of(),
+                false
+        );
+
+        assertThat(updateFuture, willCompleteSuccessfully());
+
+        // This is needed to be sure that all previous meta storage events are handled.
+        executeSql(format("CREATE ZONE %s with storage_profiles='%s'",
+                "FAKE_ZONE", DEFAULT_STORAGE_PROFILE
+        ));
+
+        // Check that there is no ongoing or planned rebalance.
+        assertNull(getPendingAssignments(node0, partId));
+
+        assertEquals(1, getStableAssignments(node0, partId).nodes().size());
+
+        assertEquals(node1.name(), getStableAssignments(node0, partId).nodes().stream().findFirst().get().consistentId());
+    }
+
+    /**
+     * Tests a scenario where only one node from the stable assignments is left, but there is a node in the pending
+     * assignments, and we perform the reset partition operation. We expect the node from pending to be present after
+     * the reset, i.e. the non-manual reset logic takes pending nodes into account.
+     *
+     * <p>It goes like this:
+     * <ul>
+     *     <li>We have 6 nodes and a partition on nodes 1, 4 and 5.</li>
+     *     <li>We stop node 5, so a rebalance to (1, 3, 4) is triggered, but it is blocked and cannot finish.</li>
+     *     <li>The zone's scale down is set to an infinite value; we stop node 4, so no new rebalance is triggered and
+     *     the majority is lost.</li>
+     *     <li>We execute "resetPartitions" and expect the pending assignments to be (1, 3), so node 3 from pending is
+     *     present.</li>
+     * </ul>
+     */
+    @Test
+    @ZoneParams(nodes = 6, replicas = 3, partitions = 1)
+    public void testIncompleteRebalanceBeforeAutomaticResetPartitions() throws Exception {
+        int partId = 0;
+
+        IgniteImpl node0 = unwrapIgniteImpl(cluster.node(0));
+
+        int catalogVersion = node0.catalogManager().latestCatalogVersion();
+        long timestamp = node0.catalogManager().catalog(catalogVersion).time();
+
+        Table table = node0.tables().table(TABLE_NAME);
+
+        awaitPrimaryReplica(node0, partId);
+        assertRealAssignments(node0, partId, 1, 4, 5);
+
+        insertValues(table, partId, 0);
+
+        Assignments assignment134 = Assignments.of(timestamp,
+                Assignment.forPeer(node(1).name()),
+                Assignment.forPeer(node(3).name()),
+                Assignment.forPeer(node(4).name())
+        );
+
+        cluster.runningNodes().map(TestWrappers::unwrapIgniteImpl).forEach(node -> {
+            BiPredicate<String, NetworkMessage> newPredicate = (nodeName, msg) -> stableKeySwitchMessage(msg, partId, assignment134);
+            BiPredicate<String, NetworkMessage> oldPredicate = node.dropMessagesPredicate();

Review Comment:
   refactored
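
The refactoring itself is not visible here, because the hunk is truncated at the commented line. One plausible shape for it, chaining the new predicate onto whatever filter a node already has, is sketched below; `dropMessages` is assumed to be the setter counterpart of the `dropMessagesPredicate()` getter shown above and is not confirmed by this diff:

    cluster.runningNodes().map(TestWrappers::unwrapIgniteImpl).forEach(node -> {
        BiPredicate<String, NetworkMessage> newPredicate =
                (nodeName, msg) -> stableKeySwitchMessage(msg, partId, assignment134);

        // Preserve a previously installed drop filter, if any, instead of overwriting it.
        BiPredicate<String, NetworkMessage> oldPredicate = node.dropMessagesPredicate();

        node.dropMessages(oldPredicate == null ? newPredicate : oldPredicate.or(newPredicate));
    });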