ayushtkn commented on code in PR #5349:
URL: https://github.com/apache/hadoop/pull/5349#discussion_r1096494121


##########
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java:
##########
@@ -3832,7 +3837,38 @@ public boolean isDatanodeFullyStarted() {
     }
     return true;
   }
-  
+
+  /**
+   * Wait for the datanode to be fully started and also connected to active 
namenode. This means
+   * wait until the given time duration for all the BP threads to come alive 
and all the block
+   * pools to be initialized. Wait until any one of the BP service actor is 
connected to active
+   * namenode.
+   *
+   * @param waitTimeMs Wait time in millis for this method to return the 
datanode probes. If
+   * datanode stays unhealthy or not connected to any active namenode even 
after the given wait
+   * time elapses, it returns false.
+   * @return true - if the data node is fully started and connected to active 
namenode within
+   * the given time internal, false otherwise.
+   */
+  public boolean isDatanodeHealthy(long waitTimeMs) {
+    long startTime = monotonicNow();
+    while (monotonicNow() - startTime <= waitTimeMs) {
+      if (isDatanodeFullyStartedAndConnectedToActiveNN()) {
+        return true;
+      }
+    }
+    return false;
+  }

Review Comment:
   This waiting is test logic, we should keep it in ``MiniDfsCluster`` only and 
can refactor/reuse existing methods as well. Can you trying changing like this:
   ```
   diff --git 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
   index 414ab579dd0..ad0f8e8b03e 100644
   --- 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
   +++ 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
   @@ -3830,39 +3830,12 @@ boolean isRestarting() {
       * @return true - if the data node is fully started
       */
      public boolean isDatanodeFullyStarted() {
   -    for (BPOfferService bp : blockPoolManager.getAllNamenodeThreads()) {
   -      if (!bp.isInitialized() || !bp.isAlive()) {
   -        return false;
   -      }
   -    }
   -    return true;
   -  }
   -
   -  /**
   -   * Wait for the datanode to be fully started and also connected to active 
namenode. This means
   -   * wait until the given time duration for all the BP threads to come 
alive and all the block
   -   * pools to be initialized. Wait until any one of the BP service actor is 
connected to active
   -   * namenode.
   -   *
   -   * @param waitTimeMs Wait time in millis for this method to return the 
datanode probes. If
   -   * datanode stays unhealthy or not connected to any active namenode even 
after the given wait
   -   * time elapses, it returns false.
   -   * @return true - if the data node is fully started and connected to 
active namenode within
   -   * the given time internal, false otherwise.
   -   */
   -  public boolean isDatanodeHealthy(long waitTimeMs) {
   -    long startTime = monotonicNow();
   -    while (monotonicNow() - startTime <= waitTimeMs) {
   -      if (isDatanodeFullyStartedAndConnectedToActiveNN()) {
   -        return true;
   -      }
   -    }
   -    return false;
   +    return isDatanodeFullyStarted(false);
      }
    
   -  private boolean isDatanodeFullyStartedAndConnectedToActiveNN() {
   +  public boolean isDatanodeFullyStarted(boolean checkConnectionToActive) {
        for (BPOfferService bp : blockPoolManager.getAllNamenodeThreads()) {
   -      if (!bp.isInitialized() || !bp.isAlive() || bp.getActiveNN() == null) 
{
   +      if (!bp.isInitialized() || !bp.isAlive() || (checkConnectionToActive 
&& bp.getActiveNN() == null)) {
            return false;
          }
        }
   diff --git 
a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
 
b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
   index dd8bb204382..0576b4a42e1 100644
   --- 
a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
   +++ 
b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
   @@ -2529,6 +2529,11 @@ public boolean restartDataNode(DataNodeProperties 
dnprop) throws IOException {
        return restartDataNode(dnprop, false);
      }
    
   +  public void waitDatanodeConnectedToActive(DataNode dn, int timeout) 
throws InterruptedException, TimeoutException {
   +    GenericTestUtils.waitFor(() -> dn.isDatanodeFullyStarted(true), 100, 
timeout,
   +        "Datanode is not connected to active even after " + timeout + " ms 
of waiting");
   +  }
   +
      public void waitDatanodeFullyStarted(DataNode dn, int timeout)
          throws TimeoutException, InterruptedException {
        GenericTestUtils.waitFor(dn::isDatanodeFullyStarted, 100, timeout,
   diff --git 
a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMXBean.java
 
b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMXBean.java
   index 34fc3558f73..ad4c892b22f 100644
   --- 
a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMXBean.java
   +++ 
b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMXBean.java
   @@ -35,7 +35,6 @@
    import org.apache.hadoop.fs.CommonConfigurationKeys;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
   -import org.apache.hadoop.ha.HAServiceProtocol;
    import org.apache.hadoop.hdfs.DFSConfigKeys;
    import org.apache.hadoop.hdfs.DFSTestUtil;
    import org.apache.hadoop.hdfs.MiniDFSCluster;
   @@ -317,8 +316,7 @@ public void testDataNodeMXBeanLastHeartbeats() throws 
Exception {
    
          // Verify and wait until one of the BP service actor identifies 
active namenode as active
          // and another as standby.
   -      Assert.assertTrue("Datanode could not be connected to active namenode 
in 5s",
   -          datanode.isDatanodeHealthy(5000));
   +      cluster.waitDatanodeConnectedToActive(datanode, 5000);
    
          // Verify that last heartbeat sent to both namenodes in last 5 sec.
          assertLastHeartbeatSentTime(datanode, "LastHeartbeat");
   
   ```
   
   Can add Javadoc or refactor better, just for the idea sake



##########
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMXBean.java:
##########
@@ -294,4 +297,107 @@ public void testDataNodeMXBeanSlowDisksEnabled() throws 
Exception {
       if (cluster != null) {cluster.shutdown();}
     }
   }
+
+  @Test
+  public void testDataNodeMXBeanLastHeartbeats() throws Exception {
+    Configuration conf = new Configuration();
+    try (MiniDFSCluster cluster = new MiniDFSCluster
+        .Builder(conf)
+        .nnTopology(MiniDFSNNTopology.simpleHATopology(2))
+        .numDataNodes(1)
+        .build()) {
+      cluster.waitActive();
+      cluster.transitionToActive(0);
+      cluster.transitionToStandby(1);
+
+      DataNode datanode = cluster.getDataNodes().get(0);
+
+      MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
+      ObjectName mxbeanName = new ObjectName(
+          "Hadoop:service=DataNode,name=DataNodeInfo");
+
+      // Verify and wait until one of the BP service actor identifies active 
namenode as active
+      // and another as standby.
+      GenericTestUtils.waitFor(() -> {
+        List<Map<String, String>> bpServiceActorInfo = 
datanode.getBPServiceActorInfoMap();
+        Map<String, String> bpServiceActorInfo1 = bpServiceActorInfo.get(0);
+        Map<String, String> bpServiceActorInfo2 = bpServiceActorInfo.get(1);
+        return (HAServiceProtocol.HAServiceState.ACTIVE.toString()
+            .equals(bpServiceActorInfo1.get("NamenodeHaState"))
+            && HAServiceProtocol.HAServiceState.STANDBY.toString()
+            .equals(bpServiceActorInfo2.get("NamenodeHaState")))
+            || (HAServiceProtocol.HAServiceState.ACTIVE.toString()
+            .equals(bpServiceActorInfo2.get("NamenodeHaState"))
+            && HAServiceProtocol.HAServiceState.STANDBY.toString()
+            .equals(bpServiceActorInfo1.get("NamenodeHaState")));
+      },
+          500,
+          8000,
+          "No namenode is reported active");
+
+      // basic metrics validation
+      String clusterId = (String) mbs.getAttribute(mxbeanName, "ClusterId");
+      Assert.assertEquals(datanode.getClusterId(), clusterId);
+      String version = (String)mbs.getAttribute(mxbeanName, "Version");
+      Assert.assertEquals(datanode.getVersion(),version);
+      String bpActorInfo = (String) mbs.getAttribute(mxbeanName, 
"BPServiceActorInfo");
+      Assert.assertEquals(datanode.getBPServiceActorInfo(), bpActorInfo);
+
+      // Verify that last heartbeat sent to both namenodes in last 5 sec.
+      assertLastHeartbeatSentTime(datanode, "LastHeartbeat");
+      // Verify that last heartbeat response from both namenodes have been 
received within
+      // last 5 sec.
+      assertLastHeartbeatSentTime(datanode, "LastHeartbeatResponseTime");
+
+
+      NameNode sbNameNode = cluster.getNameNode(1);
+
+      // Stopping standby namenode
+      sbNameNode.stop();
+
+      // Verify that last heartbeat response time from one of the namenodes 
would stay much higher
+      // after stopping one namenode.
+      GenericTestUtils.waitFor(() -> {
+        List<Map<String, String>> bpServiceActorInfo = 
datanode.getBPServiceActorInfoMap();
+        Map<String, String> bpServiceActorInfo1 = bpServiceActorInfo.get(0);
+        Map<String, String> bpServiceActorInfo2 = bpServiceActorInfo.get(1);
+
+        long lastHeartbeatResponseTime1 =
+            
Long.parseLong(bpServiceActorInfo1.get("LastHeartbeatResponseTime"));
+        long lastHeartbeatResponseTime2 =
+            
Long.parseLong(bpServiceActorInfo2.get("LastHeartbeatResponseTime"));
+
+        LOG.info("Last heartbeat response from namenode 1: {}", 
lastHeartbeatResponseTime1);
+        LOG.info("Last heartbeat response from namenode 2: {}", 
lastHeartbeatResponseTime2);
+
+        return (lastHeartbeatResponseTime1 < 5L && lastHeartbeatResponseTime2 
> 5L) || (
+            lastHeartbeatResponseTime1 > 5L && lastHeartbeatResponseTime2 < 
5L);
+
+      },
+          200,
+          15000,
+          "Last heartbeat response should be higher than 5s for at least one 
namenode");
+
+      // Verify that last heartbeat sent to both namenodes in last 5 sec even 
though
+      // the last heartbeat received from one of the namenodes is greater than 
5 sec ago.
+      assertLastHeartbeatSentTime(datanode, "LastHeartbeat");
+    }
+  }
+
+  private static void assertLastHeartbeatSentTime(DataNode datanode, String 
lastHeartbeat) {
+    List<Map<String, String>> bpServiceActorInfo = 
datanode.getBPServiceActorInfoMap();
+    Map<String, String> bpServiceActorInfo1 = bpServiceActorInfo.get(0);
+    Map<String, String> bpServiceActorInfo2 = bpServiceActorInfo.get(1);
+
+    long lastHeartbeatSent1 =
+        Long.parseLong(bpServiceActorInfo1.get(lastHeartbeat));
+    long lastHeartbeatSent2 =
+        Long.parseLong(bpServiceActorInfo2.get(lastHeartbeat));
+
+    Assert.assertTrue(lastHeartbeat + " for first bp service actor is higher 
than 5s",
+        lastHeartbeatSent1 < 5L);
+    Assert.assertTrue(lastHeartbeat + " for second bp service actor is higher 
than 5s",
+        lastHeartbeatSent2 < 5L);

Review Comment:
   Should be good then, can revisit if it does create some noise



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to