peterxcli commented on code in PR #8386:
URL: https://github.com/apache/ozone/pull/8386#discussion_r2072762765


##########
hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/HealthyPipelineSafeModeRule.java:
##########
@@ -122,21 +131,65 @@ protected synchronized boolean validate() {
 
   @Override
   protected synchronized void process(Pipeline pipeline) {
+    Preconditions.checkNotNull(pipeline);
 
     // When SCM is in safe mode for long time, already registered
     // datanode can send pipeline report again, or SCMPipelineManager will
     // create new pipelines.
-    Preconditions.checkNotNull(pipeline);
-    if (pipeline.getType() == HddsProtos.ReplicationType.RATIS &&
-        ((RatisReplicationConfig) pipeline.getReplicationConfig())
-            .getReplicationFactor() == HddsProtos.ReplicationFactor.THREE &&
-        !processedPipelineIDs.contains(pipeline.getId())) {
-      getSafeModeMetrics().incCurrentHealthyPipelinesCount();
-      currentHealthyPipelineCount++;
-      processedPipelineIDs.add(pipeline.getId());
-      unProcessedPipelineSet.remove(pipeline.getId());
+
+    // Only handle RATIS + 3-replica pipelines.
+    if (pipeline.getType() != HddsProtos.ReplicationType.RATIS ||
+        ((RatisReplicationConfig) 
pipeline.getReplicationConfig()).getReplicationFactor() !=
+            HddsProtos.ReplicationFactor.THREE) {
+      SCMSafeModeManager.getLogger().warn(
+          "Skipping pipeline safemode report processing as Replication type 
isn't RATIS " +
+              "or replication factor isn't 3.");
+      return;
+    }
+
+    // Skip already processed ones.
+    if (processedPipelineIDs.contains(pipeline.getId())) {
+      LOG.info("Skipping pipeline safemode report processing check as 
pipeline: {} is already recorded.",
+          pipeline.getId());
+      return;
+    }
+
+    List<DatanodeDetails> pipelineDns = pipeline.getNodes();
+    if (pipelineDns.size() != 3) {
+      LOG.warn("Only {} DNs reported this pipeline: {}, all 3 DNs should 
report the pipeline", pipelineDns.size(),
+          pipeline.getId());
+      return;
     }
 
+    Map<DatanodeDetails, String> badDnsWithReasons = new LinkedHashMap<>();
+
+    for (DatanodeDetails dn : pipelineDns) {
+      try {
+        NodeStatus status = nodeManager.getNodeStatus(dn);
+        if (!status.equals(NodeStatus.inServiceHealthy())) {
+          String reason = String.format("Health: %s, Operational State: %s",
+              status.getHealth(), status.getOperationalState());
+          badDnsWithReasons.put(dn, reason);
+        }
+      } catch (NodeNotFoundException e) {
+        badDnsWithReasons.put(dn, "DN not registered with SCM");
+      }
+    }
+
+    if (!badDnsWithReasons.isEmpty()) {
+      LOG.warn("Below DNs reported by Pipeline: {} are either in bad health or 
un-registered with SCMs",
+          pipeline.getId());
+      for (Map.Entry<DatanodeDetails, String> entry : 
badDnsWithReasons.entrySet()) {
+        LOG.warn("DN {}: {}", entry.getKey().getID(), entry.getValue());
+      }
+      return;
+    }

Review Comment:
   Could we log all the bad DNs and their reasons in a single line, instead of emitting one warning per DN?
   ```suggestion
       if (!badDnsWithReasons.isEmpty()) {
         String badDnSummary = badDnsWithReasons.entrySet().stream()
             .map(entry -> String.format("DN %s: %s", entry.getKey().getID(), entry.getValue()))
             .collect(Collectors.joining("; "));
         LOG.warn("Below DNs reported by Pipeline: {} are either in bad health or un-registered with SCMs. Details: {}",
             pipeline.getId(), badDnSummary);
         return;
       }
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@ozone.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@ozone.apache.org
For additional commands, e-mail: issues-h...@ozone.apache.org

Reply via email to