Skip to content

Commit

Permalink
HDDS-11243. SCM SafeModeRule Support EC.
Browse files Browse the repository at this point in the history
  • Loading branch information
slfan1989 committed Oct 3, 2024
1 parent 57f59f9 commit 062dcc3
Show file tree
Hide file tree
Showing 6 changed files with 86 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ public final class HddsConfigKeys {

public static final String HDDS_SCM_SAFEMODE_REPORTED_DATANODE_PCT =
"hdds.scm.safemode.reported.datanode.pct";
public static final double HDDS_SCM_SAFEMODE_REPORTED_DATANODE_PCT_DEFAULT = 0.90;
public static final double HDDS_SCM_SAFEMODE_REPORTED_DATANODE_PCT_DEFAULT = 0.10;

// This configuration setting is used as a fallback location by all
// Ozone/HDDS services for their metadata. It is useful as a single
Expand Down
2 changes: 1 addition & 1 deletion hadoop-hdds/common/src/main/resources/ozone-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1697,7 +1697,7 @@

<property>
<name>hdds.scm.safemode.reported.datanode.pct</name>
<value>0.90</value>
<value>0.10</value>
<tag>HDDS,SCM,OPERATION</tag>
<description>
Percentage of successfully reported datanodes.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -237,29 +237,56 @@ protected synchronized void cleanup() {

/**
 * Builds the human-readable status text for this rule: a progress line for
 * Ratis containers and a progress line for EC containers, each optionally
 * followed by a sample of container IDs that do not yet satisfy the rule.
 *
 * NOTE(review): this view shows removed and added diff lines together, so
 * some statements below appear twice in an older and a newer form.
 *
 * @return the status text; lines are joined with '\n'.
 */
@Override
public String getStatusText() {
List<Long> sampleContainers = ratisContainerMap.keySet()
.stream()
.limit(SAMPLE_CONTAINER_DISPLAY_LIMIT)
.collect(Collectors.toList());

String status = String.format(
// Ratis container progress line.
String status = String.format(
"%1.2f%% of [Ratis] Containers(%s / %s) with at least one reported replica (=%1.2f) >= " +
"safeModeCutoff (=%1.2f);" +
"%1.2f%% of [EC] Containers(%s / %s) with at least N reported replica (=%1.2f) >= " +
"safeModeCutoff (=%1.2f)",
"safeModeCutoff (=%1.2f);",
// NOTE(review): if getRatisMaxContainer() can be 0 this ratio is not a
// finite number and is printed as such -- confirm.
(ratisContainerWithMinReplicas.doubleValue() / getRatisMaxContainer()) * 100,
ratisContainerWithMinReplicas, (long) getRatisMaxContainer(),
getCurrentContainerThreshold(), this.safeModeCutoff,
(ecContainerWithMinReplicas.doubleValue() / getEcMaxContainer()) * 100,
ecContainerWithMinReplicas, (long) getEcMaxContainer(),
getCurrentECContainerThreshold(), this.safeModeCutoff);
getCurrentContainerThreshold(), this.safeModeCutoff);

// Pick at most SAMPLE_CONTAINER_DISPLAY_LIMIT Ratis container IDs whose
// associated set in ratisContainerDNsMap is still empty.
Set<Long> sampleRatisContainers = ratisContainerDNsMap.entrySet().stream().
filter(entry -> entry.getValue().isEmpty()).
map(Map.Entry::getKey).
limit(SAMPLE_CONTAINER_DISPLAY_LIMIT).
collect(Collectors.toSet());

if (!sampleContainers.isEmpty()) {
if (!sampleRatisContainers.isEmpty()) {
String sampleContainerText =
"Sample containers not satisfying the criteria : " + sampleContainers;
"Sample Ratis Containers not satisfying the criteria : " + sampleRatisContainers + ";";
status = status.concat("\n").concat(sampleContainerText);
}

// EC container progress line, appended on its own line.
String ecStatus = String.format(
"%1.2f%% of [EC] Containers(%s / %s) with at least N reported replica (=%1.2f) >= " +
"safeModeCutoff (=%1.2f);",
// NOTE(review): same zero-denominator question as the Ratis ratio above.
(ecContainerWithMinReplicas.doubleValue() / getEcMaxContainer()) * 100,
ecContainerWithMinReplicas, (long) getEcMaxContainer(),
getCurrentECContainerThreshold(), this.safeModeCutoff);
status = status.concat("\n").concat(ecStatus);

// Pick at most SAMPLE_CONTAINER_DISPLAY_LIMIT EC container IDs whose set in
// ecContainerDNsMap (presumably the reporting datanode UUIDs -- confirm) is
// smaller than the value returned by getMinReplica for that container.
Set<Long> sampleEcContainers = ecContainerDNsMap.entrySet().stream().
filter(entry -> {
Long containerId = entry.getKey();
int minReplica = getMinReplica(containerId, Boolean.TRUE);
Set<UUID> allReplicas = entry.getValue();
if (allReplicas.size() >= minReplica) {
return false;
}
return true;
}).
map(Map.Entry::getKey).
limit(SAMPLE_CONTAINER_DISPLAY_LIMIT).
collect(Collectors.toSet());

if (!sampleEcContainers.isEmpty()) {
String sampleECContainerText =
"Sample EC Containers not satisfying the criteria : " + sampleEcContainers + ";";
status = status.concat("\n").concat(sampleECContainerText);
}

return status;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,13 @@
*/
package org.apache.hadoop.hdds.scm.safemode;

import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Map;
import java.util.UUID;
import java.util.Set;
import java.util.HashSet;
import java.util.HashMap;
import java.util.stream.Collectors;

import com.google.common.base.Preconditions;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
Expand Down Expand Up @@ -53,6 +56,7 @@ public class DataNodeSafeModeRule extends
private HashSet<UUID> registeredDnSet;
private PipelineManager pipelineManager;
private Set<UUID> pipeLineDnSet = new HashSet<>();
private Map<UUID, String> unRegisteredDn = new HashMap<>();
private final double dnReportedPercent;

public DataNodeSafeModeRule(String ruleName, EventQueue eventQueue,
Expand Down Expand Up @@ -90,15 +94,20 @@ protected boolean validate() {
/**
 * Handles a datanode registration report: records the datanode's UUID as
 * registered, removes it from the unregistered map, and logs the current
 * progress while SCM is in safe mode.
 *
 * NOTE(review): this view shows removed and added diff lines together, so
 * some statements below appear in both an older and a newer form.
 *
 * @param reportsProto the registration report; only its datanode UUID is
 *                     read here.
 */
protected void process(NodeRegistrationContainerReport reportsProto) {
UUID dnUUID = reportsProto.getDatanodeDetails().getUuid();

// The same DN may be reported more than once. When the DN belongs to
// `pipeLineDnSet` or is not yet in `registeredDnSet`, add its UUID to
// `registeredDnSet` (a set, so repeated adds do not change the count),
// refresh the registered count, and drop it from the unregistered map.
if (pipeLineDnSet.contains(dnUUID) || !registeredDnSet.contains(dnUUID)) {
registeredDnSet.add(reportsProto.getDatanodeDetails().getUuid());
registeredDnSet.add(dnUUID);
registeredDns = registeredDnSet.size();
unRegisteredDn.remove(dnUUID);
}

// Log registration progress only while SCM is still in safe mode.
if (scmInSafeMode()) {
SCMSafeModeManager.getLogger().info(
"SCM in safe mode. {} DataNodes registered, {} required.",
registeredDns, requiredDns);
SCMSafeModeManager.getLogger().debug(
"SCM in safe mode. {} DataNodes registered, {} required.", registeredDns, requiredDns);
}
}

Expand All @@ -109,9 +118,24 @@ protected void cleanup() {

/**
 * Builds the status text for the datanode rule: the registered, required and
 * total (pipeline) datanode counts, optionally followed on a new line by a
 * sample of host names of datanodes that have not registered yet.
 *
 * NOTE(review): this view shows removed and added diff lines together, so
 * the first return statement is the older form of the method body.
 *
 * @return the status text.
 */
@Override
public String getStatusText() {
return String
.format("Registered DataNodes (=%d) >= Required DataNodes (=%d) / Total DataNode (%d) ",
this.registeredDns, this.requiredDns, this.pipeLineDnSet.size());

String status = String
.format("Registered DataNodes (=%d) >= Required DataNodes (=%d) / Total DataNode (%d); ",
this.registeredDns, this.requiredDns, this.pipeLineDnSet.size());

// Take at most SAMPLE_DN_DISPLAY_LIMIT values (host names, as stored by
// the code that fills unRegisteredDn) from the unregistered map.
List<String> unRegisteredDnHostNames = unRegisteredDn.values()
.stream()
.limit(SAMPLE_DN_DISPLAY_LIMIT)
.collect(Collectors.toList());

// Append the sample on its own line only when there is something to show.
if (!unRegisteredDnHostNames.isEmpty()) {
String sampleDNText = "Unregistered DN : " + unRegisteredDnHostNames;
status = status.concat("\n").concat(sampleDNText);
}

return status;
}


Expand All @@ -127,15 +151,21 @@ public void refresh(boolean forceRefresh) {
}

private void initializeRule(boolean refresh) {
// We will attempt to retrieve the entire DN list here,
// as the Pipeline only exists within the list of active DNs.
if (pipelineManager != null) {
List<Pipeline> pipelines = pipelineManager.getPipelines();
pipelines.forEach(pipeline -> {
List<DatanodeDetails> nodes = pipeline.getNodes();
for (DatanodeDetails node : nodes) {
pipeLineDnSet.add(node.getUuid());
unRegisteredDn.put(node.getUuid(), node.getHostName());
}
});
requiredDns = (int) Math.ceil(dnReportedPercent * pipeLineDnSet.size());
int tmpRequiredDns = (int) Math.ceil(dnReportedPercent * pipeLineDnSet.size());
if (tmpRequiredDns > requiredDns) {
requiredDns = tmpRequiredDns;
}
}

String totalDataNode = pipeLineDnSet.size() > 0 ?
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ public abstract class SafeModeExitRule<T> implements EventHandler<T> {
private final String ruleName;
protected static final int SAMPLE_CONTAINER_DISPLAY_LIMIT = 5;
protected static final int SAMPLE_PIPELINE_DISPLAY_LIMIT = 5;
protected static final int SAMPLE_DN_DISPLAY_LIMIT = 5;

public SafeModeExitRule(SCMSafeModeManager safeModeManager,
String ruleName, EventQueue eventQueue) {
Expand Down
4 changes: 2 additions & 2 deletions hadoop-hdds/server-scm/src/main/resources/webapps/scm/scm.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
require: {
overview: "^overview"
},
controller: function ($http,$scope) {
controller: function ($http,$scope,$sce) {
var ctrl = this;
$scope.reverse = false;
$scope.columnName = "hostname";
Expand Down Expand Up @@ -142,7 +142,7 @@

$scope.formatValue = function(value) {
if (value && value.includes(';')) {
return $sce.trustAsHtml(value.replace(';', '<br>'));
return $sce.trustAsHtml(value.replace('/;/g', '<br>'));
} else {
return $sce.trustAsHtml(value);
}
Expand Down

0 comments on commit 062dcc3

Please sign in to comment.