Skip to content

Commit

Permalink
HDDS-11290. Container scanner should keep scanning after non-fatal er…
Browse files Browse the repository at this point in the history
…rors (#7127)
  • Loading branch information
errose28 authored Nov 4, 2024
1 parent 04c196c commit 445eaf1
Show file tree
Hide file tree
Showing 28 changed files with 756 additions and 524 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@
import org.apache.hadoop.ozone.container.common.transport.server.ratis.DispatcherContext;
import org.apache.hadoop.ozone.container.common.volume.HddsVolume;
import org.apache.hadoop.ozone.container.common.volume.VolumeSet;
import org.apache.hadoop.ozone.container.ozoneimpl.ContainerScanError;
import org.apache.hadoop.ozone.container.ozoneimpl.DataScanResult;
import org.apache.hadoop.ozone.container.ozoneimpl.OnDemandContainerDataScanner;
import org.apache.hadoop.ozone.container.common.volume.VolumeUsage;
import org.apache.hadoop.util.Time;
Expand All @@ -70,6 +72,7 @@

import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
Expand All @@ -79,7 +82,6 @@
import static org.apache.hadoop.ozone.audit.AuditLogger.PerformanceStringBuilder;
import static org.apache.hadoop.hdds.scm.protocolPB.ContainerCommandResponseBuilders.malformedRequest;
import static org.apache.hadoop.hdds.scm.protocolPB.ContainerCommandResponseBuilders.unsupportedRequest;
import static org.apache.hadoop.ozone.container.common.interfaces.Container.ScanResult;

/**
* Ozone Container dispatcher takes a call from the netty server and routes it
Expand Down Expand Up @@ -396,10 +398,10 @@ private ContainerCommandResponseProto dispatchRequest(
try {
// TODO HDDS-7096 + HDDS-8781: Use on demand scanning for the open
// container instead.
handler.markContainerUnhealthy(container,
ScanResult.unhealthy(ScanResult.FailureType.WRITE_FAILURE,
new File(container.getContainerData().getContainerPath()),
new StorageContainerException(result)));
ContainerScanError error = new ContainerScanError(ContainerScanError.FailureType.WRITE_FAILURE,
new File(container.getContainerData().getContainerPath()),
new StorageContainerException(result));
handler.markContainerUnhealthy(container, DataScanResult.fromErrors(Collections.singletonList(error)));
LOG.info("Marked Container UNHEALTHY, ContainerID: {}", containerID);
} catch (IOException ioe) {
// just log the error here in case marking the container fails,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,71 +33,13 @@
import org.apache.hadoop.hdfs.util.DataTransferThrottler;
import org.apache.hadoop.ozone.container.common.impl.ContainerData;
import org.apache.hadoop.ozone.container.common.volume.VolumeSet;
import org.apache.hadoop.ozone.container.ozoneimpl.DataScanResult;
import org.apache.hadoop.ozone.container.ozoneimpl.MetadataScanResult;

/**
* Interface for Container Operations.
*/
public interface Container<CONTAINERDATA extends ContainerData> {
/**
 * Immutable outcome of a container scan. A result is either healthy, or
 * unhealthy together with the kind of failure, the file that was found to be
 * bad, and the exception that was raised while scanning.
 */
class ScanResult {
  /**
   * The reason a container scan failed, i.e. why the container should be
   * marked unhealthy.
   */
  public enum FailureType {
    MISSING_CONTAINER_DIR,
    MISSING_METADATA_DIR,
    MISSING_CONTAINER_FILE,
    MISSING_CHUNKS_DIR,
    MISSING_CHUNK_FILE,
    CORRUPT_CONTAINER_FILE,
    CORRUPT_CHUNK,
    INCONSISTENT_CHUNK_LENGTH,
    INACCESSIBLE_DB,
    WRITE_FAILURE,
    DELETED_CONTAINER
  }

  private final boolean healthy;
  private final FailureType failureType;
  private final File unhealthyFile;
  private final Throwable exception;

  // Instances are only created through the static factories below.
  private ScanResult(boolean isHealthy, FailureType reason, File badFile,
      Throwable cause) {
    healthy = isHealthy;
    failureType = reason;
    unhealthyFile = badFile;
    exception = cause;
  }

  /**
   * @return a result flagged healthy, with no failure type, file or
   * exception attached.
   */
  public static ScanResult healthy() {
    return new ScanResult(true, null, null, null);
  }

  /**
   * @param type the kind of failure that was detected.
   * @param failingFile the file the failure was detected on.
   * @param exception the error associated with the failure.
   * @return a result flagged unhealthy carrying the given details.
   */
  public static ScanResult unhealthy(FailureType type, File failingFile,
      Throwable exception) {
    final boolean isHealthy = false;
    return new ScanResult(isHealthy, type, failingFile, exception);
  }

  /** @return true if this result was created by {@link #healthy()}. */
  public boolean isHealthy() {
    return this.healthy;
  }

  /** @return the failing file, or null for a healthy result. */
  public File getUnhealthyFile() {
    return this.unhealthyFile;
  }

  /** @return the failure type, or null for a healthy result. */
  public FailureType getFailureType() {
    return this.failureType;
  }

  /** @return the exception given at creation, or null for a healthy result. */
  public Throwable getException() {
    return this.exception;
  }
}

/**
* Creates a container.
Expand Down Expand Up @@ -227,10 +169,10 @@ ContainerReplicaProto getContainerReport()

/**
* check and report the structural integrity of the container.
* @return true if the integrity checks pass
* Scan the container metadata to detect corruption.
* @return A {@link MetadataScanResult} encapsulating the result of the scan.
* @throws InterruptedException if the scanning thread is interrupted before it completes.
*/
ScanResult scanMetaData() throws InterruptedException;
MetadataScanResult scanMetaData() throws InterruptedException;

/**
* Return if the container data should be checksum verified to detect
Expand All @@ -243,15 +185,14 @@ ContainerReplicaProto getContainerReport()
/**
* Perform checksum verification for the container data.
*
* @param throttler A reference of {@link DataTransferThrottler} used to
* perform I/O bandwidth throttling
* @param canceler A reference of {@link Canceler} used to cancel the
* I/O bandwidth throttling (e.g. for shutdown purpose).
* @return true if the checksum verification succeeds
* false otherwise
* @throws InterruptedException if the scan is interrupted.
*/
ScanResult scanData(DataTransferThrottler throttler, Canceler canceler)
* @param throttler A reference of {@link DataTransferThrottler} used to
* perform I/O bandwidth throttling
* @param canceler A reference of {@link Canceler} used to cancel the
* I/O bandwidth throttling (e.g. for shutdown purpose).
* @return A {@link DataScanResult} encapsulating the result of the scan.
* @throws InterruptedException if the scanning thread is interrupted before it completes.
*/
DataScanResult scanData(DataTransferThrottler throttler, Canceler canceler)
throws InterruptedException;

/** Acquire read lock. */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,6 @@
import org.apache.hadoop.ozone.container.keyvalue.TarContainerPacker;
import org.apache.ratis.statemachine.StateMachine;

import static org.apache.hadoop.ozone.container.common.interfaces.Container.ScanResult;

/**
* Dispatcher sends ContainerCommandRequests to Handler. Each Container Type
* should have an implementation for Handler.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.ozone.container.common.interfaces;

import org.apache.hadoop.ozone.container.ozoneimpl.ContainerScanError;

import java.util.List;

/**
* Encapsulates the result of a container scan.
*
* A result exposes an overall health flag, a deleted flag, and the list of
* {@link ContainerScanError}s collected by the scan.
*
* NOTE(review): the container log's logUnhealthy helper writes
* {@code reason.toString()} for a ScanResult, so implementations should
* presumably override {@code toString()} with a readable summary of their
* errors - confirm against the implementing classes.
*/
public interface ScanResult {
/**
* @return true if the scan considers the container healthy.
* NOTE(review): presumably equivalent to {@link #getErrors()} being empty -
* confirm against the implementing classes.
*/
boolean isHealthy();

/**
* @return true if the result reports the container as deleted.
* NOTE(review): presumably set when the container is removed while a scan is
* in progress - confirm against the implementing classes.
*/
boolean isDeleted();

/**
* @return the errors recorded by the scan.
* NOTE(review): whether this can be null or is always a (possibly empty)
* list is not visible here - confirm against the implementing classes.
*/
List<ContainerScanError> getErrors();
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import static org.apache.hadoop.ozone.container.common.interfaces.Container.ScanResult;
import org.apache.hadoop.ozone.container.common.interfaces.ScanResult;

/**
* Utility class defining methods to write to the datanode container log.
Expand Down Expand Up @@ -91,10 +91,7 @@ public static void logClosed(ContainerData containerData) {
*/
public static void logUnhealthy(ContainerData containerData,
ScanResult reason) {
String message = reason.getFailureType() + " for file " +
reason.getUnhealthyFile() +
". Message: " + reason.getException().getMessage();
LOG.error(getMessage(containerData, message));
LOG.error(getMessage(containerData, reason.toString()));
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@
import org.apache.hadoop.ozone.container.keyvalue.helpers.BlockUtils;
import org.apache.hadoop.ozone.container.keyvalue.helpers.KeyValueContainerLocationUtil;
import org.apache.hadoop.ozone.container.keyvalue.helpers.KeyValueContainerUtil;
import org.apache.hadoop.ozone.container.ozoneimpl.DataScanResult;
import org.apache.hadoop.ozone.container.ozoneimpl.MetadataScanResult;
import org.apache.hadoop.ozone.container.replication.ContainerImporter;
import org.apache.hadoop.ozone.container.upgrade.VersionedDatanodeFeatures;
import org.apache.hadoop.util.DiskChecker.DiskOutOfSpaceException;
Expand Down Expand Up @@ -940,11 +942,9 @@ public File getContainerDBFile() {
}

@Override
public ScanResult scanMetaData() throws InterruptedException {
long containerId = containerData.getContainerID();
public MetadataScanResult scanMetaData() throws InterruptedException {
KeyValueContainerCheck checker =
new KeyValueContainerCheck(containerData.getMetadataPath(), config,
containerId, containerData.getVolume(), this);
new KeyValueContainerCheck(config, this);
return checker.fastCheck();
}

Expand All @@ -963,19 +963,15 @@ public boolean shouldScanData() {
}

@Override
public ScanResult scanData(DataTransferThrottler throttler, Canceler canceler)
public DataScanResult scanData(DataTransferThrottler throttler, Canceler canceler)
throws InterruptedException {
if (!shouldScanData()) {
throw new IllegalStateException("The checksum verification can not be" +
" done for container in state "
+ containerData.getState());
}

long containerId = containerData.getContainerID();
KeyValueContainerCheck checker =
new KeyValueContainerCheck(containerData.getMetadataPath(), config,
containerId, containerData.getVolume(), this);

KeyValueContainerCheck checker = new KeyValueContainerCheck(config, this);
return checker.fullCheck(throttler, canceler);
}

Expand Down
Loading

0 comments on commit 445eaf1

Please sign in to comment.