Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HDDS-12150. Abnormal container states should not crash the SCM ContainerReportHandler thread #7882

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -291,9 +291,6 @@ private boolean updateContainerState(final DatanodeDetails datanode,
}

if (replica.getState() == State.CLOSED) {
Preconditions.checkArgument(replica.getBlockCommitSequenceId()
== container.getSequenceId());

/*
For an EC container, only the first index and the parity indexes are
guaranteed to have block data. So, update the container's state in SCM
Expand All @@ -312,6 +309,9 @@ private boolean updateContainerState(final DatanodeDetails datanode,
logger.info("Moving container {} to CLOSED state, datanode {} " +
"reported CLOSED replica with index {}.", containerId, datanode,
replica.getReplicaIndex());
if (!verifyBcsId(replica.getBlockCommitSequenceId(), container.getSequenceId(), datanode, containerId)) {
return true; // ignored = true
}
Comment on lines 309 to +314
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

logger.info should come after verifyBcsId.

containerManager.updateContainerState(containerId,
LifeCycleEvent.CLOSE);
}
Expand All @@ -336,8 +336,9 @@ private boolean updateContainerState(final DatanodeDetails datanode,
if (replica.getState() == State.CLOSED) {
logger.info("Moving container {} to CLOSED state, datanode {} " +
"reported CLOSED replica.", containerId, datanode);
Preconditions.checkArgument(replica.getBlockCommitSequenceId()
== container.getSequenceId());
if (!verifyBcsId(replica.getBlockCommitSequenceId(), container.getSequenceId(), datanode, containerId)) {
return true; // ignored = true
}
containerManager.updateContainerState(containerId,
LifeCycleEvent.FORCE_CLOSE);
}
Expand Down Expand Up @@ -376,6 +377,33 @@ private boolean updateContainerState(final DatanodeDetails datanode,
return ignored;
}

/**
* Helper method to verify that the replica's bcsId matches the container's in SCM.
* Logs an error and returns false if the bcsIds do not match; no exception is thrown for a mismatch.
* <p>
* @param replicaBcsId Replica bcsId
* @param containerBcsId Container bcsId in SCM
* @param datanode DatanodeDetails for logging
* @param containerId ContainerID for logging
* @return true if verification has passed, false otherwise
* @throws IOException not thrown for a bcsId mismatch, which is reported through the return value instead
Comment on lines +381 to +389
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The method doesn't throw IOException if the bcsIds do not match.

*/
private boolean verifyBcsId(long replicaBcsId, long containerBcsId,
    DatanodeDetails datanode, ContainerID containerId) throws IOException {
  // Happy path first: the replica agrees with SCM's view of the container.
  if (replicaBcsId == containerBcsId) {
    return true;
  }

  // A mismatch is reported through the return value, not an exception, so
  // the caller can ignore this report and keep processing. Nothing in this
  // method throws IOException; the clause is kept only so the signature
  // stays unchanged for existing callers.
  // Parameterized logging, consistent with the other log statements in this
  // class; the rendered message is the same as the concatenated form.
  logger.error("Unexpected bcsId for container {} from datanode {}. " +
      "replica's: {}, SCM's: {}. Ignoring container report for {}",
      containerId, datanode, replicaBcsId, containerBcsId, containerId);
  return false;
}

private void updateContainerReplica(final DatanodeDetails datanodeDetails,
final ContainerID containerId,
final ContainerReplicaProto replicaProto)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ private void testReplicaIndexUpdate(ContainerInfo container,
Map<DatanodeDetails, Integer> expectedReplicaMap) {
final ContainerReportsProto containerReport = getContainerReportsProto(
container.containerID(), ContainerReplicaProto.State.CLOSED,
dn.getUuidString(), 2000000000L, 100000000L, replicaIndex);
dn.getUuidString(), 2000000000L, 100000000L, 10000L, replicaIndex);
final ContainerReportFromDatanode containerReportFromDatanode =
new ContainerReportFromDatanode(dn, containerReport);
final ContainerReportHandler reportHandler = new ContainerReportHandler(
Expand Down Expand Up @@ -604,7 +604,7 @@ private void createAndHandleContainerReport(ContainerID containerID,

@Test
public void testClosingToQuasiClosed()
throws NodeNotFoundException, IOException, TimeoutException {
throws NodeNotFoundException, IOException {
/*
* The container is in CLOSING state and all the replicas are in
* OPEN/CLOSING state.
Expand Down Expand Up @@ -671,7 +671,7 @@ public void testClosingToQuasiClosed()

@Test
public void testQuasiClosedToClosed()
throws NodeNotFoundException, IOException, TimeoutException {
throws NodeNotFoundException, IOException {
/*
* The container is in QUASI_CLOSED state.
* - One of the replica is in QUASI_CLOSED state
Expand Down Expand Up @@ -740,6 +740,70 @@ public void testQuasiClosedToClosed()
assertEquals(LifeCycleState.CLOSED, containerManager.getContainer(containerOne.containerID()).getState());
}

@Test
public void testQuasiClosedToClosedAttemptWithMismatchingBCSID()
    throws NodeNotFoundException, IOException {
  /*
   * Negative test. When a replica with a (lower) mismatching bcsId gets reported,
   * expect the ContainerReportHandler thread to not throw uncaught exception
   * (which could lead to ContainerReportHandler thread crash before HDDS-12150)
   * and expect the container to remain in QUASI_CLOSED state.
   */

  final ContainerReportHandler reportHandler = new ContainerReportHandler(nodeManager, containerManager);
  final Iterator<DatanodeDetails> nodeIterator = nodeManager.getNodes(
      NodeStatus.inServiceHealthy()).iterator();

  final DatanodeDetails datanodeOne = nodeIterator.next();
  final DatanodeDetails datanodeTwo = nodeIterator.next();
  final DatanodeDetails datanodeThree = nodeIterator.next();

  // containerOne is the container under test; containerTwo is an already
  // CLOSED container hosted on the same three datanodes.
  final ContainerInfo containerOne = getContainer(LifeCycleState.QUASI_CLOSED);
  final ContainerInfo containerTwo = getContainer(LifeCycleState.CLOSED);

  final Set<ContainerID> containerIDSet = Stream.of(
      containerOne.containerID(), containerTwo.containerID())
      .collect(Collectors.toSet());

  // containerOne: one QUASI_CLOSED replica and two CLOSING replicas.
  final Set<ContainerReplica> containerOneReplicas = getReplicas(
      containerOne.containerID(), ContainerReplicaProto.State.QUASI_CLOSED,
      10000L, // sequenceId
      datanodeOne);
  containerOneReplicas.addAll(getReplicas(
      containerOne.containerID(), ContainerReplicaProto.State.CLOSING,
      10000L,
      datanodeTwo, datanodeThree));

  // containerTwo: three CLOSED replicas.
  final Set<ContainerReplica> containerTwoReplicas = getReplicas(
      containerTwo.containerID(), ContainerReplicaProto.State.CLOSED,
      10000L,
      datanodeOne, datanodeTwo, datanodeThree);

  nodeManager.setContainers(datanodeOne, containerIDSet);
  nodeManager.setContainers(datanodeTwo, containerIDSet);
  nodeManager.setContainers(datanodeThree, containerIDSet);

  containerStateManager.addContainer(containerOne.getProtobuf());
  containerStateManager.addContainer(containerTwo.getProtobuf());

  // Register every replica set under the container it actually belongs to.
  containerOneReplicas.forEach(r ->
      containerStateManager.updateContainerReplica(
          containerOne.containerID(), r));

  containerTwoReplicas.forEach(r ->
      containerStateManager.updateContainerReplica(
          containerTwo.containerID(), r));

  // datanodeOne now reports containerOne as CLOSED with bcsId 2000, which is
  // lower than the 10000 used for the replicas registered above.
  final ContainerReportsProto containerReport = getContainerReportsProto(
      containerOne.containerID(), ContainerReplicaProto.State.CLOSED,
      datanodeOne.getUuidString(),
      2000L);

  final ContainerReportFromDatanode containerReportFromDatanode =
      new ContainerReportFromDatanode(datanodeOne, containerReport);
  reportHandler.onMessage(containerReportFromDatanode, publisher);

  // The mismatching report must be ignored: no transition to CLOSED.
  assertEquals(LifeCycleState.QUASI_CLOSED, containerManager.getContainer(containerOne.containerID()).getState());
}

@Test
public void openContainerKeyAndBytesUsedUpdatedToMinimumOfAllReplicas()
throws IOException, TimeoutException {
Expand Down Expand Up @@ -1095,7 +1159,7 @@ private ContainerReportFromDatanode getContainerReportFromDatanode(
DatanodeDetails dn, long bytesUsed, long keyCount, int replicaIndex) {
ContainerReportsProto containerReport = getContainerReportsProto(
containerId, state, dn.getUuidString(), bytesUsed, keyCount,
replicaIndex);
10000L, replicaIndex);

return new ContainerReportFromDatanode(dn, containerReport);
}
Expand All @@ -1104,20 +1168,34 @@ protected static ContainerReportsProto getContainerReportsProto(
final ContainerID containerId, final ContainerReplicaProto.State state,
final String originNodeId) {
return getContainerReportsProto(containerId, state, originNodeId,
2000000000L, 100000000L, 0);
2000000000L, 100000000L, 10000L, 0);
}

/**
 * Convenience overload that lets the caller choose the reported bcsId.
 * Delegates to the full overload with fixed usedBytes and keyCount values
 * and replica index 0.
 *
 * @param containerId id of the reported container
 * @param state replica state to report
 * @param originNodeId origin node id to report
 * @param bcsId block commit sequence id to report
 * @return the report produced by the full overload
 */
protected static ContainerReportsProto getContainerReportsProto(
    final ContainerID containerId, final ContainerReplicaProto.State state,
    final String originNodeId, final long bcsId) {
  final long usedBytes = 2000000000L;
  final long keyCount = 100000000L;
  final int replicaIndex = 0;
  return getContainerReportsProto(
      containerId, state, originNodeId,
      usedBytes, keyCount, bcsId, replicaIndex);
}

protected static ContainerReportsProto getContainerReportsProto(
final ContainerID containerId, final ContainerReplicaProto.State state,
final String originNodeId, int replicaIndex) {
return getContainerReportsProto(containerId, state, originNodeId,
2000000000L, 100000000L, replicaIndex);
2000000000L, 100000000L, 10000L, replicaIndex);
}

/**
 * Convenience overload that lets the caller choose both the reported bcsId
 * and the replica index. Delegates to the full overload with fixed usedBytes
 * and keyCount values.
 *
 * @param containerId id of the reported container
 * @param state replica state to report
 * @param originNodeId origin node id to report
 * @param bcsId block commit sequence id to report
 * @param replicaIndex replica index to report
 * @return the report produced by the full overload
 */
protected static ContainerReportsProto getContainerReportsProto(
    final ContainerID containerId, final ContainerReplicaProto.State state,
    final String originNodeId, final long bcsId, int replicaIndex) {
  final long usedBytes = 2000000000L;
  final long keyCount = 100000000L;
  return getContainerReportsProto(
      containerId, state, originNodeId,
      usedBytes, keyCount, bcsId, replicaIndex);
}

protected static ContainerReportsProto getContainerReportsProto(
final ContainerID containerId, final ContainerReplicaProto.State state,
final String originNodeId, final long usedBytes, final long keyCount,
final int replicaIndex) {
final long bcsId, final int replicaIndex) {
final ContainerReportsProto.Builder crBuilder =
ContainerReportsProto.newBuilder();
final ContainerReplicaProto replicaProto =
Expand All @@ -1133,7 +1211,7 @@ protected static ContainerReportsProto getContainerReportsProto(
.setWriteCount(100000000L)
.setReadBytes(2000000000L)
.setWriteBytes(2000000000L)
.setBlockCommitSequenceId(10000L)
.setBlockCommitSequenceId(bcsId)
.setDeleteTransactionId(0)
.setReplicaIndex(replicaIndex)
.build();
Expand Down