This commit is contained in:
James Peru Mmbono 2026-05-12 08:17:27 +01:00 committed by GitHub
commit 744f95c43a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 76 additions and 0 deletions

View File

@ -514,6 +514,17 @@ public class LinstorStorageAdaptor implements StorageAdaptor {
ApiCallRcList answers = api.resourceDefinitionDelete(rd.getName());
checkLinstorAnswersThrow(answers);
deleted = true;
// LINSTOR can return success here while the resource lingers in DELETING state
// on the controller (down peer, lost quorum, etc.). Confirm it's actually gone;
// if not, log a WARN so operators can clear it manually. Don't throw: the
// CloudStack-side accounting has already moved on.
if (!LinstorUtil.waitForResourceDefinitionDeleted(api, rd.getName(),
LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS)) {
logger.warn("Linstor: resource {} still present {}ms after delete returned success — " +
"may be stuck in DELETING. Check the LINSTOR controller (linstor resource list).",
rd.getName(), LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS);
}
}
}
return deleted;

View File

@ -232,6 +232,20 @@ public class LinstorPrimaryDataStoreDriverImpl implements PrimaryDataStoreDriver
throw new CloudRuntimeException("Linstor: Unable to delete resource definition: " + rscDefName);
}
logger.info("Linstor: Deleted resource {}", rscDefName);
// LINSTOR can return success on the delete API call while the resource lingers in
// DELETING state (peer issues, lost quorum, satellite down). Verify the resource is
// actually gone; if not, log a WARN so operators see it. We deliberately do NOT
// throw here: the volume is already considered gone on the CloudStack side, and
// throwing would leave the CS DB and LINSTOR in different states.
if (!LinstorUtil.waitForResourceDefinitionDeleted(linstorApi, rscDefName,
LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS))
{
logger.warn("Linstor: resource {} still present {}ms after delete returned success — " +
"may be stuck in DELETING. Check the LINSTOR controller (linstor resource list) " +
"and clear manually if the resource has no live peers.",
rscDefName, LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS);
}
} catch (ApiException apiEx)
{
logger.error("Linstor: ApiEx - " + apiEx.getMessage());

View File

@ -401,6 +401,57 @@ public class LinstorUtil {
.collect(Collectors.toList());
}
/**
 * Default timeout, in milliseconds (30 seconds), that callers pass to
 * {@link #waitForResourceDefinitionDeleted} when verifying a delete. Intended to be long
 * enough for a healthy LINSTOR controller to finish a normal delete, yet short enough not to
 * block the calling thread for too long if the delete is genuinely stuck.
 */
public static final long DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS = 30_000L;
/**
 * Checks whether the named resource definition has disappeared from the LINSTOR controller.
 * Intended as the follow-up check after a {@code resourceDefinitionDelete}, which can report
 * success while the resource is still lingering in DELETING state (peer issues, lost quorum,
 * down satellites).
 *
 * <p>The name comparison is case-insensitive. A {@code null} list from the controller is
 * treated as "gone".
 *
 * <p>NOTE(review): the list call passes {@code null} for its filter arguments, which
 * presumably fetches every resource definition; confirm whether a server-side name filter
 * could be used instead.
 *
 * @param api LINSTOR client used to list resource definitions
 * @param rscName name of the resource definition to look for
 * @return {@code true} if no listed resource definition carries that name
 * @throws ApiException if listing the resource definitions fails
 */
public static boolean isResourceDefinitionGone(DevelopersApi api, String rscName) throws ApiException {
    final List<ResourceDefinition> definitions = api.resourceDefinitionList(null, false, null, null, null);
    if (definitions != null) {
        for (ResourceDefinition definition : definitions) {
            if (rscName.equalsIgnoreCase(definition.getName())) {
                // Still listed by the controller, so the delete has not completed.
                return false;
            }
        }
    }
    return true;
}
/**
 * Polls the controller until the named resource definition is gone or the timeout elapses.
 *
 * <p>The first check happens immediately; further checks are spaced about one second apart.
 * Elapsed time is measured with {@link System#nanoTime()} so that wall-clock adjustments
 * (NTP steps, manual changes) can neither cut the wait short nor extend it. The sleep before
 * each re-check is clamped to the time remaining, so the call does not overshoot the timeout
 * by a full poll interval.
 *
 * <p>Callers should NOT throw on a {@code false} return: the upstream delete call already
 * reported success and the operator may need to investigate manually. Log a WARN with the
 * resource name instead.
 *
 * @param api LINSTOR client used for the presence check
 * @param rscName name of the resource definition expected to disappear
 * @param timeoutMillis maximum time to wait, in milliseconds; zero or negative results in a
 *        single check
 * @return {@code true} if the resource definition was confirmed gone; {@code false} if it was
 *         still present (or the controller kept erroring) when the timeout elapsed, or if the
 *         calling thread was interrupted while waiting
 */
public static boolean waitForResourceDefinitionDeleted(DevelopersApi api, String rscName, long timeoutMillis) {
    final long pollIntervalMillis = 1_000L;
    // Monotonic start point: immune to wall-clock jumps and to overflow of "now + timeout".
    final long startNanos = System.nanoTime();
    while (true) {
        try {
            if (isResourceDefinitionGone(api, rscName)) {
                return true;
            }
        } catch (ApiException e) {
            // Keep polling: the controller may be transiently unavailable.
            LOGGER.debug("LINSTOR delete-verify poll failed for {}: {}", rscName, e.getMessage());
        }
        final long elapsedMillis = (System.nanoTime() - startNanos) / 1_000_000L;
        if (elapsedMillis >= timeoutMillis) {
            return false;
        }
        try {
            // timeoutMillis > elapsedMillis >= 0 here, so the remaining time is positive.
            Thread.sleep(Math.min(pollIntervalMillis, timeoutMillis - elapsedMillis));
        } catch (InterruptedException ie) {
            // Restore the interrupt flag for the caller and stop waiting.
            Thread.currentThread().interrupt();
            return false;
        }
    }
}
/**
* Returns a pair list of resource-definitions with their 1:1 mapped resource-group objects that start with the
* resource name `startWith`