Fix failed backup jobs and handle unfit VMs

Signed-off-by: Abhishek Kumar <abhishek.mrt22@gmail.com>
This commit is contained in:
Abhishek Kumar 2026-05-07 10:34:56 +05:30
parent ebfe83d2ab
commit e2a7bd2e25
5 changed files with 399 additions and 9 deletions

View File

@ -46,6 +46,9 @@ public interface VolumeDao extends GenericDao<VolumeVO, Long>, StateDao<Volume.S
List<VolumeVO> findByInstanceAndType(long id, Volume.Type vType);
/**
 * Lists the volumes of an instance whose state is not one of the given states.
 *
 * @param id     ID of the instance whose volumes are searched
 * @param states volume states to exclude from the result
 * @return the volumes of the instance that are in none of the given states
 */
List<VolumeVO> findByInstanceAndNotStates(long id, Volume.State...states);
List<VolumeVO> findIncludingRemovedByInstanceAndType(long id, Volume.Type vType);
List<VolumeVO> findNonDestroyedVolumesByInstanceIdAndPoolId(long instanceId, long poolId);

View File

@ -208,6 +208,17 @@ public class VolumeDaoImpl extends GenericDaoBase<VolumeVO, Long> implements Vol
return listBy(sc);
}
/**
 * Lists the volumes of an instance whose state is not one of the given states.
 *
 * When no states are given (null or empty array) the state filter is omitted
 * entirely, so every volume of the instance matched by {@code listBy} is returned.
 *
 * @param id     ID of the instance whose volumes are searched
 * @param states volume states to exclude from the result; may be empty
 * @return the volumes of the instance that are in none of the given states
 */
@Override
public List<VolumeVO> findByInstanceAndNotStates(long id, Volume.State...states) {
    // A NOT IN condition with zero parameters would render an empty "NOT IN ()"
    // clause, so only register the state condition when there is something to exclude.
    final boolean filterByState = states != null && states.length > 0;

    SearchBuilder<VolumeVO> sb = createSearchBuilder();
    sb.and("instanceId", sb.entity().getInstanceId(), Op.EQ);
    if (filterByState) {
        sb.and("state", sb.entity().getState(), Op.NIN);
    }

    SearchCriteria<VolumeVO> sc = sb.create();
    sc.setParameters("instanceId", id);
    if (filterByState) {
        // Cast so the array is spread as varargs instead of being passed as one element.
        sc.setParameters("state", (Object[]) states);
    }
    return listBy(sc);
}
@Override
public List<VolumeVO> findIncludingRemovedByInstanceAndType(long id, Type vType) {
SearchCriteria<VolumeVO> sc = AllFieldsSearch.create();

View File

@ -41,6 +41,7 @@ import org.mockito.Mockito;
import org.mockito.Spy;
import org.mockito.junit.MockitoJUnitRunner;
import com.cloud.storage.Volume;
import com.cloud.storage.VolumeVO;
import com.cloud.utils.db.Filter;
import com.cloud.utils.db.SearchBuilder;
@ -113,6 +114,69 @@ public class VolumeDaoImplTest {
verify(preparedStatementMock, times(1)).executeQuery();
}
/**
 * Verifies that the instance ID and the single excluded state are both
 * handed to the search criteria.
 */
@Test
public void findByInstanceAndNotState_queriesWithInstanceIdAndExcludedStates() {
    final long instanceId = 42L;
    final Volume.State[] excludedStates = {Volume.State.Ready};

    // Mocked search plumbing: the builder yields the criteria we later verify against.
    SearchBuilder<VolumeVO> searchBuilder = Mockito.mock(SearchBuilder.class);
    SearchCriteria<VolumeVO> criteria = Mockito.mock(SearchCriteria.class);
    VolumeVO entity = Mockito.mock(VolumeVO.class);
    Mockito.when(searchBuilder.entity()).thenReturn(entity);
    Mockito.when(searchBuilder.create()).thenReturn(criteria);
    Mockito.when(volumeDao.createSearchBuilder()).thenReturn(searchBuilder);
    Mockito.doReturn(new ArrayList<VolumeVO>()).when(volumeDao).listBy(criteria);

    volumeDao.findByInstanceAndNotStates(instanceId, excludedStates);

    Mockito.verify(criteria).setParameters("instanceId", instanceId);
    Mockito.verify(criteria).setParameters("state", (Object[]) excludedStates);
}
/**
 * Verifies that every excluded state is forwarded to the search criteria
 * when more than one state is supplied.
 */
@Test
public void findByInstanceAndNotStates_withMultipleExcludedStates_passesAllStatesToCriteria() {
    final long instanceId = 7L;
    final Volume.State[] excludedStates = {Volume.State.Destroy, Volume.State.Expunged};

    // Mocked search plumbing: the builder yields the criteria we later verify against.
    SearchBuilder<VolumeVO> searchBuilder = Mockito.mock(SearchBuilder.class);
    SearchCriteria<VolumeVO> criteria = Mockito.mock(SearchCriteria.class);
    VolumeVO entity = Mockito.mock(VolumeVO.class);
    Mockito.when(searchBuilder.entity()).thenReturn(entity);
    Mockito.when(searchBuilder.create()).thenReturn(criteria);
    Mockito.when(volumeDao.createSearchBuilder()).thenReturn(searchBuilder);
    Mockito.doReturn(new ArrayList<VolumeVO>()).when(volumeDao).listBy(criteria);

    volumeDao.findByInstanceAndNotStates(instanceId, excludedStates);

    Mockito.verify(criteria).setParameters("instanceId", instanceId);
    Mockito.verify(criteria).setParameters("state", (Object[]) excludedStates);
}
/**
 * Verifies that the list produced by {@code listBy} is returned to the caller as-is.
 */
@Test
public void findByInstanceAndNotStates_returnsResultFromDao() {
    SearchBuilder<VolumeVO> searchBuilder = Mockito.mock(SearchBuilder.class);
    SearchCriteria<VolumeVO> criteria = Mockito.mock(SearchCriteria.class);
    VolumeVO entity = Mockito.mock(VolumeVO.class);
    VolumeVO expectedVolume = Mockito.mock(VolumeVO.class);
    Mockito.when(searchBuilder.entity()).thenReturn(entity);
    Mockito.when(searchBuilder.create()).thenReturn(criteria);
    Mockito.when(volumeDao.createSearchBuilder()).thenReturn(searchBuilder);
    Mockito.doReturn(List.of(expectedVolume)).when(volumeDao).listBy(criteria);

    List<VolumeVO> volumes = volumeDao.findByInstanceAndNotStates(1L, Volume.State.Ready);

    Assert.assertEquals(1, volumes.size());
    Assert.assertSame(expectedVolume, volumes.get(0));
}
/**
 * Verifies that an empty result from {@code listBy} is returned as an empty list.
 */
@Test
public void findByInstanceAndNotStates_noMatchingVolumes_returnsEmptyList() {
    SearchBuilder<VolumeVO> searchBuilder = Mockito.mock(SearchBuilder.class);
    SearchCriteria<VolumeVO> criteria = Mockito.mock(SearchCriteria.class);
    VolumeVO entity = Mockito.mock(VolumeVO.class);
    Mockito.when(searchBuilder.entity()).thenReturn(entity);
    Mockito.when(searchBuilder.create()).thenReturn(criteria);
    Mockito.when(volumeDao.createSearchBuilder()).thenReturn(searchBuilder);
    Mockito.doReturn(new ArrayList<VolumeVO>()).when(volumeDao).listBy(criteria);

    List<VolumeVO> volumes = volumeDao.findByInstanceAndNotStates(99L, Volume.State.Ready);

    Assert.assertTrue(volumes.isEmpty());
}
@Test
public void testSearchRemovedByVmsNoVms() {
Assert.assertTrue(CollectionUtils.isEmpty(volumeDao.searchRemovedByVms(
@ -141,5 +205,4 @@ public class VolumeDaoImplTest {
Mockito.any(SearchCriteria.class), Mockito.any(Filter.class), Mockito.eq(null),
Mockito.eq(false));
}
}

View File

@ -52,6 +52,7 @@ import org.apache.cloudstack.framework.jobs.impl.VmWorkJobVO;
import org.apache.cloudstack.jobs.JobInfo;
import org.apache.cloudstack.storage.datastore.db.PrimaryDataStoreDao;
import org.apache.cloudstack.storage.datastore.db.StoragePoolVO;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.joda.time.DateTime;
import org.springframework.stereotype.Component;
@ -86,6 +87,7 @@ import com.cloud.vm.VMInstanceDetailVO;
import com.cloud.vm.VMInstanceVO;
import com.cloud.vm.VirtualMachine;
import com.cloud.vm.VirtualMachine.State;
import com.cloud.vm.VirtualMachineManager;
import com.cloud.vm.VmDetailConstants;
import com.cloud.vm.VmWork;
import com.cloud.vm.VmWorkConstants;
@ -136,6 +138,9 @@ public class KVMBackupExportServiceImpl extends ManagerBase implements KVMBackup
@Inject
AsyncJobManager asyncJobManager;
// Used by createBackup to look up the cluster and host IDs for the Instance being backed up.
@Inject
VirtualMachineManager virtualMachineManager;
VmWorkJobHandlerProxy jobHandlerProxy = new VmWorkJobHandlerProxy(this);
private void verifyKVMBackupExportServiceSupported(Long zoneId) {
@ -145,24 +150,44 @@ public class KVMBackupExportServiceImpl extends ManagerBase implements KVMBackup
}
}
/**
 * Checks that the given Instance has no volume outside the Ready state.
 *
 * @param vm the Instance whose volumes are checked
 * @throws CloudRuntimeException if at least one volume is not in Ready state;
 *         the message lists the UUIDs of the offending volumes and the Instance UUID
 */
protected void validateVmVolumesForBackup(VMInstanceVO vm) {
    List<VolumeVO> volumesNotReady = volumeDao.findByInstanceAndNotStates(vm.getId(), Volume.State.Ready);
    if (volumesNotReady.isEmpty()) {
        // Nothing blocks the backup.
        return;
    }

    String joinedUuids = StringUtils.join(
            volumesNotReady.stream().map(VolumeVO::getUuid).collect(Collectors.toList()), ",");
    throw new CloudRuntimeException(String.format("Volumes [%s] of Instance: %s are not in Ready state",
            joinedUuids, vm.getUuid()));
}
@Override
public Backup createBackup(StartBackupCmd cmd) {
Long vmId = cmd.getVmId();
VMInstanceVO vm = vmInstanceDao.findById(vmId);
if (vm == null) {
throw new CloudRuntimeException("VM not found: " + vmId);
throw new CloudRuntimeException("Instance not found: " + vmId);
}
verifyKVMBackupExportServiceSupported(vm.getDataCenterId());
if (vm.getState() != State.Running && vm.getState() != State.Stopped) {
throw new CloudRuntimeException("VM must be running or stopped to start backup");
throw new CloudRuntimeException("Instance must be running or stopped to start Backup");
}
Backup existingBackup = backupDao.findByVmId(vmId);
if (existingBackup != null && existingBackup.getStatus() == Backup.Status.BackingUp) {
throw new CloudRuntimeException("Backup already in progress for VM: " + vmId);
throw new CloudRuntimeException("Backup already in progress for Instance: " + vm.getUuid());
}
validateVmVolumesForBackup(vm);
Pair<Long, Long> clusterAndHostId = virtualMachineManager.findClusterAndHostIdForVm(vm, false);
Long hostId = clusterAndHostId.second();
if (hostId == null) {
throw new CloudRuntimeException("Host cannot be determined for Instance: " + vm.getUuid());
}
BackupVO backup = new BackupVO();
@ -190,8 +215,6 @@ public class KVMBackupExportServiceImpl extends ManagerBase implements KVMBackup
backup.setToCheckpointId(toCheckpointId);
backup.setFromCheckpointId(fromCheckpointId);
backup.setType("FULL");
Long hostId = vm.getHostId() != null ? vm.getHostId() : vm.getLastHostId();
backup.setHostId(hostId);
return backupDao.persist(backup);
@ -231,15 +254,20 @@ public class KVMBackupExportServiceImpl extends ManagerBase implements KVMBackup
Long vmId = cmd.getVmId();
VMInstanceVO vm = vmInstanceDao.findById(vmId);
if (vm == null) {
throw new CloudRuntimeException("VM not found: " + vmId);
removeFailedBackup(backup);
throw new CloudRuntimeException("Instance not found for Backup: " + backup.getUuid());
}
List<VolumeVO> volumes = volumeDao.findByInstance(vmId);
Map<String, String> diskPathUuidMap = new HashMap<>();
for (Volume vol : volumes) {
if (vol.getPoolId() == null) {
removeFailedBackup(backup);
throw new CloudRuntimeException("Storage Pool cannot be determined for Volume: " + vol.getUuid());
}
String volumePath = getVolumePathForFileBasedBackend(vol);
diskPathUuidMap.put(volumePath, vol.getUuid());
}
long hostId = backup.getHostId();
Long hostId = backup.getHostId();
VMInstanceDetailVO lastCheckpointId = vmInstanceDetailsDao.findDetail(vmId, VmDetailConstants.LAST_CHECKPOINT_ID);
if (lastCheckpointId != null) {
@ -249,6 +277,10 @@ public class KVMBackupExportServiceImpl extends ManagerBase implements KVMBackup
logger.warn("Failed to delete last checkpoint {} for VM {}, proceeding with backup start", lastCheckpointId.getValue(), vmId, e);
}
}
if (hostId == null) {
removeFailedBackup(backup);
throw new CloudRuntimeException("Host cannot be found for Backup: " + backup.getUuid());
}
Host host = hostDao.findById(hostId);
Map<String, String> vmDetails = vmInstanceDetailsDao.listDetailsKeyPairs(vmId);
@ -276,7 +308,7 @@ public class KVMBackupExportServiceImpl extends ManagerBase implements KVMBackup
if (!answer.getResult()) {
removeFailedBackup(backup);
logger.error("Failed to start {} due to: {}", backup, answer.getDetails());
throw new CloudRuntimeException("Failed to start backup: " + answer.getDetails());
throw new CloudRuntimeException("Failed to start Backup: " + answer.getDetails());
}
// Update backup with checkpoint creation time

View File

@ -0,0 +1,281 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.cloudstack.backup;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertThrows;
import static org.junit.Assert.assertTrue;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.cloudstack.api.command.admin.backup.StartBackupCmd;
import org.apache.cloudstack.backup.dao.BackupDao;
import org.apache.cloudstack.backup.dao.ImageTransferDao;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.mockito.ArgumentCaptor;
import org.mockito.InjectMocks;
import org.mockito.Mock;
import org.mockito.junit.MockitoJUnitRunner;
import com.cloud.storage.Volume;
import com.cloud.storage.VolumeVO;
import com.cloud.storage.dao.VolumeDao;
import com.cloud.utils.Pair;
import com.cloud.utils.exception.CloudRuntimeException;
import com.cloud.vm.VMInstanceVO;
import com.cloud.vm.VirtualMachine.State;
import com.cloud.vm.VirtualMachineManager;
import com.cloud.vm.dao.VMInstanceDao;
import com.cloud.vm.dao.VMInstanceDetailsDao;
/**
 * Unit tests for {@link KVMBackupExportServiceImpl}: volume validation,
 * backup creation and removal of failed backups.
 *
 * All checks use JUnit assertions rather than the Java {@code assert} keyword,
 * because {@code assert} statements are skipped unless the JVM runs with
 * assertions enabled.
 */
@RunWith(MockitoJUnitRunner.class)
public class KVMBackupExportServiceImplTest {

    @InjectMocks
    KVMBackupExportServiceImpl service;

    @Mock
    VolumeDao volumeDao;

    @Mock
    VMInstanceDao vmInstanceDao;

    @Mock
    VMInstanceDetailsDao vmInstanceDetailsDao;

    @Mock
    BackupDao backupDao;

    @Mock
    ImageTransferDao imageTransferDao;

    @Mock
    VirtualMachineManager virtualMachineManager;

    VMInstanceVO vm;

    /** Creates the shared Instance mock with ID 1 and UUID "vm-uuid". */
    @Before
    public void setUp() {
        vm = mock(VMInstanceVO.class);
        when(vm.getId()).thenReturn(1L);
        when(vm.getUuid()).thenReturn("vm-uuid");
    }

    @Test
    public void validateVmVolumesForBackup_noNonReadyVolumes_doesNotThrow() {
        when(volumeDao.findByInstanceAndNotStates(1L, Volume.State.Ready)).thenReturn(Collections.emptyList());

        service.validateVmVolumesForBackup(vm);
    }

    @Test
    public void validateVmVolumesForBackup_oneVolumeNotReady_throwsWithVolumeAndInstanceId() {
        VolumeVO vol = mock(VolumeVO.class);
        when(vol.getUuid()).thenReturn("vol-not-ready");
        when(volumeDao.findByInstanceAndNotStates(1L, Volume.State.Ready)).thenReturn(List.of(vol));

        CloudRuntimeException ex = assertThrows(CloudRuntimeException.class,
                () -> service.validateVmVolumesForBackup(vm));

        assertTrue(ex.getMessage().contains("vol-not-ready"));
        assertTrue(ex.getMessage().contains("vm-uuid"));
    }

    @Test
    public void validateVmVolumesForBackup_multipleVolumesNotReady_throwsWithAllVolumeIds() {
        VolumeVO vol1 = mock(VolumeVO.class);
        VolumeVO vol2 = mock(VolumeVO.class);
        when(vol1.getUuid()).thenReturn("vol-a");
        when(vol2.getUuid()).thenReturn("vol-b");
        when(volumeDao.findByInstanceAndNotStates(1L, Volume.State.Ready)).thenReturn(List.of(vol1, vol2));

        CloudRuntimeException ex = assertThrows(CloudRuntimeException.class,
                () -> service.validateVmVolumesForBackup(vm));

        assertTrue(ex.getMessage().contains("vol-a"));
        assertTrue(ex.getMessage().contains("vol-b"));
        assertTrue(ex.getMessage().contains("vm-uuid"));
    }

    /** Builds a mocked start-backup command returning the given VM ID, name and description. */
    private StartBackupCmd mockCmd(Long vmId, String name, String description) {
        StartBackupCmd cmd = mock(StartBackupCmd.class);
        when(cmd.getVmId()).thenReturn(vmId);
        when(cmd.getName()).thenReturn(name);
        when(cmd.getDescription()).thenReturn(description);
        return cmd;
    }

    /**
     * Stubs a Running Instance with no existing backup, no non-Ready volumes,
     * no Instance details, and the given host ID (cluster ID 5) from the
     * virtual machine manager.
     */
    private void stubVmRunningWithHost(Long vmId, VMInstanceVO vmInstance, Long hostId) {
        when(vmInstanceDao.findById(vmId)).thenReturn(vmInstance);
        when(vmInstance.getState()).thenReturn(State.Running);
        when(vmInstance.getDataCenterId()).thenReturn(10L);
        when(vmInstance.getAccountId()).thenReturn(100L);
        when(vmInstance.getDomainId()).thenReturn(200L);
        when(backupDao.findByVmId(vmId)).thenReturn(null);
        when(volumeDao.findByInstanceAndNotStates(vmId, Volume.State.Ready)).thenReturn(Collections.emptyList());
        when(virtualMachineManager.findClusterAndHostIdForVm(vmInstance, false))
                .thenReturn(new Pair<>(5L, hostId));
        when(vmInstanceDetailsDao.listDetailsKeyPairs(vmId)).thenReturn(new HashMap<>());
    }

    @Test
    public void createBackup_instanceNotFound_throws() {
        when(vmInstanceDao.findById(99L)).thenReturn(null);

        assertThrows(CloudRuntimeException.class,
                () -> service.createBackup(mockCmd(99L, "backup", null)));
    }

    @Test
    public void createBackup_instanceNotRunningOrStopped_throws() {
        when(vmInstanceDao.findById(1L)).thenReturn(vm);
        when(vm.getState()).thenReturn(State.Migrating);
        when(vm.getDataCenterId()).thenReturn(10L);

        assertThrows(CloudRuntimeException.class,
                () -> service.createBackup(mockCmd(1L, "backup", null)));
    }

    @Test
    public void createBackup_backupAlreadyInProgress_throws() {
        when(vmInstanceDao.findById(1L)).thenReturn(vm);
        when(vm.getState()).thenReturn(State.Running);
        when(vm.getDataCenterId()).thenReturn(10L);
        BackupVO existing = mock(BackupVO.class);
        when(existing.getStatus()).thenReturn(Backup.Status.BackingUp);
        when(backupDao.findByVmId(1L)).thenReturn(existing);

        assertThrows(CloudRuntimeException.class,
                () -> service.createBackup(mockCmd(1L, "backup", null)));
    }

    @Test
    public void createBackup_hostCannotBeDetermined_throws() {
        // A null host ID from the virtual machine manager must abort the backup.
        stubVmRunningWithHost(1L, vm, null);

        assertThrows(CloudRuntimeException.class,
                () -> service.createBackup(mockCmd(1L, "backup", null)));
    }

    @Test
    public void createBackup_happyPath_persistsBackupWithQueuedStatus() {
        stubVmRunningWithHost(1L, vm, 42L);
        BackupVO persisted = mock(BackupVO.class);
        when(backupDao.persist(any(BackupVO.class))).thenReturn(persisted);

        Backup result = service.createBackup(mockCmd(1L, "my-backup", "desc"));

        assertNotNull(result);
        ArgumentCaptor<BackupVO> captor = ArgumentCaptor.forClass(BackupVO.class);
        verify(backupDao).persist(captor.capture());
        assertEquals(Backup.Status.Queued, captor.getValue().getStatus());
        assertEquals("my-backup", captor.getValue().getName());
        assertEquals("desc", captor.getValue().getDescription());
        assertEquals(Long.valueOf(42L), captor.getValue().getHostId());
        assertEquals(Long.valueOf(1L), captor.getValue().getVmId());
    }

    @Test
    public void createBackup_noNameProvided_generatesNameFromVmId() {
        stubVmRunningWithHost(1L, vm, 42L);
        when(backupDao.persist(any(BackupVO.class))).thenReturn(mock(BackupVO.class));

        service.createBackup(mockCmd(1L, null, null));

        ArgumentCaptor<BackupVO> captor = ArgumentCaptor.forClass(BackupVO.class);
        verify(backupDao).persist(captor.capture());
        assertNotNull(captor.getValue().getName());
        assertTrue(captor.getValue().getName().startsWith("1-"));
    }

    @Test
    public void createBackup_existingBackupNotInProgress_proceedsNormally() {
        when(vmInstanceDao.findById(1L)).thenReturn(vm);
        when(vm.getState()).thenReturn(State.Stopped);
        when(vm.getDataCenterId()).thenReturn(10L);
        when(vm.getAccountId()).thenReturn(100L);
        when(vm.getDomainId()).thenReturn(200L);
        BackupVO existing = mock(BackupVO.class);
        when(existing.getStatus()).thenReturn(Backup.Status.BackedUp);
        when(backupDao.findByVmId(1L)).thenReturn(existing);
        when(volumeDao.findByInstanceAndNotStates(1L, Volume.State.Ready)).thenReturn(Collections.emptyList());
        when(virtualMachineManager.findClusterAndHostIdForVm(vm, false)).thenReturn(new Pair<>(5L, 42L));
        when(vmInstanceDetailsDao.listDetailsKeyPairs(1L)).thenReturn(new HashMap<>());
        when(backupDao.persist(any(BackupVO.class))).thenReturn(mock(BackupVO.class));

        Backup result = service.createBackup(mockCmd(1L, "backup", null));

        assertNotNull(result);
    }

    @Test
    public void createBackup_withActiveCheckpoint_setsFromCheckpointId() {
        when(vmInstanceDao.findById(1L)).thenReturn(vm);
        when(vm.getState()).thenReturn(State.Running);
        when(vm.getDataCenterId()).thenReturn(10L);
        when(vm.getAccountId()).thenReturn(100L);
        when(vm.getDomainId()).thenReturn(200L);
        when(backupDao.findByVmId(1L)).thenReturn(null);
        when(volumeDao.findByInstanceAndNotStates(1L, Volume.State.Ready)).thenReturn(Collections.emptyList());
        when(virtualMachineManager.findClusterAndHostIdForVm(vm, false)).thenReturn(new Pair<>(5L, 42L));
        Map<String, String> details = new HashMap<>();
        details.put("active.checkpoint.id", "ckp-abc123");
        when(vmInstanceDetailsDao.listDetailsKeyPairs(1L)).thenReturn(details);
        when(backupDao.persist(any(BackupVO.class))).thenReturn(mock(BackupVO.class));

        service.createBackup(mockCmd(1L, "backup", null));

        ArgumentCaptor<BackupVO> captor = ArgumentCaptor.forClass(BackupVO.class);
        verify(backupDao).persist(captor.capture());
        assertEquals("ckp-abc123", captor.getValue().getFromCheckpointId());
    }

    @Test
    public void createBackup_noActiveCheckpoint_fromCheckpointIdIsNull() {
        stubVmRunningWithHost(1L, vm, 42L);
        when(backupDao.persist(any(BackupVO.class))).thenReturn(mock(BackupVO.class));

        service.createBackup(mockCmd(1L, "backup", null));

        ArgumentCaptor<BackupVO> captor = ArgumentCaptor.forClass(BackupVO.class);
        verify(backupDao).persist(captor.capture());
        assertNull(captor.getValue().getFromCheckpointId());
        assertNotNull(captor.getValue().getToCheckpointId());
        assertTrue(captor.getValue().getToCheckpointId().startsWith("ckp-"));
    }

    @Test
    public void removeFailedBackup_setsErrorStatusAndRemovesRecord() {
        BackupVO backup = mock(BackupVO.class);
        when(backup.getId()).thenReturn(10L);

        service.removeFailedBackup(backup);

        verify(backup).setStatus(Backup.Status.Error);
        verify(backupDao).update(10L, backup);
        verify(backupDao).remove(10L);
    }
}