Bug 9043 - VM manual migration - when destination host is out of memory for migration, VMs being migrated remained in 'migrating' state

Changes:
- When migration fails we try to do cleanup on the destination host agent. The AgentUnavailableException in this cleanup was not caught.
-Due to that other cleanup like reverting capacity allocated and vm state were skipped.
-Fix is to catch the AgentUnavailableException so that rest of the cleanup can happen.
- Also corrected the exceptions in various cases of migration failure.
- In case the VM is still starting, HA should schedule a retry. Introduced a special migration exception for handling this.
This commit is contained in:
prachi 2011-04-04 17:28:58 -07:00
parent 6736569d22
commit 53f8ebf6f0
8 changed files with 76 additions and 16 deletions

View File

@ -31,6 +31,7 @@ import com.cloud.exception.ConcurrentOperationException;
import com.cloud.exception.InvalidParameterValueException;
import com.cloud.exception.ManagementServerException;
import com.cloud.exception.ResourceUnavailableException;
import com.cloud.exception.VirtualMachineMigrationException;
import com.cloud.host.Host;
import com.cloud.user.Account;
import com.cloud.uservm.UserVm;
@ -123,6 +124,9 @@ public class MigrateVMCmd extends BaseAsyncCmd {
} catch (ManagementServerException e) {
s_logger.warn("Exception: ", e);
throw new ServerApiException(BaseCmd.INTERNAL_ERROR, e.getMessage());
}
} catch (VirtualMachineMigrationException e) {
s_logger.warn("Exception: ", e);
throw new ServerApiException(BaseCmd.INTERNAL_ERROR, e.getMessage());
}
}
}

View File

@ -0,0 +1,28 @@
/**
* Copyright (C) 2010 Cloud.com, Inc. All rights reserved.
*
* This software is licensed under the GNU General Public License v3 or later.
*
* It is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package com.cloud.exception;
import com.cloud.utils.SerialVersionUID;
public class VirtualMachineMigrationException extends Exception {
private static final long serialVersionUID = SerialVersionUID.VirtualMachineMigrationException;
public VirtualMachineMigrationException(String message) {
super(message);
}
}

View File

@ -44,6 +44,7 @@ import com.cloud.exception.PermissionDeniedException;
import com.cloud.exception.ResourceAllocationException;
import com.cloud.exception.ResourceUnavailableException;
import com.cloud.exception.StorageUnavailableException;
import com.cloud.exception.VirtualMachineMigrationException;
import com.cloud.hypervisor.Hypervisor.HypervisorType;
import com.cloud.offering.ServiceOffering;
import com.cloud.storage.Volume;
@ -286,6 +287,7 @@ public interface UserVmService {
* @throws ManagementServerException in case we get error finding the VM or host or access errors or other internal errors.
* @throws ConcurrentOperationException if there are multiple users working on the same VM.
* @throws ResourceUnavailableException if the destination host to migrate the VM is not currently available.
* @throws VirtualMachineMigrationException if the VM to be migrated is not in Running state
*/
UserVm migrateVirtualMachine(UserVm vm, Host destinationHost) throws ResourceUnavailableException, ConcurrentOperationException, ManagementServerException;
UserVm migrateVirtualMachine(UserVm vm, Host destinationHost) throws ResourceUnavailableException, ConcurrentOperationException, ManagementServerException, VirtualMachineMigrationException;
}

View File

@ -49,6 +49,7 @@ import com.cloud.exception.InsufficientCapacityException;
import com.cloud.exception.InsufficientServerCapacityException;
import com.cloud.exception.OperationTimedoutException;
import com.cloud.exception.ResourceUnavailableException;
import com.cloud.exception.VirtualMachineMigrationException;
import com.cloud.ha.dao.HighAvailabilityDao;
import com.cloud.host.Host;
import com.cloud.host.HostVO;
@ -479,6 +480,10 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager, Clu
s_logger.warn("Insufficient capacity for migrating a VM.");
_agentMgr.maintenanceFailed(srcHostId);
return (System.currentTimeMillis() >> 10) + _migrateRetryInterval;
} catch (VirtualMachineMigrationException e) {
s_logger.warn("Looks like VM is still starting, we need to retry migrating the VM later.");
_agentMgr.maintenanceFailed(srcHostId);
return (System.currentTimeMillis() >> 10) + _migrateRetryInterval;
}
}

View File

@ -100,6 +100,7 @@ import com.cloud.exception.PermissionDeniedException;
import com.cloud.exception.ResourceAllocationException;
import com.cloud.exception.ResourceUnavailableException;
import com.cloud.exception.StorageUnavailableException;
import com.cloud.exception.VirtualMachineMigrationException;
import com.cloud.ha.HighAvailabilityManager;
import com.cloud.host.Host;
import com.cloud.host.HostVO;
@ -2910,7 +2911,7 @@ public class UserVmManagerImpl implements UserVmManager, UserVmService, Manager
}
@Override
public UserVm migrateVirtualMachine(UserVm vm, Host destinationHost) throws ResourceUnavailableException, ConcurrentOperationException, ManagementServerException{
public UserVm migrateVirtualMachine(UserVm vm, Host destinationHost) throws ResourceUnavailableException, ConcurrentOperationException, ManagementServerException, VirtualMachineMigrationException{
//access check - only root admin can migrate VM
Account caller = UserContext.current().getCaller();
if(caller.getType() != Account.ACCOUNT_TYPE_ADMIN){

View File

@ -29,6 +29,7 @@ import com.cloud.exception.InsufficientServerCapacityException;
import com.cloud.exception.ManagementServerException;
import com.cloud.exception.OperationTimedoutException;
import com.cloud.exception.ResourceUnavailableException;
import com.cloud.exception.VirtualMachineMigrationException;
import com.cloud.hypervisor.Hypervisor.HypervisorType;
import com.cloud.network.NetworkVO;
import com.cloud.service.ServiceOfferingVO;
@ -93,9 +94,9 @@ public interface VirtualMachineManager extends Manager {
<T extends VMInstanceVO> boolean destroy(T vm, User caller, Account account) throws AgentUnavailableException, OperationTimedoutException, ConcurrentOperationException;
boolean migrateAway(VirtualMachine.Type type, long vmid, long hostId) throws InsufficientServerCapacityException;
boolean migrateAway(VirtualMachine.Type type, long vmid, long hostId) throws InsufficientServerCapacityException, VirtualMachineMigrationException;
<T extends VMInstanceVO> T migrate(T vm, long srcHostId, DeployDestination dest) throws ResourceUnavailableException, ConcurrentOperationException, ManagementServerException;
<T extends VMInstanceVO> T migrate(T vm, long srcHostId, DeployDestination dest) throws ResourceUnavailableException, ConcurrentOperationException, ManagementServerException, VirtualMachineMigrationException;
<T extends VMInstanceVO> T reboot(T vm, Map<VirtualMachineProfile.Param, Object> params, User caller, Account account) throws InsufficientCapacityException, ResourceUnavailableException;

View File

@ -85,6 +85,7 @@ import com.cloud.exception.InsufficientServerCapacityException;
import com.cloud.exception.ManagementServerException;
import com.cloud.exception.OperationTimedoutException;
import com.cloud.exception.ResourceUnavailableException;
import com.cloud.exception.VirtualMachineMigrationException;
import com.cloud.ha.HighAvailabilityManager;
import com.cloud.ha.HighAvailabilityManager.WorkType;
import com.cloud.host.Host;
@ -959,36 +960,37 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, Listene
}
@Override
public <T extends VMInstanceVO> T migrate(T vm, long srcHostId, DeployDestination dest) throws ResourceUnavailableException, ConcurrentOperationException, ManagementServerException {
public <T extends VMInstanceVO> T migrate(T vm, long srcHostId, DeployDestination dest) throws ResourceUnavailableException, ConcurrentOperationException, ManagementServerException, VirtualMachineMigrationException {
s_logger.info("Migrating " + vm + " to " + dest);
long dstHostId = dest.getHost().getId();
Host fromHost = _hostDao.findById(srcHostId);
if (fromHost == null) {
s_logger.info("Unable to find the host to migrate from: " + srcHostId);
throw new ManagementServerException("Unable to find the host to migrate from: " + srcHostId);
throw new CloudRuntimeException("Unable to find the host to migrate from: " + srcHostId);
}
if(fromHost.getClusterId().longValue() != dest.getCluster().getId()){
s_logger.info("Source and destination host are not in same cluster, unable to migrate to host: " + dest.getHost().getId());
throw new ManagementServerException("Source and destination host are not in same cluster, unable to migrate to host: " + dest.getHost().getId());
throw new CloudRuntimeException("Source and destination host are not in same cluster, unable to migrate to host: " + dest.getHost().getId());
}
VirtualMachineGuru<T> vmGuru = getVmGuru(vm);
vm = vmGuru.findById(vm.getId());
long vmId = vm.getId();
vm = vmGuru.findById(vmId);
if (vm == null) {
if (s_logger.isDebugEnabled()) {
s_logger.debug("Unable to find the vm " + vm);
}
throw new ManagementServerException("Unable to find a virtual machine with id " + vm.getId());
throw new ManagementServerException("Unable to find a virtual machine with id " + vmId);
}
if(vm.getState() != State.Running){
if (s_logger.isDebugEnabled()) {
s_logger.debug("VM is not Running, unable to migrate the vm " + vm);
}
throw new ManagementServerException("VM is not Running, unable to migrate the vm " + vm);
throw new VirtualMachineMigrationException("VM is not Running, unable to migrate the vm currently " + vm);
}
short alertType = AlertManager.ALERT_TYPE_USERVM_MIGRATE;
@ -1058,12 +1060,16 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, Listene
try {
if (!checkVmOnHost(vm, dstHostId)) {
s_logger.error("Unable to complete migration for " + vm);
_agentMgr.send(srcHostId, new Commands(cleanup(vm.getInstanceName())), null);
try{
_agentMgr.send(srcHostId, new Commands(cleanup(vm.getInstanceName())), null);
}catch (AgentUnavailableException e) {
s_logger.error("AgentUnavailableException while cleanup on source host: " + srcHostId);
}
cleanup(vmGuru, new VirtualMachineProfileImpl<T>(vm), work, Event.AgentReportStopped, true, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount());
return null;
}
} catch (OperationTimedoutException e) {
}
}
migrated = true;
return vm;
@ -1072,8 +1078,11 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, Listene
s_logger.info("Migration was unsuccessful. Cleaning up: " + vm);
_alertMgr.sendAlert(alertType, fromHost.getDataCenterId(), fromHost.getPodId(), "Unable to migrate vm " + vm.getName() + " from host " + fromHost.getName() + " in zone " + dest.getDataCenter().getName() + " and pod " + dest.getPod().getName(), "Migrate Command failed. Please check logs.");
_agentMgr.send(dstHostId, new Commands(cleanup(vm.getInstanceName())), null);
try{
_agentMgr.send(dstHostId, new Commands(cleanup(vm.getInstanceName())), null);
}catch(AgentUnavailableException ae){
s_logger.info("Looks like the destination Host is unavailable for cleanup");
}
stateTransitTo(vm, Event.OperationFailed, srcHostId);
}
@ -1082,6 +1091,7 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, Listene
_workDao.update(work.getId(), work);
}
}
protected void cancelWorkItems(long nodeId) {
GlobalLock scanLock = GlobalLock.getInternLock(this.getClass().getName());
@ -1119,7 +1129,7 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, Listene
}
@Override
public boolean migrateAway(VirtualMachine.Type vmType, long vmId, long srcHostId) throws InsufficientServerCapacityException {
public boolean migrateAway(VirtualMachine.Type vmType, long vmId, long srcHostId) throws InsufficientServerCapacityException, VirtualMachineMigrationException {
VirtualMachineGuru<? extends VMInstanceVO> vmGuru = _vmGurus.get(vmType);
VMInstanceVO vm = vmGuru.findById(vmId);
if (vm == null) {
@ -1170,6 +1180,14 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, Listene
s_logger.debug("Unable to migrate VM due to: " + e.getMessage());
} catch (ManagementServerException e) {
s_logger.debug("Unable to migrate VM: " + e.getMessage());
} catch (VirtualMachineMigrationException e) {
s_logger.debug("Got VirtualMachineMigrationException, Unable to migrate: " + e.getMessage());
if(vm.getState() == State.Starting){
s_logger.debug("VM seems to be still Starting, we should retry migration later");
throw e;
}else{
s_logger.debug("Unable to migrate VM, VM is not in Running or even Starting state, current state: "+vm.getState().toString());
}
}
if (vmInstance != null) {
return true;

View File

@ -60,4 +60,5 @@ public interface SerialVersionUID {
public static final long ConnectionException = Base | 0x20;
public static final long PermissionDeniedException = Base | 0x21;
public static final long sshException = Base | 0x22;
public static final long VirtualMachineMigrationException = Base | 0x24;
}