From 53f8ebf6f0b00350107e520fa32ae8c13abc9af1 Mon Sep 17 00:00:00 2001 From: prachi Date: Mon, 4 Apr 2011 17:28:58 -0700 Subject: [PATCH] Bug 9043 - VM manual migration - when destination host is out of memory for migration, VMs being migrated remained in 'migrating' state Changes: - When migration fails we try to do cleanup on the destination host agent. The AgentUnavailableException in this cleanup was not caught. -Due to that other cleanup like reverting capacity allocated and vm state were skipped. -Fix is to catch the AgentUnavailableException so that rest of the cleanup can happen. - Also corrected the exceptions in various cases of migration failure. - In case the VM is still starting, HA should schedule a retry. Introduced a special migration exception for handling this. --- .../com/cloud/api/commands/MigrateVMCmd.java | 6 ++- .../VirtualMachineMigrationException.java | 28 +++++++++++++ api/src/com/cloud/vm/UserVmService.java | 4 +- .../cloud/ha/HighAvailabilityManagerImpl.java | 5 +++ .../src/com/cloud/vm/UserVmManagerImpl.java | 3 +- .../com/cloud/vm/VirtualMachineManager.java | 5 ++- .../cloud/vm/VirtualMachineManagerImpl.java | 40 ++++++++++++++----- .../src/com/cloud/utils/SerialVersionUID.java | 1 + 8 files changed, 76 insertions(+), 16 deletions(-) create mode 100644 api/src/com/cloud/exception/VirtualMachineMigrationException.java diff --git a/api/src/com/cloud/api/commands/MigrateVMCmd.java b/api/src/com/cloud/api/commands/MigrateVMCmd.java index 7f8ce362532..c3a5cd4b415 100644 --- a/api/src/com/cloud/api/commands/MigrateVMCmd.java +++ b/api/src/com/cloud/api/commands/MigrateVMCmd.java @@ -31,6 +31,7 @@ import com.cloud.exception.ConcurrentOperationException; import com.cloud.exception.InvalidParameterValueException; import com.cloud.exception.ManagementServerException; import com.cloud.exception.ResourceUnavailableException; +import com.cloud.exception.VirtualMachineMigrationException; import com.cloud.host.Host; import com.cloud.user.Account; import com.cloud.uservm.UserVm; @@ -123,6 +124,9 @@ public class MigrateVMCmd extends BaseAsyncCmd { } catch (ManagementServerException e) { s_logger.warn("Exception: ", e); throw new ServerApiException(BaseCmd.INTERNAL_ERROR, e.getMessage()); - } + } catch (VirtualMachineMigrationException e) { + s_logger.warn("Exception: ", e); + throw new ServerApiException(BaseCmd.INTERNAL_ERROR, e.getMessage()); + } } } diff --git a/api/src/com/cloud/exception/VirtualMachineMigrationException.java b/api/src/com/cloud/exception/VirtualMachineMigrationException.java new file mode 100644 index 00000000000..3d136750f4d --- /dev/null +++ b/api/src/com/cloud/exception/VirtualMachineMigrationException.java @@ -0,0 +1,28 @@ +/** + * Copyright (C) 2010 Cloud.com, Inc. All rights reserved. + * + * This software is licensed under the GNU General Public License v3 or later. + * + * It is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or any later version. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + */ +package com.cloud.exception; + +import com.cloud.utils.SerialVersionUID; + +public class VirtualMachineMigrationException extends Exception { + private static final long serialVersionUID = SerialVersionUID.VirtualMachineMigrationException; + + public VirtualMachineMigrationException(String message) { + super(message); + } +} diff --git a/api/src/com/cloud/vm/UserVmService.java b/api/src/com/cloud/vm/UserVmService.java index b8b9e6d46ef..da7d9e32985 100755 --- a/api/src/com/cloud/vm/UserVmService.java +++ b/api/src/com/cloud/vm/UserVmService.java @@ -44,6 +44,7 @@ import com.cloud.exception.PermissionDeniedException; import com.cloud.exception.ResourceAllocationException; import com.cloud.exception.ResourceUnavailableException; import com.cloud.exception.StorageUnavailableException; +import com.cloud.exception.VirtualMachineMigrationException; import com.cloud.hypervisor.Hypervisor.HypervisorType; import com.cloud.offering.ServiceOffering; import com.cloud.storage.Volume; @@ -286,6 +287,7 @@ public interface UserVmService { * @throws ManagementServerException in case we get error finding the VM or host or access errors or other internal errors. * @throws ConcurrentOperationException if there are multiple users working on the same VM. * @throws ResourceUnavailableException if the destination host to migrate the VM is not currently available. + * @throws VirtualMachineMigrationException if the VM to be migrated is not in Running state */ - UserVm migrateVirtualMachine(UserVm vm, Host destinationHost) throws ResourceUnavailableException, ConcurrentOperationException, ManagementServerException; + UserVm migrateVirtualMachine(UserVm vm, Host destinationHost) throws ResourceUnavailableException, ConcurrentOperationException, ManagementServerException, VirtualMachineMigrationException; } diff --git a/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java b/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java index c5344c7d313..c456fcfc9c4 100644 --- a/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java +++ b/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java @@ -49,6 +49,7 @@ import com.cloud.exception.InsufficientCapacityException; import com.cloud.exception.InsufficientServerCapacityException; import com.cloud.exception.OperationTimedoutException; import com.cloud.exception.ResourceUnavailableException; +import com.cloud.exception.VirtualMachineMigrationException; import com.cloud.ha.dao.HighAvailabilityDao; import com.cloud.host.Host; import com.cloud.host.HostVO; @@ -479,6 +480,10 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager, Clu s_logger.warn("Insufficient capacity for migrating a VM."); _agentMgr.maintenanceFailed(srcHostId); return (System.currentTimeMillis() >> 10) + _migrateRetryInterval; + } catch (VirtualMachineMigrationException e) { + s_logger.warn("Looks like VM is still starting, we need to retry migrating the VM later."); + _agentMgr.maintenanceFailed(srcHostId); + return (System.currentTimeMillis() >> 10) + _migrateRetryInterval; } } diff --git a/server/src/com/cloud/vm/UserVmManagerImpl.java b/server/src/com/cloud/vm/UserVmManagerImpl.java index 99f16afc1ca..b2285ebcfe2 100755 --- a/server/src/com/cloud/vm/UserVmManagerImpl.java +++ b/server/src/com/cloud/vm/UserVmManagerImpl.java @@ -100,6 +100,7 @@ import com.cloud.exception.PermissionDeniedException; import com.cloud.exception.ResourceAllocationException; import com.cloud.exception.ResourceUnavailableException; import com.cloud.exception.StorageUnavailableException; +import com.cloud.exception.VirtualMachineMigrationException; import com.cloud.ha.HighAvailabilityManager; import com.cloud.host.Host; import com.cloud.host.HostVO; @@ -2910,7 +2911,7 @@ public class UserVmManagerImpl implements UserVmManager, UserVmService, Manager } @Override - public UserVm migrateVirtualMachine(UserVm vm, Host destinationHost) throws ResourceUnavailableException, ConcurrentOperationException, ManagementServerException{ + public UserVm migrateVirtualMachine(UserVm vm, Host destinationHost) throws ResourceUnavailableException, ConcurrentOperationException, ManagementServerException, VirtualMachineMigrationException{ //access check - only root admin can migrate VM Account caller = UserContext.current().getCaller(); if(caller.getType() != Account.ACCOUNT_TYPE_ADMIN){ diff --git a/server/src/com/cloud/vm/VirtualMachineManager.java b/server/src/com/cloud/vm/VirtualMachineManager.java index 2382a4d6b55..c0e1f95eb01 100644 --- a/server/src/com/cloud/vm/VirtualMachineManager.java +++ b/server/src/com/cloud/vm/VirtualMachineManager.java @@ -29,6 +29,7 @@ import com.cloud.exception.InsufficientServerCapacityException; import com.cloud.exception.ManagementServerException; import com.cloud.exception.OperationTimedoutException; import com.cloud.exception.ResourceUnavailableException; +import com.cloud.exception.VirtualMachineMigrationException; import com.cloud.hypervisor.Hypervisor.HypervisorType; import com.cloud.network.NetworkVO; import com.cloud.service.ServiceOfferingVO; @@ -93,9 +94,9 @@ public interface VirtualMachineManager extends Manager { boolean destroy(T vm, User caller, Account account) throws AgentUnavailableException, OperationTimedoutException, ConcurrentOperationException; - boolean migrateAway(VirtualMachine.Type type, long vmid, long hostId) throws InsufficientServerCapacityException; + boolean migrateAway(VirtualMachine.Type type, long vmid, long hostId) throws InsufficientServerCapacityException, VirtualMachineMigrationException; - T migrate(T vm, long srcHostId, DeployDestination dest) throws ResourceUnavailableException, ConcurrentOperationException, ManagementServerException; + T migrate(T vm, long srcHostId, DeployDestination dest) throws ResourceUnavailableException, ConcurrentOperationException, ManagementServerException, VirtualMachineMigrationException; T reboot(T vm, Map params, User caller, Account account) throws InsufficientCapacityException, ResourceUnavailableException; diff --git a/server/src/com/cloud/vm/VirtualMachineManagerImpl.java b/server/src/com/cloud/vm/VirtualMachineManagerImpl.java index 489e4b13ccb..e8e6045b483 100755 --- a/server/src/com/cloud/vm/VirtualMachineManagerImpl.java +++ b/server/src/com/cloud/vm/VirtualMachineManagerImpl.java @@ -85,6 +85,7 @@ import com.cloud.exception.InsufficientServerCapacityException; import com.cloud.exception.ManagementServerException; import com.cloud.exception.OperationTimedoutException; import com.cloud.exception.ResourceUnavailableException; +import com.cloud.exception.VirtualMachineMigrationException; import com.cloud.ha.HighAvailabilityManager; import com.cloud.ha.HighAvailabilityManager.WorkType; import com.cloud.host.Host; @@ -959,36 +960,37 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, Listene } @Override - public T migrate(T vm, long srcHostId, DeployDestination dest) throws ResourceUnavailableException, ConcurrentOperationException, ManagementServerException { + public T migrate(T vm, long srcHostId, DeployDestination dest) throws ResourceUnavailableException, ConcurrentOperationException, ManagementServerException, VirtualMachineMigrationException { s_logger.info("Migrating " + vm + " to " + dest); long dstHostId = dest.getHost().getId(); Host fromHost = _hostDao.findById(srcHostId); if (fromHost == null) { s_logger.info("Unable to find the host to migrate from: " + srcHostId); - throw new ManagementServerException("Unable to find the host to migrate from: " + srcHostId); + throw new CloudRuntimeException("Unable to find the host to migrate from: " + srcHostId); } if(fromHost.getClusterId().longValue() != dest.getCluster().getId()){ s_logger.info("Source and destination host are not in same cluster, unable to migrate to host: " + dest.getHost().getId()); - throw new ManagementServerException("Source and destination host are not in same cluster, unable to migrate to host: " + dest.getHost().getId()); + throw new CloudRuntimeException("Source and destination host are not in same cluster, unable to migrate to host: " + dest.getHost().getId()); } VirtualMachineGuru vmGuru = getVmGuru(vm); - vm = vmGuru.findById(vm.getId()); + long vmId = vm.getId(); + vm = vmGuru.findById(vmId); if (vm == null) { if (s_logger.isDebugEnabled()) { s_logger.debug("Unable to find the vm " + vm); } - throw new ManagementServerException("Unable to find a virtual machine with id " + vm.getId()); + throw new ManagementServerException("Unable to find a virtual machine with id " + vmId); } if(vm.getState() != State.Running){ if (s_logger.isDebugEnabled()) { s_logger.debug("VM is not Running, unable to migrate the vm " + vm); } - throw new ManagementServerException("VM is not Running, unable to migrate the vm " + vm); + throw new VirtualMachineMigrationException("VM is not Running, unable to migrate the vm currently " + vm); } short alertType = AlertManager.ALERT_TYPE_USERVM_MIGRATE; @@ -1058,12 +1060,16 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, Listene try { if (!checkVmOnHost(vm, dstHostId)) { s_logger.error("Unable to complete migration for " + vm); - _agentMgr.send(srcHostId, new Commands(cleanup(vm.getInstanceName())), null); + try{ + _agentMgr.send(srcHostId, new Commands(cleanup(vm.getInstanceName())), null); + }catch (AgentUnavailableException e) { + s_logger.error("AgentUnavailableException while cleanup on source host: " + srcHostId); + } cleanup(vmGuru, new VirtualMachineProfileImpl(vm), work, Event.AgentReportStopped, true, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount()); return null; } } catch (OperationTimedoutException e) { - } + } migrated = true; return vm; @@ -1072,8 +1078,11 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, Listene s_logger.info("Migration was unsuccessful. Cleaning up: " + vm); _alertMgr.sendAlert(alertType, fromHost.getDataCenterId(), fromHost.getPodId(), "Unable to migrate vm " + vm.getName() + " from host " + fromHost.getName() + " in zone " + dest.getDataCenter().getName() + " and pod " + dest.getPod().getName(), "Migrate Command failed. Please check logs."); - - _agentMgr.send(dstHostId, new Commands(cleanup(vm.getInstanceName())), null); + try{ + _agentMgr.send(dstHostId, new Commands(cleanup(vm.getInstanceName())), null); + }catch(AgentUnavailableException ae){ + s_logger.info("Looks like the destination Host is unavailable for cleanup"); + } stateTransitTo(vm, Event.OperationFailed, srcHostId); } @@ -1082,6 +1091,7 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, Listene _workDao.update(work.getId(), work); } } + protected void cancelWorkItems(long nodeId) { GlobalLock scanLock = GlobalLock.getInternLock(this.getClass().getName()); @@ -1119,7 +1129,7 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, Listene } @Override - public boolean migrateAway(VirtualMachine.Type vmType, long vmId, long srcHostId) throws InsufficientServerCapacityException { + public boolean migrateAway(VirtualMachine.Type vmType, long vmId, long srcHostId) throws InsufficientServerCapacityException, VirtualMachineMigrationException { VirtualMachineGuru vmGuru = _vmGurus.get(vmType); VMInstanceVO vm = vmGuru.findById(vmId); if (vm == null) { @@ -1170,6 +1180,14 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, Listene s_logger.debug("Unable to migrate VM due to: " + e.getMessage()); } catch (ManagementServerException e) { s_logger.debug("Unable to migrate VM: " + e.getMessage()); + } catch (VirtualMachineMigrationException e) { + s_logger.debug("Got VirtualMachineMigrationException, Unable to migrate: " + e.getMessage()); + if(vm.getState() == State.Starting){ + s_logger.debug("VM seems to be still Starting, we should retry migration later"); + throw e; + }else{ + s_logger.debug("Unable to migrate VM, VM is not in Running or even Starting state, current state: "+vm.getState().toString()); + } } if (vmInstance != null) { return true; diff --git a/utils/src/com/cloud/utils/SerialVersionUID.java b/utils/src/com/cloud/utils/SerialVersionUID.java index a964ec0277a..fd8abffb5c1 100755 --- a/utils/src/com/cloud/utils/SerialVersionUID.java +++ b/utils/src/com/cloud/utils/SerialVersionUID.java @@ -60,4 +60,5 @@ public interface SerialVersionUID { public static final long ConnectionException = Base | 0x20; public static final long PermissionDeniedException = Base | 0x21; public static final long sshException = Base | 0x22; + public static final long VirtualMachineMigrationException = Base | 0x24; }