From 13cfa9660dc31226a2e5228465b533f631979571 Mon Sep 17 00:00:00 2001 From: prachi Date: Thu, 15 Sep 2011 18:46:39 -0700 Subject: [PATCH] Bug 11404 - VM was in Running state, had null for a pod_id, basically didnt allow creation of subsequent vm's Reviewed-by: Alex Changes: - When management server starts, it goes through all the pending work items from op_it_work table and schedules HA work for each. It used to mark each item as done. Instead we should keep the item as pending and let it get marked as Done after the HA work is done. - Changes in VirtualMachineMgr::advanceStop() : a) if we find a VM with null hostId, we stop the VM only if it is forced stopped. b) if VM state transition to Stopping fails,for state Starting and Migrating we try to find the pending work item and then do cleanup the VM. In case state is Stopping we can cleanup directly. c) We proceed releasing all resources only if state transitioned to 'Stopping'. - Changes in HA: a) Depend on VirtualMachineMgr::advanceStop() in case host is not found to do VM cleanup - When Vm state between mgmt server and agent syncs from starting -> running, mark any pending work item as done. --- .../cloud/agent/manager/AgentManagerImpl.java | 3 + .../cloud/ha/HighAvailabilityManagerImpl.java | 47 ++++++--- server/src/com/cloud/vm/ItWorkVO.java | 4 + .../cloud/vm/VirtualMachineManagerImpl.java | 99 ++++++++++++++++--- 4 files changed, 124 insertions(+), 29 deletions(-) diff --git a/server/src/com/cloud/agent/manager/AgentManagerImpl.java b/server/src/com/cloud/agent/manager/AgentManagerImpl.java index b552903dcf3..fcd6cc9247d 100755 --- a/server/src/com/cloud/agent/manager/AgentManagerImpl.java +++ b/server/src/com/cloud/agent/manager/AgentManagerImpl.java @@ -2058,6 +2058,9 @@ public class AgentManagerImpl implements AgentManager, HandlerFactory, Manager { boolean doCidrCheck = true; ClusterVO clusterVO = _clusterDao.findById(clusterId); + if(clusterVO == null){ + throw new IllegalArgumentException("Cluster with id " + clusterId + " does not exist, cannot add host with this clusterId"); + } if (clusterVO.getHypervisorType() != scc.getHypervisorType()) { throw new IllegalArgumentException("Can't add host whose hypervisor type is: " + scc.getHypervisorType() + " into cluster: " + clusterId + " whose hypervisor type is: " + clusterVO.getHypervisorType()); diff --git a/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java b/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java index 67c93d9bfb9..9ed6f88ec0c 100644 --- a/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java +++ b/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java @@ -68,10 +68,8 @@ import com.cloud.utils.component.ComponentLocator; import com.cloud.utils.component.Inject; import com.cloud.utils.concurrency.NamedThreadFactory; import com.cloud.utils.exception.CloudRuntimeException; -import com.cloud.utils.fsm.NoTransitionException; import com.cloud.vm.VMInstanceVO; import com.cloud.vm.VirtualMachine; -import com.cloud.vm.VirtualMachine.Event; import com.cloud.vm.VirtualMachine.State; import com.cloud.vm.VirtualMachineManager; import com.cloud.vm.VirtualMachineProfile; @@ -269,9 +267,17 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager, Clu if (hostId == null) { try { s_logger.debug("Found a vm that is scheduled to be restarted but has no host id: " + vm); - _itMgr.stateTransitTo(vm, Event.OperationFailed, null); - } catch (NoTransitionException e) { - } + _itMgr.advanceStop(vm, true, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount()); + } catch (ResourceUnavailableException e) { + assert false : "How do we hit this when force is true?"; + throw new CloudRuntimeException("Caught exception even though it should be handled.", e); + } catch (OperationTimedoutException e) { + assert false : "How do we hit this when force is true?"; + throw new CloudRuntimeException("Caught exception even though it should be handled.", e); + } catch (ConcurrentOperationException e) { + assert false : "How do we hit this when force is true?"; + throw new CloudRuntimeException("Caught exception even though it should be handled.", e); + } return; } @@ -300,12 +306,15 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager, Clu try { _itMgr.advanceStop(vm, true, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount()); } catch (ResourceUnavailableException e) { - // FIXME + assert false : "How do we hit this when force is true?"; + throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } catch (OperationTimedoutException e) { - // FIXME + assert false : "How do we hit this when force is true?"; + throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } catch (ConcurrentOperationException e) { - // FIXME - } + assert false : "How do we hit this when force is true?"; + throw new CloudRuntimeException("Caught exception even though it should be handled.", e); + } } List items = _haDao.findPreviousHA(vm.getId()); @@ -454,7 +463,19 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager, Clu work.setStep(Step.Scheduled); _haDao.update(work.getId(), work); } else { - assert false : "How come that HA step is Investigating and the host is removed?"; + s_logger.debug("How come that HA step is Investigating and the host is removed? Calling forced Stop on Vm anyways"); + try { + _itMgr.advanceStop(vm, true, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount()); + } catch (ResourceUnavailableException e) { + assert false : "How do we hit this when force is true?"; + throw new CloudRuntimeException("Caught exception even though it should be handled.", e); + } catch (OperationTimedoutException e) { + assert false : "How do we hit this when force is true?"; + throw new CloudRuntimeException("Caught exception even though it should be handled.", e); + } catch (ConcurrentOperationException e) { + assert false : "How do we hit this when force is true?"; + throw new CloudRuntimeException("Caught exception even though it should be handled.", e); + } } } @@ -594,12 +615,6 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager, Clu s_logger.info("Stopping " + vm); try { if (work.getWorkType() == WorkType.Stop) { - if (vm.getHostId() == null) { - if (s_logger.isDebugEnabled()) { - s_logger.debug(vm.toString() + " has already been stopped"); - } - return null; - } if (_itMgr.advanceStop(vm, false, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount())) { s_logger.info("Successfully stopped " + vm); return null; diff --git a/server/src/com/cloud/vm/ItWorkVO.java b/server/src/com/cloud/vm/ItWorkVO.java index 14ce83fb42f..d1f9b66115d 100644 --- a/server/src/com/cloud/vm/ItWorkVO.java +++ b/server/src/com/cloud/vm/ItWorkVO.java @@ -132,6 +132,10 @@ public class ItWorkVO { return managementServerId; } + public void setManagementServerId(long managementServerId) { + this.managementServerId = managementServerId; + } + public State getType() { return type; } diff --git a/server/src/com/cloud/vm/VirtualMachineManagerImpl.java b/server/src/com/cloud/vm/VirtualMachineManagerImpl.java index a3f27d4c0b9..9640758ffb0 100755 --- a/server/src/com/cloud/vm/VirtualMachineManagerImpl.java +++ b/server/src/com/cloud/vm/VirtualMachineManagerImpl.java @@ -924,14 +924,34 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, Listene } return true; } - + //grab outstanding work item if any + ItWorkVO work = _workDao.findByOutstandingWork(vm.getId(), vm.getState()); + if(work != null){ + if (s_logger.isDebugEnabled()) { + s_logger.debug("Found an outstanding work item for this vm "+ vm + " with state:"+vm.getState()+", work id:"+work.getId()); + } + } Long hostId = vm.getHostId(); if (hostId == null) { + if(!forced){ + if (s_logger.isDebugEnabled()) { + s_logger.debug("HostId is null but this is not a forced stop, cannot stop vm "+ vm + " with state:"+vm.getState()); + } + return false; + } try { stateTransitTo(vm, Event.AgentReportStopped, null, null); } catch (NoTransitionException e) { s_logger.warn(e.getMessage()); } + //mark outstanding work item if any as done + if (work != null) { + if (s_logger.isDebugEnabled()) { + s_logger.debug("Updating work item to Done, id:"+work.getId()); + } + work.setStep(Step.Done); + _workDao.update(work.getId(), work); + } return true; } @@ -946,22 +966,46 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, Listene if (!forced) { throw new CloudRuntimeException("We cannot stop " + vm + " when it is in state " + vm.getState()); } - s_logger.debug("Unable to transition the state but we're moving on because it's forced stop"); - if (state == State.Starting || state == State.Stopping || state == State.Migrating) { - ItWorkVO work = _workDao.findByOutstandingWork(vm.getId(), vm.getState()); + boolean doCleanup = false; + if (s_logger.isDebugEnabled()) { + s_logger.debug("Unable to transition the state but we're moving on because it's forced stop"); + } + if (state == State.Starting || state == State.Migrating) { if (work != null) { - if (cleanup(vmGuru, new VirtualMachineProfileImpl(vm), work, Event.StopRequested, forced, user, account)) { - try { - return stateTransitTo(vm, Event.AgentReportStopped, null); - } catch (NoTransitionException e) { - s_logger.warn("Unable to cleanup " + vm); - return false; - } + doCleanup = true; + }else{ + if (s_logger.isDebugEnabled()) { + s_logger.debug("Unable to cleanup VM: "+ vm+" ,since outstanding work item is not found"); } + throw new CloudRuntimeException("Work item not found, We cannot stop " + vm + " when it is in state " + vm.getState()); + } + }else if(state == State.Stopping){ + doCleanup = true; + } + + if(doCleanup){ + if (cleanup(vmGuru, new VirtualMachineProfileImpl(vm), work, Event.StopRequested, forced, user, account)) { + try { + if (s_logger.isDebugEnabled()) { + s_logger.debug("Updating work item to Done, id:"+work.getId()); + } + return changeState(vm, Event.AgentReportStopped, null, work, Step.Done); + } catch (NoTransitionException e) { + s_logger.warn("Unable to cleanup " + vm); + return false; + } + }else{ + if (s_logger.isDebugEnabled()) { + s_logger.debug("Failed to cleanup VM: "+ vm); + } + throw new CloudRuntimeException("Failed to cleanup " + vm + " , current state " + vm.getState()); } } } + if(vm.getState() != State.Stopping){ + throw new CloudRuntimeException("We cannot proceed with stop VM " + vm + " since it is not in 'Stopping' state, current state: " + vm.getState()); + } String routerPrivateIp = null; if (vm.getType() == VirtualMachine.Type.DomainRouter) { routerPrivateIp = vm.getPrivateIpAddress(); @@ -1017,6 +1061,14 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, Listene } try { + if (work != null) { + if (s_logger.isDebugEnabled()) { + s_logger.debug("Updating the outstanding work item to Done, id:"+work.getId()); + } + work.setStep(Step.Done); + _workDao.update(work.getId(), work); + } + return stateTransitTo(vm, Event.OperationSucceeded, null, null); } catch (NoTransitionException e) { s_logger.warn(e.getMessage()); @@ -1262,14 +1314,18 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, Listene if (vm != null) { if (work.getType() == State.Starting) { _haMgr.scheduleRestart(vm, true); + work.setManagementServerId(_nodeId); + _workDao.update(work.getId(), work); } else if (work.getType() == State.Stopping) { _haMgr.scheduleStop(vm, vm.getHostId(), WorkType.CheckStop); + work.setManagementServerId(_nodeId); + _workDao.update(work.getId(), work); } else if (work.getType() == State.Migrating) { _haMgr.scheduleMigration(vm); + work.setStep(Step.Done); + _workDao.update(work.getId(), work); } } - work.setStep(Step.Done); - _workDao.update(work.getId(), work); } catch (Exception e) { s_logger.error("Error while handling " + work, e); } @@ -1693,6 +1749,15 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, Listene s_logger.debug("VM state is starting on full sync so updating it to running"); vm = findById(vm.getType(), vm.getId()); + + //grab outstanding work item if any + ItWorkVO work = _workDao.findByOutstandingWork(vm.getId(), vm.getState()); + if(work != null){ + if (s_logger.isDebugEnabled()) { + s_logger.debug("Found an outstanding work item for this vm "+ vm + " in state:"+vm.getState()+", work id:"+work.getId()); + } + } + try { stateTransitTo(vm, cause, hostId); } catch (NoTransitionException e1) { @@ -1726,6 +1791,14 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, Listene } else { s_logger.error("Unable to finalize commands on start for vm: " + vm); } + + if (work != null) { + if (s_logger.isDebugEnabled()) { + s_logger.debug("Updating outstanding work item to Done, id:"+work.getId()); + } + work.setStep(Step.Done); + _workDao.update(work.getId(), work); + } } public Commands fullSync(final long hostId, StartupRoutingCommand startup) {