From 40dbc549989bbcd114fe6600deda6e29a6016e51 Mon Sep 17 00:00:00 2001 From: Kelven Yang Date: Sun, 16 Jun 2013 19:39:32 -0700 Subject: [PATCH] Hook up VM transitional state handling into VirtualMachineManager host ping and periodic maintenance tasks --- .../src/com/cloud/alert/AlertManager.java | 2 +- .../cloudstack/engine/config/Configs.java | 2 + .../cloud/vm/VirtualMachineManagerImpl.java | 66 +++++++++++++------ 3 files changed, 48 insertions(+), 22 deletions(-) diff --git a/engine/components-api/src/com/cloud/alert/AlertManager.java b/engine/components-api/src/com/cloud/alert/AlertManager.java index b6d005a5f21..7533d4de5e2 100755 --- a/engine/components-api/src/com/cloud/alert/AlertManager.java +++ b/engine/components-api/src/com/cloud/alert/AlertManager.java @@ -48,7 +48,7 @@ public interface AlertManager extends Manager { public static final short ALERT_TYPE_DIRECT_ATTACHED_PUBLIC_IP = 24; public static final short ALERT_TYPE_LOCAL_STORAGE = 25; public static final short ALERT_TYPE_RESOURCE_LIMIT_EXCEEDED = 26; // Generated when the resource limit exceeds the limit. 
Currently used for recurring snapshots only - + public static final short ALERT_TYPE_SYNC = 27; void clearAlert(short alertType, long dataCenterId, long podId); diff --git a/engine/components-api/src/org/apache/cloudstack/engine/config/Configs.java b/engine/components-api/src/org/apache/cloudstack/engine/config/Configs.java index aa5ca70d91b..9dcb86d15c5 100644 --- a/engine/components-api/src/org/apache/cloudstack/engine/config/Configs.java +++ b/engine/components-api/src/org/apache/cloudstack/engine/config/Configs.java @@ -41,4 +41,6 @@ public interface Configs { public static final ConfigKey VmDestroyForcestop = new ConfigKey( Boolean.class, "vm.destroy.forcestop", "Advanced", OrchestrationService.class, "false", "On destroy, force-stop takes this value ", null); + public static final ConfigKey PingInterval = new ConfigKey( + Long.class, "ping.interval", "Advanced", OrchestrationService.class, "60", "Ping interval in seconds", null); } diff --git a/engine/orchestration/src/com/cloud/vm/VirtualMachineManagerImpl.java b/engine/orchestration/src/com/cloud/vm/VirtualMachineManagerImpl.java index 2d82814119d..74219bd877f 100755 --- a/engine/orchestration/src/com/cloud/vm/VirtualMachineManagerImpl.java +++ b/engine/orchestration/src/com/cloud/vm/VirtualMachineManagerImpl.java @@ -170,6 +170,8 @@ import com.cloud.vm.snapshot.VMSnapshotManager; public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMachineManager, Listener { private static final Logger s_logger = Logger.getLogger(VirtualMachineManagerImpl.class); + private static final String VM_SYNC_ALERT_SUBJECT = "VM state sync alert"; + @Inject protected EntityManager _entityMgr; @Inject @@ -257,6 +259,7 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac protected ConfigValue _lockStateRetry; protected ConfigValue _operationTimeout; protected ConfigValue _forceStop; + protected ConfigValue _pingInterval; protected long _nodeId; SearchBuilder RootVolumeSearch; @@ 
-425,7 +428,8 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac @Override public boolean start() { - _executor.scheduleAtFixedRate(new CleanupTask(), _cleanupInterval.value(), _cleanupInterval.value(), TimeUnit.SECONDS); + _executor.scheduleAtFixedRate(new TransitionTask(), _pingInterval.value(), _pingInterval.value(), TimeUnit.SECONDS); + _executor.scheduleAtFixedRate(new CleanupTask(), _pingInterval.value()*2, _pingInterval.value()*2, TimeUnit.SECONDS); cancelWorkItems(_nodeId); return true; @@ -447,6 +451,7 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac _lockStateRetry = _configRepo.get(Configs.VmOpLockStateRetry); _operationTimeout = _configRepo.get(Configs.Wait).setMultiplier(2); _forceStop = _configRepo.get(Configs.VmDestroyForcestop); + _pingInterval = _configRepo.get(Configs.PingInterval).setMultiplier(1000); ReservationContextImpl.setComponents(_entityMgr); VirtualMachineProfileImpl.setComponents(_entityMgr); @@ -2616,22 +2621,10 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac PingRoutingCommand ping = (PingRoutingCommand) cmd; if (ping.getNewStates() != null && ping.getNewStates().size() > 0) { _syncMgr.processHostVmStatePingReport(agentId, ping.getNewStates()); - -/* TODO - Commands commands = deltaHostSync(agentId, ping.getNewStates()); - if (commands.size() > 0) { - try { - _agentMgr.send(agentId, commands, this); - } catch (final AgentUnavailableException e) { - s_logger.warn("Agent is now unavailable", e); - } - } -*/ - } - // take the chance to scan stalled VM - scanStalledVMInTransitionState(agentId); + // take the chance to scan VMs that are stuck in transitional states and are missing from the report + scanStalledVMInTransitionStateOnUpHost(agentId); processed = true; } } @@ -2740,7 +2733,9 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac s_logger.debug("Couldn't lock the db"); return; } + try { +/* 
lock.addRef(); List instances = _vmDao.findVMInTransition(new Date(new Date().getTime() - (_operationTimeout.value() * 1000)), State.Starting, State.Stopping); for (VMInstanceVO instance : instances) { @@ -2751,6 +2746,9 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac _haMgr.scheduleRestart(instance, true); } } +*/ + scanStalledVMInTransitionStateOnDisconnectedHosts(); + } catch (Exception e) { s_logger.warn("Caught the following exception on transition checking", e); } finally { @@ -3452,7 +3450,10 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac } catch(NoTransitionException e) { s_logger.warn("Unexpected VM state transition exception, race-condition?", e); } - // TODO we need to alert admin or user about this risky state transition + + // we need to alert admin or user about this risky state transition + _alertMgr.sendAlert(AlertManager.ALERT_TYPE_SYNC, vm.getDataCenterId(), vm.getPodIdToDeployIn(), + VM_SYNC_ALERT_SUBJECT, "VM " + vm.getHostName() + "(" + vm.getInstanceName() + ") state is sync-ed (Starting -> Running) from out-of-context transition. VM network environment may need to be reset"); break; case Running : @@ -3472,8 +3473,9 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac } catch(NoTransitionException e) { s_logger.warn("Unexpected VM state transition exception, race-condition?", e); } - // TODO we need to alert admin or user about this risky state transition - break; + _alertMgr.sendAlert(AlertManager.ALERT_TYPE_SYNC, vm.getDataCenterId(), vm.getPodIdToDeployIn(), + VM_SYNC_ALERT_SUBJECT, "VM " + vm.getHostName() + "(" + vm.getInstanceName() + ") state is sync-ed (" + vm.getState() + " -> Running) from out-of-context transition. 
VM network environment may need to be reset"); + break; case Destroyed : case Expunging : @@ -3529,7 +3531,7 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac } } - private void scanStalledVMInTransitionState(long hostId) { + private void scanStalledVMInTransitionStateOnUpHost(long hostId) { // // Check VM that is stuck in Starting, Stopping, Migrating states, we won't check // VMs in expunging state (this need to be handled specially) @@ -3540,10 +3542,32 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac // // When host is UP, soon or later we will get a report from the host about the VM, // however, if VM is missing from the host report (it may happen in out of band changes - // or from designed behave of XS/KVM) + // or from designed behavior of XS/KVM), the VM may not get a chance to run the state-sync logic // + // Therefore, we will scan those VMs on UP host based on last update timestamp, if the host is UP + // and a VM stalls for status update, we will consider them to be powered off + // (which is relatively safe to do so) - + long stallThresholdInMs = _pingInterval.value() + (_pingInterval.value() >> 1); + Date cutTime = new Date(DateUtil.currentGMTTime().getTime() - stallThresholdInMs); + List mostlikelyStoppedVMs = listStalledVMInTransitionStateOnUpHost(hostId, cutTime); + for(Long vmId : mostlikelyStoppedVMs) { + VMInstanceVO vm = _vmDao.findById(vmId); + assert(vm != null); + handlePowerOffReportWithNoPendingJobsOnVM(vm); + } + } + + private void scanStalledVMInTransitionStateOnDisconnectedHosts() { + Date cutTime = new Date(DateUtil.currentGMTTime().getTime() - this._operationTimeout.value()*1000); + List stuckAndUncontrollableVMs = listStalledVMInTransitionStateOnDisconnectedHosts(cutTime); + for(Long vmId : stuckAndUncontrollableVMs) { + VMInstanceVO vm = _vmDao.findById(vmId); + + // We now only alert administrator about this situation + 
_alertMgr.sendAlert(AlertManager.ALERT_TYPE_SYNC, vm.getDataCenterId(), vm.getPodIdToDeployIn(), + VM_SYNC_ALERT_SUBJECT, "VM " + vm.getHostName() + "(" + vm.getInstanceName() + ") is stuck in " + vm.getState() + " state and its host is unreachable for too long"); + } } // TODO, use sql query directly for quick prototype, need to refactor to use joins and search builders