From bee6953e098cf90198e4418bf5e3052917fe9275 Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Mon, 7 Feb 2011 09:06:36 -0800 Subject: [PATCH] ha done? needs testing --- api/src/com/cloud/vm/VirtualMachine.java | 2 +- server/src/com/cloud/ha/HaWorkVO.java | 9 +- .../com/cloud/ha/HighAvailabilityManager.java | 27 ++- .../cloud/ha/HighAvailabilityManagerImpl.java | 137 ++++++++------- .../com/cloud/ha/dao/HighAvailabilityDao.java | 4 +- .../cloud/ha/dao/HighAvailabilityDaoImpl.java | 59 ++++--- .../cloud/vm/VirtualMachineManagerImpl.java | 160 ++++++++---------- 7 files changed, 213 insertions(+), 185 deletions(-) diff --git a/api/src/com/cloud/vm/VirtualMachine.java b/api/src/com/cloud/vm/VirtualMachine.java index 06d0ea9b986..3761a9f95d6 100755 --- a/api/src/com/cloud/vm/VirtualMachine.java +++ b/api/src/com/cloud/vm/VirtualMachine.java @@ -92,7 +92,7 @@ public interface VirtualMachine extends RunningOn, ControlledEntity, StateObject s_fsm.addTransition(State.Migrating, VirtualMachine.Event.MigrationFailedOnDest, State.Running); s_fsm.addTransition(State.Migrating, VirtualMachine.Event.AgentReportRunning, State.Running); s_fsm.addTransition(State.Migrating, VirtualMachine.Event.AgentReportStopped, State.Stopped); - s_fsm.addTransition(State.Migrating, VirtualMachine.Event.AgentReportShutdowned, State.Stopped); + s_fsm.addTransition(State.Migrating, VirtualMachine.Event.AgentReportShutdowned, State.Stopped); s_fsm.addTransition(State.Stopping, VirtualMachine.Event.OperationSucceeded, State.Stopped); s_fsm.addTransition(State.Stopping, VirtualMachine.Event.OperationFailed, State.Running); s_fsm.addTransition(State.Stopping, VirtualMachine.Event.AgentReportRunning, State.Running); diff --git a/server/src/com/cloud/ha/HaWorkVO.java b/server/src/com/cloud/ha/HaWorkVO.java index 92a3aa1833f..b8f32f09445 100644 --- a/server/src/com/cloud/ha/HaWorkVO.java +++ b/server/src/com/cloud/ha/HaWorkVO.java @@ -31,6 +31,7 @@ import javax.persistence.Temporal; import javax.persistence.TemporalType; import com.cloud.ha.HighAvailabilityManager.Step; +import com.cloud.ha.HighAvailabilityManager.WorkType; import com.cloud.utils.db.GenericDao; import com.cloud.vm.VirtualMachine; import com.cloud.vm.VirtualMachine.State; @@ -38,14 +39,6 @@ import com.cloud.vm.VirtualMachine.State; @Entity @Table(name="op_ha_work") public class HaWorkVO { - public enum WorkType { - Migration, - Stop, - CheckStop, - Destroy, - HA; - } - @Id @GeneratedValue(strategy=GenerationType.IDENTITY) @Column(name="id") diff --git a/server/src/com/cloud/ha/HighAvailabilityManager.java b/server/src/com/cloud/ha/HighAvailabilityManager.java index 6e336c8aa66..fa9e1496d03 100644 --- a/server/src/com/cloud/ha/HighAvailabilityManager.java +++ b/server/src/com/cloud/ha/HighAvailabilityManager.java @@ -28,6 +28,14 @@ import com.cloud.vm.VMInstanceVO; * HighAvailabilityManager checks to make sure the VMs are running fine. */ public interface HighAvailabilityManager extends Manager { + public enum WorkType { + Migration, // Migrating VMs off of a host. + Stop, // Stops a VM for storage pool migration purposes. This should be obsolete now. + CheckStop, // Checks if a VM has been stopped. + ForceStop, // Force a VM to stop even if the states don't allow it. Use this only if you know the VM is stopped on the physical hypervisor. + Destroy, // Destroy a VM. + HA; // Restart a VM. + } enum Step { Scheduled, @@ -35,9 +43,7 @@ public interface HighAvailabilityManager extends Manager { Fencing, Stopping, Restarting, - Preparing, Migrating, - Checking, Cancelled, Done, Error, @@ -63,7 +69,7 @@ public interface HighAvailabilityManager extends Manager { * @param vm the vm that has gone away. * @param investigate must be investigated before we do anything with this vm. */ - void scheduleRestart(final VMInstanceVO vm, boolean investigate); + void scheduleRestart(VMInstanceVO vm, boolean investigate); void cancelDestroy(VMInstanceVO vm, Long hostId); @@ -73,7 +79,7 @@ public interface HighAvailabilityManager extends Manager { * Schedule restarts for all vms running on the host. * @param host host. */ - void scheduleRestartForVmsOnHost(final HostVO host); + void scheduleRestartForVmsOnHost(HostVO host); /** * Schedule the vm for migration. @@ -81,18 +87,23 @@ public interface HighAvailabilityManager extends Manager { * @param vm * @return true if schedule worked. */ - boolean scheduleMigration(final VMInstanceVO vm); + boolean scheduleMigration(VMInstanceVO vm); List findTakenMigrationWork(); /** - * Stops a VM. + * Schedules a work item to stop a VM. This method schedules a work + * item to do one of three things. + * + * 1. Perform a regular stop of a VM: WorkType.Stop + * 2. Perform a force stop of a VM: WorkType.ForceStop + * 3. Check if a VM has been stopped: WorkType.CheckStop * * @param vm virtual machine to stop. * @param host host the virtual machine is on. - * @param verifyHost make sure it is the same host as the schedule time. + * @param type which type of stop is requested. */ - void scheduleStop(final VMInstanceVO vm, long hostId, boolean verifyHost); + void scheduleStop(VMInstanceVO vm, long hostId, WorkType type); void cancelScheduledMigrations(HostVO host); } diff --git a/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java b/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java index 6ffb17aa5f8..b7f4c2ac8f2 100644 --- a/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java +++ b/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java @@ -33,9 +33,9 @@ import javax.naming.ConfigurationException; import org.apache.log4j.Logger; import com.cloud.agent.AgentManager; -import com.cloud.agent.api.Answer; -import com.cloud.agent.api.Command; import com.cloud.alert.AlertManager; +import com.cloud.cluster.ClusterManagerListener; +import com.cloud.cluster.ManagementServerHostVO; import com.cloud.configuration.dao.ConfigurationDao; import com.cloud.dc.ClusterDetailsDao; import com.cloud.dc.DataCenterVO; @@ -48,7 +48,6 @@ import com.cloud.exception.InsufficientCapacityException; import com.cloud.exception.InsufficientServerCapacityException; import com.cloud.exception.OperationTimedoutException; import com.cloud.exception.ResourceUnavailableException; -import com.cloud.ha.HaWorkVO.WorkType; import com.cloud.ha.dao.HighAvailabilityDao; import com.cloud.host.Host; import com.cloud.host.HostVO; @@ -65,6 +64,7 @@ import com.cloud.utils.component.Adapters; import com.cloud.utils.component.ComponentLocator; import com.cloud.utils.component.Inject; import com.cloud.utils.concurrency.NamedThreadFactory; +import com.cloud.utils.exception.CloudRuntimeException; import com.cloud.vm.VMInstanceVO; import com.cloud.vm.VirtualMachine; import com.cloud.vm.VirtualMachine.Event; @@ -110,7 +110,7 @@ import com.cloud.vm.dao.VMInstanceDao; * } **/ @Local(value={HighAvailabilityManager.class}) -public class HighAvailabilityManagerImpl implements HighAvailabilityManager { +public class HighAvailabilityManagerImpl implements HighAvailabilityManager, ClusterManagerListener { protected static final Logger s_logger = Logger.getLogger(HighAvailabilityManagerImpl.class); String _name; WorkerThread[] _workers; @@ -222,17 +222,18 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager { } @Override - public void scheduleStop(final VMInstanceVO vm, long hostId, boolean verifyHost) { + public void scheduleStop(VMInstanceVO vm, long hostId, WorkType type) { + assert (type == WorkType.CheckStop || type == WorkType.ForceStop || type == WorkType.Stop); - if (_haDao.hasBeenScheduled(vm.getId(), verifyHost ? WorkType.CheckStop : WorkType.Stop)) { - s_logger.info("There's already a job scheduled to stop " + vm.toString()); + if (_haDao.hasBeenScheduled(vm.getId(), type)) { + s_logger.info("There's already a job scheduled to stop " + vm); return; } - final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), verifyHost ? WorkType.CheckStop : WorkType.Stop, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated()); + HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), type, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated()); _haDao.persist(work); if (s_logger.isDebugEnabled()) { - s_logger.debug("Scheduled " + work.toString() + " verifyHost = " + verifyHost); + s_logger.debug("Scheduled " + work); } wakeupWorkers(); } @@ -323,14 +324,13 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager { return null; } - s_logger.info("HA on " + vm.toString()); + s_logger.info("HA on " + vm); if (vm.getState() != work.getPreviousState() || vm.getUpdated() != work.getUpdateTime()) { - s_logger.info("VM " + vm.toString() + " has been changed. Current State = " + vm.getState() + " Previous State = " + work.getPreviousState() + " last updated = " + vm.getUpdated() + " previous updated = " + work.getUpdateTime()); + s_logger.info("VM " + vm + " has been changed. Current State = " + vm.getState() + " Previous State = " + work.getPreviousState() + " last updated = " + vm.getUpdated() + " previous updated = " + work.getUpdateTime()); return null; } - final HostVO host = _hostDao.findById(work.getHostId()); - boolean nativeHA = _agentMgr.isHostNativeHAEnabled(work.getHostId()); + HostVO host = _hostDao.findById(work.getHostId()); DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId()); HostPodVO podVO = _podDao.findById(host.getPodId()); @@ -347,9 +347,6 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager { if (work.getStep() == Step.Investigating) { if (vm.getHostId() == null || vm.getHostId() != work.getHostId()) { s_logger.info("VM " + vm.toString() + " is now no longer on host " + work.getHostId()); - if (vm.getState() == State.Starting && vm.getUpdated() == work.getUpdateTime()) { - _itMgr.stateTransitTo(vm, Event.AgentReportStopped, null); - } return null; } @@ -366,7 +363,7 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager { if (alive != null && alive) { s_logger.debug("VM " + vm.getName() + " is found to be alive by " + investigator.getName()); if (host.getStatus() == Status.Up) { -//FIXME compareState(vm, new AgentVmInfo(vm.getInstanceName(), null, State.Running), false, nativeHA); + s_logger.info(vm + " is alive and host is up. No need to restart it."); return null; } else { s_logger.debug("Rescheduling because the host is not up but the vm is alive"); @@ -397,11 +394,14 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager { try { _itMgr.advanceStop(vm, true, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount()); } catch (ResourceUnavailableException e) { - // FIXME + assert false : "How do we hit this when force is true?"; + throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } catch (OperationTimedoutException e) { - // FIXME + assert false : "How do we hit this when force is true?"; + throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } catch (ConcurrentOperationException e) { - // FIXME + assert false : "How do we hit this when force is true?"; + throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } work.setStep(Step.Scheduled); @@ -462,7 +462,8 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager { _alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to restart " + vm.getName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getName() + ", id: " + vmId + " which was running on host " + hostDesc); return null; } catch (OperationTimedoutException e) { - // FIXME + s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); + _alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to restart " + vm.getName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getName() + ", id: " + vmId + " which was running on host " + hostDesc); return null; } } @@ -470,8 +471,12 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager { public Long migrate(final HaWorkVO work) { long vmId = work.getInstanceId(); + long srcHostId = work.getHostId(); try { + work.setStep(Step.Migrating); + _haDao.update(work.getId(), work); + if (!_itMgr.migrateAway(work.getType(), vmId, srcHostId)) { s_logger.warn("Unable to migrate vm from " + srcHostId); _agentMgr.maintenanceFailed(srcHostId); @@ -532,40 +537,44 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager { return (System.currentTimeMillis() >> 10) + _stopRetryInterval; } - protected Long stopVM(final HaWorkVO work) { - final VMInstanceVO vm = _itMgr.findById(work.getType(), work.getInstanceId()); - s_logger.info("Stopping " + vm.toString()); + protected Long stopVM(final HaWorkVO work) throws ConcurrentOperationException { + VMInstanceVO vm = _itMgr.findById(work.getType(), work.getInstanceId()); + if (vm == null) { + s_logger.info("No longer can find VM " + work.getInstanceId() + ". Throwing away " + work); + work.setStep(Step.Done); + return null; + } + s_logger.info("Stopping " + vm); try { if (work.getWorkType() == WorkType.Stop) { - if (vm.getHostId() != null) { -// FIXME if (_itMgr.advanceStop(vm, false, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount())) { -// s_logger.info("Successfully stopped " + vm.toString()); -// return null; -// } - } else { - if (s_logger.isDebugEnabled()) { - s_logger.debug(vm.toString() + " has already been stopped"); - } - return null; + if (vm.getHostId() == null) { + if (s_logger.isDebugEnabled()) { + s_logger.debug(vm.toString() + " has already been stopped"); + } + return null; } + if (_itMgr.advanceStop(vm, false, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount())) { + s_logger.info("Successfully stopped " + vm); + return null; + } } else if (work.getWorkType() == WorkType.CheckStop) { - if ((vm.getState() != State.Stopping) || vm.getHostId() == null || vm.getHostId().longValue() != work.getHostId()) { - if (s_logger.isDebugEnabled()) { - s_logger.debug(vm.toString() + " is different now. Scheduled Host: " + work.getHostId() + " Current Host: " + (vm.getHostId() != null ? vm.getHostId() : "none") + " State: " + vm.getState()); - } + if ((vm.getState() != work.getPreviousState()) || vm.getUpdated() != work.getUpdateTime() || vm.getHostId() == null || vm.getHostId().longValue() != work.getHostId()) { + s_logger.info(vm + " is different now. Scheduled Host: " + work.getHostId() + " Current Host: " + (vm.getHostId() != null ? vm.getHostId() : "none") + " State: " + vm.getState()); return null; - } else { - Command cmd = null; -//FIXME Command cmd = _itMgr.cleanup(vm.getInstanceName()); - Answer ans = _agentMgr.send(work.getHostId(), cmd); - if (ans.getResult()) { - _itMgr.stateTransitTo(vm, Event.AgentReportStopped, null); -//FIXME mgr.finalizeStop(new VirtualMachineProfileImpl(vm), (StopAnswer)ans); - s_logger.info("Successfully stopped " + vm.toString()); - return null; - } - s_logger.debug("Stop for " + vm.toString() + " was unsuccessful. Detail: " + ans.getDetails()); } + if (_itMgr.advanceStop(vm, false, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount())) { + s_logger.info("Stop for " + vm + " was successful"); + return null; + } + } else if (work.getWorkType() == WorkType.ForceStop){ + if ((vm.getState() != work.getPreviousState()) || vm.getUpdated() != work.getUpdateTime() || vm.getHostId() == null || vm.getHostId().longValue() != work.getHostId()) { + s_logger.info(vm + " is different now. Scheduled Host: " + work.getHostId() + " Current Host: " + (vm.getHostId() != null ? vm.getHostId() : "none") + " State: " + vm.getState()); + return null; + } + if (_itMgr.advanceStop(vm, true, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount())) { + s_logger.info("Stop for " + vm + " was successful"); + return null; + } } else { assert false : "Who decided there's other steps but didn't modify the guy who does the work?"; } @@ -576,6 +585,9 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager { } work.setTimesTried(work.getTimesTried() + 1); + if (s_logger.isDebugEnabled()) { + s_logger.debug("Stop was unsuccessful. Rescheduling"); + } return (System.currentTimeMillis() >> 10) + _stopRetryInterval; } @@ -650,6 +662,8 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager { if (_instance == null) { _instance = "VMOPS"; } + + _haDao.releaseWorkItems(_serverId); _stopped = true; @@ -726,7 +740,7 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager { } } - s_logger.info("Working on " + work.toString()); + s_logger.info("Processing " + work); try { final WorkType wt = work.getWorkType(); @@ -735,7 +749,7 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager { nextTime = migrate(work); } else if (wt == WorkType.HA) { nextTime = restart(work); - } else if (wt == WorkType.Stop || wt == WorkType.CheckStop) { + } else if (wt == WorkType.Stop || wt == WorkType.CheckStop || wt == WorkType.ForceStop) { nextTime = stopVM(work); } else if (wt == WorkType.Destroy) { nextTime = destroyVM(work); @@ -745,20 +759,16 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager { } if (nextTime == null) { - if (s_logger.isDebugEnabled()) { - s_logger.debug(work.toString() + " is complete"); - } + s_logger.info("Completed " + work); work.setStep(Step.Done); } else { - if (s_logger.isDebugEnabled()) { - s_logger.debug("Rescheduling " + work.toString() + " for instance " + work.getInstanceId() + " to try again at " + new Date(nextTime << 10)); - } + s_logger.info("Rescheduling " + work + " to try again at " + new Date(nextTime << 10)); work.setTimeToTry(nextTime); work.setServerId(null); work.setDateTaken(null); } } catch (Exception e) { - s_logger.error("Caught this exception while processing the work queue.", e); + s_logger.error("Terminating " + work, e); work.setStep(Step.Error); } _haDao.update(work.getId(), work); @@ -775,5 +785,16 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager { notifyAll(); } } + + @Override + public void onManagementNodeJoined(List nodeList, long selfNodeId) { + } + + @Override + public void onManagementNodeLeft(List nodeList, long selfNodeId) { + for (ManagementServerHostVO node : nodeList) { + _haDao.releaseWorkItems(node.getMsid()); + } + } } diff --git a/server/src/com/cloud/ha/dao/HighAvailabilityDao.java b/server/src/com/cloud/ha/dao/HighAvailabilityDao.java index c7b96609dc9..f01f7dcf05e 100644 --- a/server/src/com/cloud/ha/dao/HighAvailabilityDao.java +++ b/server/src/com/cloud/ha/dao/HighAvailabilityDao.java @@ -20,7 +20,7 @@ package com.cloud.ha.dao; import java.util.List; import com.cloud.ha.HaWorkVO; -import com.cloud.ha.HaWorkVO.WorkType; +import com.cloud.ha.HighAvailabilityManager.WorkType; import com.cloud.utils.db.GenericDao; public interface HighAvailabilityDao extends GenericDao { @@ -63,4 +63,6 @@ public interface HighAvailabilityDao extends GenericDao { * @return true if it has been scheduled and false if it hasn't. */ boolean hasBeenScheduled(long instanceId, WorkType type); + + int releaseWorkItems(long nodeId); } diff --git a/server/src/com/cloud/ha/dao/HighAvailabilityDaoImpl.java b/server/src/com/cloud/ha/dao/HighAvailabilityDaoImpl.java index 597eea3a619..2ebca1e06c0 100644 --- a/server/src/com/cloud/ha/dao/HighAvailabilityDaoImpl.java +++ b/server/src/com/cloud/ha/dao/HighAvailabilityDaoImpl.java @@ -24,14 +24,15 @@ import javax.ejb.Local; import org.apache.log4j.Logger; -import com.cloud.ha.HighAvailabilityManager; import com.cloud.ha.HaWorkVO; +import com.cloud.ha.HighAvailabilityManager; import com.cloud.ha.HighAvailabilityManager.Step; -import com.cloud.ha.HaWorkVO.WorkType; +import com.cloud.ha.HighAvailabilityManager.WorkType; import com.cloud.utils.db.Filter; import com.cloud.utils.db.GenericDaoBase; import com.cloud.utils.db.SearchBuilder; import com.cloud.utils.db.SearchCriteria; +import com.cloud.utils.db.SearchCriteria.Op; import com.cloud.utils.db.Transaction; import com.cloud.utils.exception.CloudRuntimeException; @@ -45,44 +46,51 @@ public class HighAvailabilityDaoImpl extends GenericDaoBase impl private final SearchBuilder CleanupSearch; private final SearchBuilder PreviousWorkSearch; private final SearchBuilder TakenWorkSearch; + private final SearchBuilder ReleaseSearch; protected HighAvailabilityDaoImpl() { super(); CleanupSearch = createSearchBuilder(); - CleanupSearch.and("time", CleanupSearch.entity().getTimeToTry(), SearchCriteria.Op.LTEQ); - CleanupSearch.and("step", CleanupSearch.entity().getStep(), SearchCriteria.Op.IN); + CleanupSearch.and("time", CleanupSearch.entity().getTimeToTry(), Op.LTEQ); + CleanupSearch.and("step", CleanupSearch.entity().getStep(), Op.IN); CleanupSearch.done(); TBASearch = createSearchBuilder(); - TBASearch.and("server", TBASearch.entity().getServerId(), SearchCriteria.Op.NULL); - TBASearch.and("taken", TBASearch.entity().getDateTaken(), SearchCriteria.Op.NULL); - TBASearch.and("time", TBASearch.entity().getTimeToTry(), SearchCriteria.Op.LTEQ); + TBASearch.and("server", TBASearch.entity().getServerId(), Op.NULL); + TBASearch.and("taken", TBASearch.entity().getDateTaken(), Op.NULL); + TBASearch.and("time", TBASearch.entity().getTimeToTry(), Op.LTEQ); TBASearch.done(); PreviousInstanceSearch = createSearchBuilder(); - PreviousInstanceSearch.and("instance", PreviousInstanceSearch.entity().getInstanceId(), SearchCriteria.Op.EQ); + PreviousInstanceSearch.and("instance", PreviousInstanceSearch.entity().getInstanceId(), Op.EQ); PreviousInstanceSearch.done(); UntakenMigrationSearch = createSearchBuilder(); - UntakenMigrationSearch.and("host", UntakenMigrationSearch.entity().getHostId(), SearchCriteria.Op.EQ); - UntakenMigrationSearch.and("type", UntakenMigrationSearch.entity().getWorkType(), SearchCriteria.Op.EQ); - UntakenMigrationSearch.and("server", UntakenMigrationSearch.entity().getServerId(), SearchCriteria.Op.NULL); - UntakenMigrationSearch.and("taken", UntakenMigrationSearch.entity().getDateTaken(), SearchCriteria.Op.NULL); + UntakenMigrationSearch.and("host", UntakenMigrationSearch.entity().getHostId(), Op.EQ); + UntakenMigrationSearch.and("type", UntakenMigrationSearch.entity().getWorkType(), Op.EQ); + UntakenMigrationSearch.and("server", UntakenMigrationSearch.entity().getServerId(), Op.NULL); + UntakenMigrationSearch.and("taken", UntakenMigrationSearch.entity().getDateTaken(), Op.NULL); UntakenMigrationSearch.done(); TakenWorkSearch = createSearchBuilder(); - TakenWorkSearch.and("type", TakenWorkSearch.entity().getWorkType(), SearchCriteria.Op.EQ); - TakenWorkSearch.and("server", TakenWorkSearch.entity().getServerId(), SearchCriteria.Op.NNULL); - TakenWorkSearch.and("taken", TakenWorkSearch.entity().getDateTaken(), SearchCriteria.Op.NNULL); - TakenWorkSearch.and("step", TakenWorkSearch.entity().getStep(), SearchCriteria.Op.NIN); + TakenWorkSearch.and("type", TakenWorkSearch.entity().getWorkType(), Op.EQ); + TakenWorkSearch.and("server", TakenWorkSearch.entity().getServerId(), Op.NNULL); + TakenWorkSearch.and("taken", TakenWorkSearch.entity().getDateTaken(), Op.NNULL); + TakenWorkSearch.and("step", TakenWorkSearch.entity().getStep(), Op.NIN); TakenWorkSearch.done(); PreviousWorkSearch = createSearchBuilder(); - PreviousWorkSearch.and("instance", PreviousWorkSearch.entity().getInstanceId(), SearchCriteria.Op.EQ); - PreviousWorkSearch.and("type", PreviousWorkSearch.entity().getWorkType(), SearchCriteria.Op.EQ); - PreviousWorkSearch.and("taken", PreviousWorkSearch.entity().getDateTaken(), SearchCriteria.Op.NULL); + PreviousWorkSearch.and("instance", PreviousWorkSearch.entity().getInstanceId(), Op.EQ); + PreviousWorkSearch.and("type", PreviousWorkSearch.entity().getWorkType(), Op.EQ); + PreviousWorkSearch.and("taken", PreviousWorkSearch.entity().getDateTaken(), Op.NULL); PreviousWorkSearch.done(); + + ReleaseSearch = createSearchBuilder(); + ReleaseSearch.and("server", ReleaseSearch.entity().getServerId(), Op.EQ); + ReleaseSearch.and("step", ReleaseSearch.entity().getStep(), Op.NIN); + ReleaseSearch.and("taken", ReleaseSearch.entity().getDateTaken(), Op.NNULL); + ReleaseSearch.done(); } @Override @@ -171,4 +179,17 @@ public class HighAvailabilityDaoImpl extends GenericDaoBase impl sc.setParameters("type", type); return listBy(sc, null).size() > 0; } + + @Override + public int releaseWorkItems(long nodeId) { + SearchCriteria sc = ReleaseSearch.create(); + sc.setParameters("server", nodeId); + sc.setParameters("step", Step.Done, Step.Cancelled, Step.Error); + + HaWorkVO vo = createForUpdate(); + vo.setDateTaken(null); + vo.setServerId(null); + + return update(vo, sc); + } } \ No newline at end of file diff --git a/server/src/com/cloud/vm/VirtualMachineManagerImpl.java b/server/src/com/cloud/vm/VirtualMachineManagerImpl.java index 8430fa39754..3f616e4a0f5 100644 --- a/server/src/com/cloud/vm/VirtualMachineManagerImpl.java +++ b/server/src/com/cloud/vm/VirtualMachineManagerImpl.java @@ -86,6 +86,7 @@ import com.cloud.exception.InsufficientServerCapacityException; import com.cloud.exception.OperationTimedoutException; import com.cloud.exception.ResourceUnavailableException; import com.cloud.ha.HighAvailabilityManager; +import com.cloud.ha.HighAvailabilityManager.WorkType; import com.cloud.host.Host; import com.cloud.host.HostVO; import com.cloud.host.Status; @@ -696,7 +697,7 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, StateLi } finally { if (startedVm == null) { _workDao.updateStep(work, Step.Release); - cleanup(vmGuru, vmProfile, work, false, caller, account); + cleanup(vmGuru, vmProfile, work, Event.OperationFailed, false, caller, account); } } } @@ -744,7 +745,7 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, StateLi return true; } - protected boolean cleanup(VirtualMachineGuru guru, VirtualMachineProfile profile, ItWorkVO work, boolean force, User user, Account account) { + protected boolean cleanup(VirtualMachineGuru guru, VirtualMachineProfile profile, ItWorkVO work, Event event, boolean force, User user, Account account) { T vm = profile.getVirtualMachine(); State state = vm.getState(); s_logger.debug("Cleaning up resources for the vm " + vm + " in " + state + " state"); @@ -838,7 +839,7 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, StateLi if ((vm.getState() == State.Starting || vm.getState() == State.Stopping || vm.getState() == State.Migrating) && forced) { ItWorkVO work = _workDao.findByOutstandingWork(vm.getId(), vm.getState()); if (work != null) { - if (cleanup(vmGuru, new VirtualMachineProfileImpl(vm), work, forced, user, account)) { + if (cleanup(vmGuru, new VirtualMachineProfileImpl(vm), work, Event.StopRequested, forced, user, account)) { return stateTransitTo(vm, Event.AgentReportStopped, null); } } @@ -1231,12 +1232,12 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, StateLi } public Commands deltaSync(long hostId, Map newStates) { - Map states = convertToIds(newStates); + Map states = convertToInfos(newStates); Commands commands = new Commands(OnError.Continue); boolean nativeHA = _agentMgr.isHostNativeHAEnabled(hostId); - for (final Map.Entry entry : states.entrySet()) { + for (Map.Entry entry : states.entrySet()) { AgentVmInfo info = entry.getValue(); VMInstanceVO vm = info.vm; @@ -1245,6 +1246,9 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, StateLi if (vm != null) { command = compareState(vm, info, false, nativeHA); } else { + if (s_logger.isDebugEnabled()) { + s_logger.debug("Cleaning up a VM that is no longer found: " + info.name); + } command = cleanup(info.name); } @@ -1256,7 +1260,7 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, StateLi return commands; } - protected Map convertToIds(final Map states) { + protected Map convertToInfos(final Map states) { final HashMap map = new HashMap(); if (states == null) { @@ -1272,13 +1276,13 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, StateLi VMInstanceVO vm = vmGuru.findByName(name); if (vm != null) { - map.put(vm.getId(), new AgentVmInfo(entry.getKey(), vm, entry.getValue())); + map.put(vm.getId(), new AgentVmInfo(entry.getKey(), vmGuru, vm, entry.getValue())); break; } Long id = vmGuru.convertToId(name); if (id != null) { - map.put(id, new AgentVmInfo(entry.getKey(), null, entry.getValue())); + map.put(id, new AgentVmInfo(entry.getKey(), vmGuru, null, entry.getValue())); } } } @@ -1324,61 +1328,59 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, StateLi _alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodId(), "VM (name: " + vm.getName() + ", id: " + vm.getId() + ") stopped on host " + hostDesc + " due to storage failure", "Virtual Machine " + vm.getName() + " (id: " + vm.getId() + ") running on host [" + vm.getHostId() + "] stopped due to storage failure."); } - if (serverState == State.Migrating) { - s_logger.debug("Skipping vm in migrating state: " + vm.toString()); - return null; - } +// if (serverState == State.Migrating) { +// s_logger.debug("Skipping vm in migrating state: " + vm.toString()); +// return null; +// } if (agentState == serverState) { if (s_logger.isDebugEnabled()) { - s_logger.debug("Both states are " + agentState.toString() + " for " + serverName); + s_logger.debug("Both states are " + agentState + " for " + vm); } assert (agentState == State.Stopped || agentState == State.Running) : "If the states we send up is changed, this must be changed."; - stateTransitTo(vm, agentState == State.Stopped ? VirtualMachine.Event.AgentReportStopped : VirtualMachine.Event.AgentReportRunning, vm.getHostId()); - if (agentState == State.Stopped) { - s_logger.debug("State matches but the agent said stopped so let's send a cleanup anyways."); - return cleanup(agentName); + if (agentState == State.Running) { + stateTransitTo(vm, VirtualMachine.Event.AgentReportRunning, vm.getHostId()); + // FIXME: What if someone comes in and sets it to stopping? Then what? + return null; } - return null; - } - if (agentState == State.Shutdowned ) { - if ( serverState == State.Running || serverState == State.Starting || serverState == State.Stopping ) { - stateTransitTo(vm, VirtualMachine.Event.AgentReportShutdowned, null); + s_logger.debug("State matches but the agent said stopped so let's send a cleanup command anyways."); + return cleanup(agentName); + } + + if (agentState == State.Shutdowned) { + if (serverState == State.Running || serverState == State.Starting || serverState == State.Stopping) { + try { + advanceStop(vm, true, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount()); + } catch (AgentUnavailableException e) { + assert(false) : "How do we hit this with forced on?"; + return null; + } catch (OperationTimedoutException e) { + assert(false) : "How do we hit this with forced on?"; + return null; + } catch (ConcurrentOperationException e) { + assert(false) : "How do we hit this with forced on?"; + return null; + } + } else { + s_logger.debug("Sending cleanup to a shutdowned vm: " + agentName); + command = cleanup(agentName); } - s_logger.debug("Sending cleanup to a shutdowned vm: " + agentName); - command = cleanup(agentName); } else if (agentState == State.Stopped) { // This state means the VM on the agent was detected previously // and now is gone. This is slightly different than if the VM // was never completed but we still send down a Stop Command // to ensure there's cleanup. - if (serverState == State.Running ) { - if(!nativeHA) { - // Our records showed that it should be running so let's restart it. - vm = findById(vm.getType(), vm.getId()); - _haMgr.scheduleRestart(vm, false); - command = cleanup(agentName); - } else { - s_logger.info("VM is in runnting state, agent reported as stopped and native HA is enabled => skip sync action"); - stateTransitTo(vm, Event.AgentReportStopped, null); - } + if (serverState == State.Running) { + // Our records showed that it should be running so let's restart it. + _haMgr.scheduleRestart(vm, false); } else if (serverState == State.Stopping) { - if (fullSync) { - s_logger.debug("VM is in stopping state on full sync. Updating the status to stopped"); - vm = findById(vm.getType(), vm.getId()); -// advanceStop(vm, true, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount()); - command = cleanup(agentName); - } else { - s_logger.debug("Ignoring VM in stopping mode: " + vm.getName()); - } + _haMgr.scheduleStop(vm, vm.getHostId(), WorkType.ForceStop); + s_logger.debug("Scheduling a check stop for VM in stopping mode: " + vm); } else if (serverState == State.Starting) { s_logger.debug("Ignoring VM in starting mode: " + vm.getName()); - stateTransitTo(vm, VirtualMachine.Event.AgentReportStopped, null); - } else { - s_logger.debug("Sending cleanup to a stopped vm: " + agentName); - stateTransitTo(vm, VirtualMachine.Event.AgentReportStopped, null); - command = cleanup(agentName); - } + _haMgr.scheduleRestart(vm, false); + } + command = cleanup(agentName); } else if (agentState == State.Running) { if (serverState == State.Starting) { if (fullSync) { @@ -1414,39 +1416,13 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, StateLi } } } else if (serverState == State.Stopping) { - if (fullSync) { - s_logger.debug("VM state is in stopping on fullsync so resend stop."); - vm = findById(vm.getType(), vm.getId()); - stateTransitTo(vm, Event.AgentReportStopped, null); - //finalizeStop(new VirtualMachineProfileImpl(vm), null); - command = cleanup(agentName); - } else { - s_logger.debug("VM is in stopping state so no action."); - } - } else if (serverState == State.Destroyed || serverState == State.Stopped || serverState == State.Expunging) { + s_logger.debug("Scheduling a stop command for " + vm); + _haMgr.scheduleStop(vm, vm.getHostId(), WorkType.Stop); + } else { s_logger.debug("VM state is in stopped so stopping it on the agent"); command = cleanup(agentName); - } else { - stateTransitTo(vm, VirtualMachine.Event.AgentReportRunning, vm.getHostId()); - } - } /*else if (agentState == State.Unknown) { - if (serverState == State.Running) { - if (fullSync) { - vm = info.vmGuru.get(vm.getId()); - } - scheduleRestart(vm, false); - } else if (serverState == State.Starting) { - if (fullSync) { - vm = info.vmGuru.get(vm.getId()); - } - scheduleRestart(vm, false); - } else if (serverState == State.Stopping) { - if (fullSync) { - s_logger.debug("VM state is stopping in full sync. Resending stop"); - command = info.vmGuru.getCleanupCommand(vm, agentName); - } - } - }*/ + } + } return command; } @@ -1455,26 +1431,28 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, StateLi final List vms = _vmDao.listByHostId(hostId); s_logger.debug("Found " + vms.size() + " VMs for host " + hostId); - Map states = convertToIds(newStates); + Map infos = convertToInfos(newStates); boolean nativeHA = _agentMgr.isHostNativeHAEnabled(hostId); - for (final VMInstanceVO vm : vms) { - AgentVmInfo info = states.remove(vm.getId()); + for (VMInstanceVO vm : vms) { + AgentVmInfo info = infos.remove(vm.getId()); + VMInstanceVO castedVm = null; if (info == null) { - info = new AgentVmInfo(vm.getInstanceName(), null, State.Stopped); + info = new AgentVmInfo(vm.getInstanceName(), getVmGuru(vm), vm, State.Stopped); + castedVm = info.guru.findById(vm.getId()); + } else { + castedVm = info.vm; } - VirtualMachineGuru vmGuru = getVmGuru(vm); - VMInstanceVO castedVm = vmGuru.findById(vm.getId()); - final Command command = compareState(castedVm, info, true, nativeHA); + Command command = compareState(castedVm, info, true, nativeHA); if (command != null) { commands.addCommand(command); } } - for (final AgentVmInfo left : states.values()) { + for (final AgentVmInfo left : infos.values()) { if (nativeHA) { for (VirtualMachineGuru vmGuru : _vmGurus.values()) { VMInstanceVO vm = vmGuru.findByName(left.name); @@ -1615,7 +1593,7 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, StateLi for (VMInstanceVO instance : instances) { State state = instance.getState(); if (state == State.Stopping) { - _haMgr.scheduleStop(instance, instance.getHostId(), true); + _haMgr.scheduleStop(instance, instance.getHostId(), WorkType.CheckStop); } else if (state == State.Starting) { _haMgr.scheduleRestart(instance, true); } @@ -1632,13 +1610,15 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, StateLi protected class AgentVmInfo { public String name; public State state; - public State action; public VMInstanceVO vm; + public VirtualMachineGuru guru; - public AgentVmInfo(String name, VMInstanceVO vm, State state) { + @SuppressWarnings("unchecked") + public AgentVmInfo(String name, VirtualMachineGuru guru, VMInstanceVO vm, State state) { this.name = name; this.state = state; this.vm = vm; + this.guru = (VirtualMachineGuru)guru; } } }