From 6137f216b13472ca6c3d84a170d34c77ddfb5da2 Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Sun, 12 Jun 2011 09:18:21 -0700 Subject: [PATCH] bug 10094: The problem was we added code that won't add any more HA work items if it already has one. However, that is wrong. HA Manager stores the existing snapshot of the VM state machine. Before working on HA for a VM, it checks to see if that snapshot has been changed. So by not scheduling HA work, we've effectively made HA not work under multi-failure situations. I've fixed it by removing that code and instead at the time of performing HA, do a quick check to see if there is work underway for the same VM and work scheduled in the future for that VM. If there is work scheduled in the future, then we simply cancel the current work. If there is already work underway, then we retry again in 1 minute. --- .../cloud/ha/HighAvailabilityManagerImpl.java | 44 ++++++++++++------- .../com/cloud/ha/dao/HighAvailabilityDao.java | 17 +++++++ .../cloud/ha/dao/HighAvailabilityDaoImpl.java | 35 +++++++++++++++ 3 files changed, 81 insertions(+), 15 deletions(-) diff --git a/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java b/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java index 8c3ee25908c..24c41200549 100644 --- a/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java +++ b/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java @@ -293,36 +293,50 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager, Clu } } - final List items = _haDao.findPreviousHA(vm.getId()); + List items = _haDao.findPreviousHA(vm.getId()); int maxRetries = 0; - boolean NeedToAddNew = true; - for (final HaWorkVO item : items) { + for (HaWorkVO item : items) { if (maxRetries < item.getTimesTried() && !item.canScheduleNew(_timeBetweenFailures)) { maxRetries = item.getTimesTried(); break; } } - for (final HaWorkVO item : items) { - if (!(item.getStep() == Step.Error || item.getStep() == Step.Done || item.getStep() == 
Step.Cancelled)) { - NeedToAddNew = false; - } - } - if (NeedToAddNew) { - final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.HA, investigate ? Step.Investigating : Step.Scheduled, hostId, vm.getState(), maxRetries + 1, vm.getUpdated()); - _haDao.persist(work); - } + HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.HA, investigate ? Step.Investigating : Step.Scheduled, hostId, vm.getState(), maxRetries + 1, vm.getUpdated()); + _haDao.persist(work); if (s_logger.isInfoEnabled()) { - s_logger.info("Schedule vm for HA: " + vm.toString()); + s_logger.info("Schedule vm for HA: " + vm); } wakeupWorkers(); } - protected Long restart(final HaWorkVO work) { - final long vmId = work.getInstanceId(); + protected Long restart(HaWorkVO work) { + List items = _haDao.listFutureHaWorkForVm(work.getInstanceId(), work.getId()); + if (items.size() > 0) { + StringBuilder str = new StringBuilder("Cancelling this work item because newer ones have been scheduled. Work Ids = ["); + for (HaWorkVO item : items) { + str.append(item.getId()).append(", "); + } + str.delete(str.length() - 2, str.length()).append("]"); + s_logger.info(str.toString()); + return null; + } + + items = _haDao.listRunningHaWorkForVm(work.getInstanceId()); + if (items.size() > 0) { + StringBuilder str = new StringBuilder("Waiting because there's HA work being executed on an item currently. 
Work Ids =["); + for (HaWorkVO item : items) { + str.append(item.getId()).append(", "); + } + str.delete(str.length() - 2, str.length()).append("]"); + s_logger.info(str.toString()); + return (System.currentTimeMillis() >> 10) + _investigateRetryInterval; + } + + long vmId = work.getInstanceId(); VMInstanceVO vm = _itMgr.findById(work.getType(), work.getInstanceId()); if (vm == null) { diff --git a/server/src/com/cloud/ha/dao/HighAvailabilityDao.java b/server/src/com/cloud/ha/dao/HighAvailabilityDao.java index f01f7dcf05e..5754d580bf7 100644 --- a/server/src/com/cloud/ha/dao/HighAvailabilityDao.java +++ b/server/src/com/cloud/ha/dao/HighAvailabilityDao.java @@ -65,4 +65,21 @@ public interface HighAvailabilityDao extends GenericDao { boolean hasBeenScheduled(long instanceId, WorkType type); int releaseWorkItems(long nodeId); + + /** + * Look for HA work that has been scheduled for a vm since a certain work id. + * + * @param vmId virtual machine id. + * @param workId work item id. + * @return List of work items. + */ + List listFutureHaWorkForVm(long vmId, long workId); + + /** + * Look for HA work that is being run right now for a VM. 
+ * + * @param vmId virtual machine id + * @return List of work items + */ + List listRunningHaWorkForVm(long vmId); } diff --git a/server/src/com/cloud/ha/dao/HighAvailabilityDaoImpl.java b/server/src/com/cloud/ha/dao/HighAvailabilityDaoImpl.java index 2ebca1e06c0..1f6777c1fd5 100644 --- a/server/src/com/cloud/ha/dao/HighAvailabilityDaoImpl.java +++ b/server/src/com/cloud/ha/dao/HighAvailabilityDaoImpl.java @@ -47,6 +47,8 @@ public class HighAvailabilityDaoImpl extends GenericDaoBase impl private final SearchBuilder PreviousWorkSearch; private final SearchBuilder TakenWorkSearch; private final SearchBuilder ReleaseSearch; + private final SearchBuilder FutureHaWorkSearch; + private final SearchBuilder RunningHaWorkSearch; protected HighAvailabilityDaoImpl() { super(); @@ -91,6 +93,39 @@ public class HighAvailabilityDaoImpl extends GenericDaoBase impl ReleaseSearch.and("step", ReleaseSearch.entity().getStep(), Op.NIN); ReleaseSearch.and("taken", ReleaseSearch.entity().getDateTaken(), Op.NNULL); ReleaseSearch.done(); + + FutureHaWorkSearch = createSearchBuilder(); + FutureHaWorkSearch.and("instance", FutureHaWorkSearch.entity().getInstanceId(), Op.EQ); + FutureHaWorkSearch.and("type", FutureHaWorkSearch.entity().getType(), Op.EQ); + FutureHaWorkSearch.and("id", FutureHaWorkSearch.entity().getId(), Op.GT); + FutureHaWorkSearch.done(); + + RunningHaWorkSearch = createSearchBuilder(); + RunningHaWorkSearch.and("instance", RunningHaWorkSearch.entity().getInstanceId(), Op.EQ); + RunningHaWorkSearch.and("type", RunningHaWorkSearch.entity().getType(), Op.EQ); + RunningHaWorkSearch.and("taken", RunningHaWorkSearch.entity().getDateTaken(), Op.NNULL); + RunningHaWorkSearch.and("step", RunningHaWorkSearch.entity().getStep(), Op.NIN); + RunningHaWorkSearch.done(); + } + + @Override + public List listRunningHaWorkForVm(long vmId) { + SearchCriteria sc = RunningHaWorkSearch.create(); + sc.setParameters("instance", vmId); + sc.setParameters("type", WorkType.HA); + 
sc.setParameters("step", Step.Done, Step.Error, Step.Cancelled); + + return search(sc, null); + } + + @Override + public List listFutureHaWorkForVm(long vmId, long workId) { + SearchCriteria sc = FutureHaWorkSearch.create(); + sc.setParameters("instance", vmId); + sc.setParameters("type", HighAvailabilityManager.WorkType.HA); + sc.setParameters("id", workId); + + return search(sc, null); } @Override