From cbf2c3bbf66214ec4a6b6f0666657185446c88ce Mon Sep 17 00:00:00 2001 From: Koushik Das Date: Fri, 28 Aug 2015 17:59:17 +0530 Subject: [PATCH] CLOUDSTACK-8785: Proper enforcement of retry count (max.retries) for all work type handled by HighAvailability manager Retry count is properly enforced for all work types in HA manager. Also reorganized some of the code for easy testing. --- .../cloud/ha/HighAvailabilityManagerImpl.java | 126 ++++++++---------- .../ha/HighAvailabilityManagerImplTest.java | 44 ++++++ 2 files changed, 103 insertions(+), 67 deletions(-) diff --git a/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java b/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java index bec1fd58cc2..cce44578929 100644 --- a/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java +++ b/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java @@ -383,10 +383,10 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai } List items = _haDao.findPreviousHA(vm.getId()); - int maxRetries = 0; + int timesTried = 0; for (HaWorkVO item : items) { - if (maxRetries < item.getTimesTried() && !item.canScheduleNew(_timeBetweenFailures)) { - maxRetries = item.getTimesTried(); + if (timesTried < item.getTimesTried() && !item.canScheduleNew(_timeBetweenFailures)) { + timesTried = item.getTimesTried(); break; } } @@ -396,7 +396,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai } HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.HA, investigate ? Step.Investigating : Step.Scheduled, - hostId != null ? hostId : 0L, vm.getState(), maxRetries + 1, vm.getUpdated()); + hostId != null ? hostId : 0L, vm.getState(), timesTried, vm.getUpdated()); _haDao.persist(work); if (s_logger.isInfoEnabled()) { @@ -407,7 +407,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai } - protected Long restart(HaWorkVO work) { + protected Long restart(final HaWorkVO work) { List items = _haDao.listFutureHaWorkForVm(work.getInstanceId(), work.getId()); if (items.size() > 0) { StringBuilder str = new StringBuilder("Cancelling this work item because newer ones have been scheduled. Work Ids = ["); @@ -571,11 +571,6 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai return null; } - if (work.getTimesTried() > _maxRetries) { - s_logger.warn("Retried to max times so deleting: " + vmId); - return null; - } - try { HashMap params = new HashMap(); if (_haTag != null) { @@ -663,7 +658,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai _haDao.delete(vm.getId(), WorkType.Destroy); } - protected Long destroyVM(HaWorkVO work) { + protected Long destroyVM(final HaWorkVO work) { final VirtualMachine vm = _itMgr.findById(work.getInstanceId()); s_logger.info("Destroying " + vm.toString()); try { @@ -690,7 +685,6 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai s_logger.debug("concurrent operation: " + e.getMessage()); } - work.setTimesTried(work.getTimesTried() + 1); return (System.currentTimeMillis() >> 10) + _stopRetryInterval; } @@ -738,10 +732,6 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai s_logger.debug("operation timed out: " + e.getMessage()); } - work.setTimesTried(work.getTimesTried() + 1); - if (s_logger.isDebugEnabled()) { - s_logger.debug("Stop was unsuccessful. Rescheduling"); - } return (System.currentTimeMillis() >> 10) + _stopRetryInterval; } @@ -765,6 +755,56 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai return vms; } + private void rescheduleWork(final HaWorkVO work, final long nextTime) { + s_logger.info("Rescheduling work " + work + " to try again at " + new Date(nextTime << 10)); + work.setTimeToTry(nextTime); + work.setTimesTried(work.getTimesTried() + 1); + work.setServerId(null); + work.setDateTaken(null); + } + + private void processWork(final HaWorkVO work) { + try { + final WorkType wt = work.getWorkType(); + Long nextTime = null; + if (wt == WorkType.Migration) { + nextTime = migrate(work); + } else if (wt == WorkType.HA) { + nextTime = restart(work); + } else if (wt == WorkType.Stop || wt == WorkType.CheckStop || wt == WorkType.ForceStop) { + nextTime = stopVM(work); + } else if (wt == WorkType.Destroy) { + nextTime = destroyVM(work); + } else { + assert false : "How did we get here with " + wt.toString(); + return; + } + + if (nextTime == null) { + s_logger.info("Completed work " + work); + work.setStep(Step.Done); + } else { + rescheduleWork(work, nextTime.longValue()); + } + } catch (Exception e) { + s_logger.warn("Encountered unhandled exception during HA process, reschedule work", e); + + long nextTime = (System.currentTimeMillis() >> 10) + _restartRetryInterval; + rescheduleWork(work, nextTime); + + // if restart failed in the middle due to exception, VM state may has been changed + // recapture into the HA worker so that it can really continue in it next turn + VMInstanceVO vm = _instanceDao.findById(work.getInstanceId()); + work.setUpdateTime(vm.getUpdated()); + work.setPreviousState(vm.getState()); + } + if (!Step.Done.equals(work.getStep()) && work.getTimesTried() >= _maxRetries) { + s_logger.warn("Giving up, retried max. times for work: " + work); + work.setStep(Step.Done); + } + _haDao.update(work.getId(), work); + } + @Override public boolean configure(final String name, final Map xmlParams) throws ConfigurationException { _serverId = _msServer.getId(); @@ -881,7 +921,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai private void runWithContext() { HaWorkVO work = null; try { - s_logger.trace("Checking the database"); + s_logger.trace("Checking the database for work"); work = _haDao.take(_serverId); if (work == null) { try { @@ -896,56 +936,8 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai } NDC.push("work-" + work.getId()); - s_logger.info("Processing " + work); - - try { - final WorkType wt = work.getWorkType(); - Long nextTime = null; - if (wt == WorkType.Migration) { - nextTime = migrate(work); - } else if (wt == WorkType.HA) { - nextTime = restart(work); - } else if (wt == WorkType.Stop || wt == WorkType.CheckStop || wt == WorkType.ForceStop) { - nextTime = stopVM(work); - } else if (wt == WorkType.Destroy) { - nextTime = destroyVM(work); - } else { - assert false : "How did we get here with " + wt.toString(); - return; - } - - if (nextTime == null) { - s_logger.info("Completed " + work); - work.setStep(Step.Done); - } else { - s_logger.info("Rescheduling " + work + " to try again at " + new Date(nextTime << 10)); - work.setTimeToTry(nextTime); - work.setTimesTried(work.getTimesTried() + 1); - work.setServerId(null); - work.setDateTaken(null); - } - } catch (Exception e) { - s_logger.warn("Encountered unhandled exception during HA process, reschedule retry", e); - - long nextTime = (System.currentTimeMillis() >> 10) + _restartRetryInterval; - - s_logger.info("Rescheduling " + work + " to try again at " + new Date(nextTime << 10)); - work.setTimeToTry(nextTime); - work.setTimesTried(work.getTimesTried() + 1); - work.setServerId(null); - work.setDateTaken(null); - - // if restart failed in the middle due to exception, VM state may has been changed - // recapture into the HA worker so that it can really continue in it next turn - VMInstanceVO vm = _instanceDao.findById(work.getInstanceId()); - work.setUpdateTime(vm.getUpdated()); - work.setPreviousState(vm.getState()); - if (!Step.Done.equals(work.getStep()) && work.getTimesTried() >= _maxRetries) { - s_logger.warn("Giving up, retries max times for work: " + work); - work.setStep(Step.Done); - } - } - _haDao.update(work.getId(), work); + s_logger.info("Processing work " + work); + processWork(work); } catch (final Throwable th) { s_logger.error("Caught this throwable, ", th); } finally { diff --git a/server/test/com/cloud/ha/HighAvailabilityManagerImplTest.java b/server/test/com/cloud/ha/HighAvailabilityManagerImplTest.java index 087c71a05ad..3102c9ac697 100644 --- a/server/test/com/cloud/ha/HighAvailabilityManagerImplTest.java +++ b/server/test/com/cloud/ha/HighAvailabilityManagerImplTest.java @@ -16,11 +16,14 @@ // under the License. package com.cloud.ha; +import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; import java.lang.reflect.Array; import java.lang.reflect.Field; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -31,6 +34,7 @@ import org.apache.cloudstack.engine.orchestration.service.VolumeOrchestrationSer import org.apache.cloudstack.framework.config.dao.ConfigurationDao; import org.apache.cloudstack.managed.context.ManagedContext; import org.junit.Before; +import org.junit.BeforeClass; import org.junit.Test; import org.junit.runner.RunWith; import org.mockito.Mock; @@ -44,6 +48,8 @@ import com.cloud.dc.DataCenterVO; import com.cloud.dc.HostPodVO; import com.cloud.dc.dao.DataCenterDao; import com.cloud.dc.dao.HostPodDao; +import com.cloud.ha.HighAvailabilityManager.Step; +import com.cloud.ha.HighAvailabilityManager.WorkType; import com.cloud.ha.dao.HighAvailabilityDao; import com.cloud.host.Host; import com.cloud.host.HostVO; @@ -106,6 +112,17 @@ public class HighAvailabilityManagerImplTest { HostVO hostVO; HighAvailabilityManagerImpl highAvailabilityManager; + HighAvailabilityManagerImpl highAvailabilityManagerSpy; + static Method processWorkMethod = null; + + @BeforeClass + public static void initOnce() { + try { + processWorkMethod = HighAvailabilityManagerImpl.class.getDeclaredMethod("processWork", HaWorkVO.class); + processWorkMethod.setAccessible(true); + } catch (NoSuchMethodException e) { + } + } @Before public void setup() throws IllegalArgumentException, @@ -123,8 +140,12 @@ public class HighAvailabilityManagerImplTest { injectField.set(highAvailabilityManager, obj); } } + } else if (injectField.getName().equals("_maxRetries")) { + injectField.setAccessible(true); + injectField.set(highAvailabilityManager, 5); } } + highAvailabilityManagerSpy = Mockito.spy(highAvailabilityManager); } @Test @@ -201,4 +222,27 @@ public class HighAvailabilityManagerImplTest { assertNull(highAvailabilityManager.investigate(1l)); } + + private void processWorkWithRetryCount(int count, Step expectedStep) { + assertNotNull(processWorkMethod); + HaWorkVO work = new HaWorkVO(1l, VirtualMachine.Type.User, WorkType.Migration, Step.Scheduled, 1l, VirtualMachine.State.Running, count, 12345678l); + Mockito.doReturn(12345678l).when(highAvailabilityManagerSpy).migrate(work); + try { + processWorkMethod.invoke(highAvailabilityManagerSpy, work); + } catch (IllegalAccessException e) { + } catch (IllegalArgumentException e) { + } catch (InvocationTargetException e) { + } + assertTrue(work.getStep() == expectedStep); + } + + @Test + public void processWorkWithRetryCountExceeded() { + processWorkWithRetryCount(5, Step.Done); // max retry count is 5 + } + + @Test + public void processWorkWithRetryCountNotExceeded() { + processWorkWithRetryCount(3, Step.Scheduled); + } }