FR3: Host-HA backported changes from master (#50)

- Improves job scheduling using state/event-driven logic
- Reduces database and CPU load by consolidating all background threads into one
- Improves Simulator and KVM host-ha integration tests
- Triggers VM HA on successful host (IPMI reboot) recovery
- Improves internal data structures and checks around HA counter
- New FSM events to retry fencing and recovery
- Fixes KVM activity script to aggressively check against last update time

Signed-off-by: Rohit Yadav <rohit.yadav@shapeblue.com>
This commit is contained in:
Rohit Yadav 2017-09-26 09:20:14 +05:30 committed by Rohit Yadav
parent 1f52cd4245
commit 7df52405b0
20 changed files with 652 additions and 1408 deletions

View File

@ -108,8 +108,4 @@ public class PrepareForMaintenanceCmd extends BaseAsyncCmd {
throw new ServerApiException(ApiErrorCode.INTERNAL_ERROR, "Failed to prepare host for maintenance");
}
}
public void setHostId(final Long hostId) {
id = hostId;
}
}

View File

@ -47,8 +47,10 @@ public interface HAConfig extends StateObject<HAConfig.HAState>, InternalIdentit
ActivityCheckFailureUnderThresholdRatio,
PowerCycle,
Recovered,
RetryRecovery,
RecoveryWaitPeriodTimeout,
RecoveryOperationThresholdExceeded,
RetryFencing,
Fenced;
public Long getServerId() {
@ -123,6 +125,7 @@ public interface HAConfig extends StateObject<HAConfig.HAState>, InternalIdentit
FSM.addTransition(Recovering, Event.Disabled, Disabled);
FSM.addTransition(Recovering, Event.Ineligible, Ineligible);
FSM.addTransition(Recovering, Event.RetryRecovery, Recovering);
FSM.addTransition(Recovering, Event.Recovered, Recovered);
FSM.addTransition(Recovering, Event.RecoveryOperationThresholdExceeded, Fencing);
@ -132,6 +135,7 @@ public interface HAConfig extends StateObject<HAConfig.HAState>, InternalIdentit
FSM.addTransition(Fencing, Event.Disabled, Disabled);
FSM.addTransition(Fencing, Event.Ineligible, Ineligible);
FSM.addTransition(Fencing, Event.RetryFencing, Fencing);
FSM.addTransition(Fencing, Event.Fenced, Fenced);
FSM.addTransition(Fenced, Event.Disabled, Disabled);

View File

@ -72,6 +72,9 @@ public class SimulatorHAProvider extends HAAbstractHostProvider implements HAPro
@Override
public boolean isEligible(final Host host) {
if (host == null) {
return false;
}
final SimulatorHAState haState = hostHAStateMap.get(host.getId());
return !isInMaintenanceMode(host) && !isDisabled(host) && haState != null
&& Hypervisor.HypervisorType.Simulator.equals(host.getHypervisorType());
@ -130,15 +133,8 @@ public class SimulatorHAProvider extends HAAbstractHostProvider implements HAPro
}
}
@Override
public boolean preStateTransitionEvent(final HAConfig.HAState oldState, final HAConfig.Event event,
final HAConfig.HAState newState, final HAConfig vo, final boolean status, final Object opaque) {
return false;
}
@Override
public boolean postStateTransitionEvent(final StateMachine2.Transition<HAConfig.HAState, HAConfig.Event> transition,
final HAConfig vo, final boolean status, final Object opaque) {
private boolean addStateTransition(final HAConfig vo, final boolean status,
final HAConfig.HAState oldState, final HAConfig.HAState newState, final HAConfig.Event event) {
if (vo.getResourceType() != HAResource.ResourceType.Host) {
return false;
}
@ -147,6 +143,18 @@ public class SimulatorHAProvider extends HAAbstractHostProvider implements HAPro
return false;
}
final HAResourceCounter counter = haManager.getHACounter(vo.getResourceId(), vo.getResourceType());
return haState.addStateTransition(transition.getToState(), transition.getCurrentState(), transition.getEvent(), counter);
return haState.addStateTransition(newState, oldState, event, counter);
}
/**
 * Pre-transition listener hook. Forwards the old state, new state and event to
 * {@code addStateTransition} and returns its result; {@code opaque} is not used.
 */
@Override
public boolean preStateTransitionEvent(final HAConfig.HAState oldState, final HAConfig.Event event,
final HAConfig.HAState newState, final HAConfig vo, final boolean status, final Object opaque) {
return addStateTransition(vo, status, oldState, newState, event);
}
/**
 * Post-transition listener hook. Unpacks the transition and forwards it to
 * {@code addStateTransition}: the transition's current state is passed as the old state
 * and its to-state as the new state. {@code opaque} is not used.
 */
@Override
public boolean postStateTransitionEvent(final StateMachine2.Transition<HAConfig.HAState, HAConfig.Event> transition,
final HAConfig vo, final boolean status, final Object opaque) {
return addStateTransition(vo, status, transition.getCurrentState(), transition.getToState(), transition.getEvent());
}
}

View File

@ -40,7 +40,7 @@
<dependency>
<groupId>br.com.autonomiccs</groupId>
<artifactId>apache-cloudstack-java-client</artifactId>
<version>1.0.4</version>
<version>1.0.5</version>
</dependency>
</dependencies>
</project>

View File

@ -116,7 +116,8 @@ else
lastUpdateTime=${arrTime[1]}
echo "$SuspectTime:$latestUpdateTime:$MSTime" > $acFile
if [[ $lastSuspectTime -ne $SuspectTime ]]; then
suspectTimeDiff=$(expr $SuspectTime - $lastSuspectTime)
if [[ $suspectTimeDiff -lt 0 ]]; then
if [[ $latestUpdateTime -gt $SuspectTime ]]; then
echo "=====> ALIVE <====="
else

View File

@ -2150,7 +2150,7 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
}
try {
SSHCmdHelper.SSHCmdResult result = SSHCmdHelper.sshExecuteCmdOneShot(connection, "service cloudstack-agent restart");
SSHCmdHelper.SSHCmdResult result = SSHCmdHelper.sshExecuteCmdOneShot(connection, "service cloudstack-agent restart || systemctl restart cloudstack-agent");
s_logger.debug(result.toString());
} catch (SshException e) {
return false;

View File

@ -73,4 +73,4 @@ public interface HAManager extends HAConfigManager {
boolean isHAEligible(final HAResource resource);
Boolean isVMAliveOnHost(final Host host);
Status getHostStatus(final Host host);
}
}

View File

@ -17,31 +17,20 @@
package org.apache.cloudstack.ha;
import com.cloud.cluster.ClusterManagerListener;
import com.cloud.cluster.ManagementServerHost;
import com.cloud.dc.ClusterDetailsDao;
import com.cloud.dc.ClusterDetailsVO;
import com.cloud.dc.DataCenter;
import com.cloud.dc.DataCenterDetailVO;
import com.cloud.dc.dao.DataCenterDetailsDao;
import com.cloud.domain.Domain;
import com.cloud.event.ActionEvent;
import com.cloud.event.ActionEventUtils;
import com.cloud.event.EventTypes;
import com.cloud.host.Host;
import com.cloud.host.Status;
import com.cloud.host.dao.HostDao;
import com.cloud.org.Cluster;
import com.cloud.utils.component.ComponentContext;
import com.cloud.utils.component.ManagerBase;
import com.cloud.utils.component.PluggableService;
import com.cloud.utils.db.Transaction;
import com.cloud.utils.db.TransactionCallback;
import com.cloud.utils.db.TransactionStatus;
import com.cloud.utils.exception.CloudRuntimeException;
import com.cloud.utils.fsm.NoTransitionException;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import javax.inject.Inject;
import javax.naming.ConfigurationException;
import org.apache.cloudstack.api.ApiErrorCode;
import org.apache.cloudstack.api.ServerApiException;
import org.apache.cloudstack.api.command.admin.ha.ConfigureHAForHostCmd;
@ -70,20 +59,35 @@ import org.apache.cloudstack.poll.BackgroundPollTask;
import org.apache.cloudstack.utils.identity.ManagementServerNode;
import org.apache.log4j.Logger;
import javax.inject.Inject;
import javax.naming.ConfigurationException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import com.cloud.cluster.ClusterManagerListener;
import com.cloud.cluster.ManagementServerHost;
import com.cloud.dc.ClusterDetailsDao;
import com.cloud.dc.ClusterDetailsVO;
import com.cloud.dc.DataCenter;
import com.cloud.dc.DataCenterDetailVO;
import com.cloud.dc.dao.DataCenterDetailsDao;
import com.cloud.domain.Domain;
import com.cloud.event.ActionEvent;
import com.cloud.event.ActionEventUtils;
import com.cloud.event.EventTypes;
import com.cloud.host.Host;
import com.cloud.host.Status;
import com.cloud.host.dao.HostDao;
import com.cloud.org.Cluster;
import com.cloud.utils.component.ComponentContext;
import com.cloud.utils.component.ManagerBase;
import com.cloud.utils.component.PluggableService;
import com.cloud.utils.db.Transaction;
import com.cloud.utils.db.TransactionCallback;
import com.cloud.utils.db.TransactionStatus;
import com.cloud.utils.exception.CloudRuntimeException;
import com.cloud.utils.fsm.NoTransitionException;
import com.cloud.utils.fsm.StateListener;
import com.cloud.utils.fsm.StateMachine2;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
public final class HAManagerImpl extends ManagerBase implements HAManager, ClusterManagerListener, PluggableService, Configurable {
public final class HAManagerImpl extends ManagerBase implements HAManager, ClusterManagerListener, PluggableService, Configurable, StateListener<HAConfig.HAState, HAConfig.Event, HAConfig> {
public static final Logger LOG = Logger.getLogger(HAManagerImpl.class);
@Inject
@ -151,7 +155,9 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
if (result) {
final String message = String.format("Transitioned host HA state from:%s to:%s due to event:%s for the host id:%d",
currentHAState, nextState, event, haConfig.getResourceId());
LOG.debug(message);
if (LOG.isTraceEnabled()) {
LOG.trace(message);
}
if (nextState == HAConfig.HAState.Recovering || nextState == HAConfig.HAState.Fencing || nextState == HAConfig.HAState.Fenced) {
ActionEventUtils.onActionEvent(CallContext.current().getCallingUserId(), CallContext.current().getCallingAccountId(),
Domain.ROOT_DOMAIN, EventTypes.EVENT_HA_STATE_TRANSITION, message);
@ -306,7 +312,7 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
LOG.debug("HA: Agent is available/suspect/checking Up " + host.getId());
}
return Status.Down;
} else if (haConfig.getState() == HAConfig.HAState.Degraded || haConfig.getState() == HAConfig.HAState.Recovering || haConfig.getState() == HAConfig.HAState.Recovered || haConfig.getState() == HAConfig.HAState.Fencing) {
} else if (haConfig.getState() == HAConfig.HAState.Degraded || haConfig.getState() == HAConfig.HAState.Recovering || haConfig.getState() == HAConfig.HAState.Fencing) {
if (LOG.isDebugEnabled()){
LOG.debug("HA: Agent is disconnected " + host.getId());
}
@ -454,23 +460,90 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
return cmdList;
}
//////////////////////////////////////////////////////////////////
//////////////// Clustered Manager Listeners /////////////////////
//////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////
//////////////// Event Listeners /////////////////////
//////////////////////////////////////////////////////
// No-op: this listener callback has an empty body, so nothing is done when management nodes join.
@Override
public void onManagementNodeJoined(List<? extends ManagementServerHost> nodeList, long selfNodeId) {
}
// No-op: this listener callback has an empty body, so nothing is done when management nodes leave.
@Override
public void onManagementNodeLeft(List<? extends ManagementServerHost> nodeList, long selfNodeId) {
}
// No-op: this listener callback has an empty body, so nothing is done when the node is isolated.
@Override
public void onManagementNodeIsolated() {
}
/**
 * Submits the background task that matches the HA state a resource is entering.
 *
 * A task is submitted only for the Checking (activity check), Recovering (recovery) and
 * Fencing (fence) states; for any other state the method just runs its validation checks.
 *
 * @param haConfig the HA config whose state is changing
 * @param newState the state being entered
 * @param status   the status flag handed over by the state-transition listener callbacks;
 *                 when false nothing is processed
 * @return false when {@code status} is false, when {@code checkHAOwnership} rejects the config,
 *         when no HA resource or HA provider can be resolved, or when the state is Recovering
 *         and the recovery counter has reached MaxRecoveryAttempts; true otherwise
 */
private boolean processHAStateChange(final HAConfig haConfig, final HAConfig.HAState newState, final boolean status) {
if (!status || !checkHAOwnership(haConfig)) {
return false;
}
final HAResource resource = validateAndFindHAResource(haConfig);
if (resource == null) {
return false;
}
final HAProvider<HAResource> haProvider = validateAndFindHAProvider(haConfig, resource);
if (haProvider == null) {
return false;
}
final HAResourceCounter counter = getHACounter(haConfig.getResourceId(), haConfig.getResourceType());
// Perform activity checks
if (newState == HAConfig.HAState.Checking) {
final ActivityCheckTask job = ComponentContext.inject(new ActivityCheckTask(resource, haProvider, haConfig,
HAProviderConfig.ActivityCheckTimeout, activityCheckExecutor, counter.getSuspectTimeStamp()));
activityCheckExecutor.submit(job);
}
// Attempt recovery
if (newState == HAConfig.HAState.Recovering) {
// Recovery attempts are exhausted: do not submit another recovery task.
if (counter.getRecoveryCounter() >= (Long) (haProvider.getConfigValue(HAProviderConfig.MaxRecoveryAttempts, resource))) {
return false;
}
final RecoveryTask task = ComponentContext.inject(new RecoveryTask(resource, haProvider, haConfig,
HAProviderConfig.RecoveryTimeout, recoveryExecutor));
final Future<Boolean> recoveryFuture = recoveryExecutor.submit(task);
// Keep a handle on the submitted recovery task in the resource's counter.
counter.setRecoveryFuture(recoveryFuture);
}
// Fencing
if (newState == HAConfig.HAState.Fencing) {
final FenceTask task = ComponentContext.inject(new FenceTask(resource, haProvider, haConfig,
HAProviderConfig.FenceTimeout, fenceExecutor));
final Future<Boolean> fenceFuture = fenceExecutor.submit(task);
// Keep a handle on the submitted fence task in the resource's counter.
counter.setFenceFuture(fenceFuture);
}
return true;
}
/**
 * Pre-transition listener hook of the HA state machine.
 *
 * Only self-transitions (old state equals new state) are handled here, and never for the
 * Suspect or Checking states; every other case returns false immediately. Handled
 * transitions are passed to {@code processHAStateChange}. {@code event} and {@code opaque}
 * are not used.
 *
 * @return false for transitions this hook does not handle, otherwise the result of
 *         {@code processHAStateChange}
 */
@Override
public boolean preStateTransitionEvent(final HAConfig.HAState oldState, final HAConfig.Event event, final HAConfig.HAState newState, final HAConfig haConfig, final boolean status, final Object opaque) {
if (oldState != newState || newState == HAConfig.HAState.Suspect || newState == HAConfig.HAState.Checking) {
return false;
}
if (LOG.isTraceEnabled()) {
LOG.trace("HA state pre-transition:: new state=" + newState + ", old state=" + oldState + ", for resource id=" + haConfig.getResourceId() + ", status=" + status + ", ha config state=" + haConfig.getState());
}
// Only a warning: processing continues even if the config's state differs from the transition's new state.
if (status && haConfig.getState() != newState) {
LOG.warn("HA state pre-transition:: HA state is not equal to transition state, HA state=" + haConfig.getState() + ", new state=" + newState);
}
return processHAStateChange(haConfig, newState, status);
}
/**
 * Post-transition listener hook of the HA state machine.
 *
 * Logs the transition at trace level, warns when {@code status} is true but the config's
 * state differs from the transition's to-state, and then passes the to-state to
 * {@code processHAStateChange}. {@code opaque} is not used.
 *
 * @return the result of {@code processHAStateChange}
 */
@Override
public boolean postStateTransitionEvent(final StateMachine2.Transition<HAConfig.HAState, HAConfig.Event> transition, final HAConfig haConfig, final boolean status, final Object opaque) {
if (LOG.isTraceEnabled()) {
LOG.trace("HA state post-transition:: new state=" + transition.getToState() + ", old state=" + transition.getCurrentState() + ", for resource id=" + haConfig.getResourceId() + ", status=" + status + ", ha config state=" + haConfig.getState());
}
// Only a warning: processing continues even if the config's state differs from the transition's to-state.
if (status && haConfig.getState() != transition.getToState()) {
LOG.warn("HA state post-transition:: HA state is not equal to transition state, HA state=" + haConfig.getState() + ", new state=" + transition.getToState());
}
return processHAStateChange(haConfig, transition.getToState(), status);
}
///////////////////////////////////////////////////
@ -522,10 +595,8 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
0L, TimeUnit.MILLISECONDS,
new ArrayBlockingQueue<Runnable>(fenceOperationQueueSize, true), new ThreadPoolExecutor.CallerRunsPolicy());
pollManager.submitTask(new HealthCheckPollTask());
pollManager.submitTask(new ActivityCheckPollTask());
pollManager.submitTask(new RecoveryPollTask());
pollManager.submitTask(new FencingPollTask());
pollManager.submitTask(new HAManagerBgPollTask());
HAConfig.HAState.getStateMachine().registerListener(this);
LOG.debug("HA manager has been configured");
return true;
@ -558,7 +629,7 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
//////////////// Poll Tasks /////////////////////
/////////////////////////////////////////////////
private final class HealthCheckPollTask extends ManagedContextRunnable implements BackgroundPollTask {
private final class HAManagerBgPollTask extends ManagedContextRunnable implements BackgroundPollTask {
@Override
protected void runInContext() {
try {
@ -581,6 +652,19 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
continue;
}
switch (haConfig.getState()) {
case Available:
case Suspect:
case Degraded:
case Fenced:
final HealthCheckTask task = ComponentContext.inject(new HealthCheckTask(resource, haProvider, haConfig,
HAProviderConfig.HealthCheckTimeout, healthCheckExecutor));
healthCheckExecutor.submit(task);
break;
default:
break;
}
final HAResourceCounter counter = getHACounter(haConfig.getResourceId(), haConfig.getResourceType());
if (haConfig.getState() == HAConfig.HAState.Suspect) {
@ -595,17 +679,25 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
}
}
switch (haConfig.getState()) {
case Available:
case Suspect:
case Degraded:
case Fenced:
final HealthCheckTask task = ComponentContext.inject(new HealthCheckTask(resource, haProvider, haConfig,
HAProviderConfig.HealthCheckTimeout, healthCheckExecutor));
healthCheckExecutor.submit(task);
break;
default:
break;
if (haConfig.getState() == HAConfig.HAState.Recovering) {
if (counter.getRecoveryCounter() >= (Long) (haProvider.getConfigValue(HAProviderConfig.MaxRecoveryAttempts, resource))) {
transitionHAState(HAConfig.Event.RecoveryOperationThresholdExceeded, haConfig);
} else {
transitionHAState(HAConfig.Event.RetryRecovery, haConfig);
}
}
if (haConfig.getState() == HAConfig.HAState.Recovered) {
counter.markRecoveryStarted();
if (counter.canExitRecovery((Long)(haProvider.getConfigValue(HAProviderConfig.RecoveryWaitTimeout, resource)))) {
if (transitionHAState(HAConfig.Event.RecoveryWaitPeriodTimeout, haConfig)) {
counter.markRecoveryCompleted();
}
}
}
if (haConfig.getState() == HAConfig.HAState.Fencing && counter.canAttemptFencing()) {
transitionHAState(HAConfig.Event.RetryFencing, haConfig);
}
}
} catch (Throwable t) {
@ -617,151 +709,5 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
public Long getDelay() {
return null;
}
}
private final class ActivityCheckPollTask extends ManagedContextRunnable implements BackgroundPollTask {
@Override
protected void runInContext() {
try {
if (LOG.isTraceEnabled()) {
LOG.trace("HA activity check task is running...");
}
final List<HAConfig> haConfigList = new ArrayList<HAConfig>(haConfigDao.listAll());
for (final HAConfig haConfig : haConfigList) {
if (!checkHAOwnership(haConfig)) {
continue;
}
final HAResource resource = validateAndFindHAResource(haConfig);
if (resource == null) {
continue;
}
final HAProvider<HAResource> haProvider = validateAndFindHAProvider(haConfig, resource);
if (haProvider == null) {
continue;
}
if (haConfig.getState() == HAConfig.HAState.Checking) {
final HAResourceCounter counter = getHACounter(haConfig.getResourceId(), haConfig.getResourceType());
final ActivityCheckTask job = ComponentContext.inject(new ActivityCheckTask(resource, haProvider, haConfig,
HAProviderConfig.ActivityCheckTimeout, activityCheckExecutor, counter.getSuspectTimeStamp()));
activityCheckExecutor.submit(job);
}
}
} catch (Throwable t) {
LOG.error("Error trying to perform activity checks in HA manager", t);
}
}
@Override
public Long getDelay() {
return null;
}
}
private final class RecoveryPollTask extends ManagedContextRunnable implements BackgroundPollTask {
@Override
protected void runInContext() {
try {
if (LOG.isTraceEnabled()) {
LOG.trace("HA recovery task is running...");
}
final List<HAConfig> haConfigList = new ArrayList<HAConfig>(haConfigDao.listAll());
for (final HAConfig haConfig : haConfigList) {
if (!checkHAOwnership(haConfig)) {
continue;
}
final HAResource resource = validateAndFindHAResource(haConfig);
if (resource == null) {
continue;
}
final HAProvider<HAResource> haProvider = validateAndFindHAProvider(haConfig, resource);
if (haProvider == null) {
continue;
}
final HAResourceCounter counter = getHACounter(haConfig.getResourceId(), haConfig.getResourceType());
if (haConfig.getState() == HAConfig.HAState.Recovering) {
if (counter.canAttemptRecovery()) {
if (counter.getRecoveryCounter() >= (Long)(haProvider.getConfigValue(HAProviderConfig.MaxRecoveryAttempts, resource))) {
transitionHAState(HAConfig.Event.RecoveryOperationThresholdExceeded, haConfig);
continue;
}
final RecoveryTask task = ComponentContext.inject(new RecoveryTask(resource, haProvider, haConfig,
HAProviderConfig.RecoveryTimeout, recoveryExecutor));
final Future<Boolean> recoveryFuture = recoveryExecutor.submit(task);
counter.setRecoveryFuture(recoveryFuture);
counter.incrRecoveryCounter();
}
}
if (haConfig.getState() == HAConfig.HAState.Recovered) {
counter.markRecoveryStarted();
if (counter.canExitRecovery((Long)(haProvider.getConfigValue(HAProviderConfig.RecoveryWaitTimeout, resource)))) {
transitionHAState(HAConfig.Event.RecoveryWaitPeriodTimeout, haConfig);
counter.markRecoveryCompleted();
}
}
}
} catch (Throwable t) {
LOG.error("Error trying to perform recovery operation in HA manager", t);
}
}
@Override
public Long getDelay() {
return null;
}
}
private final class FencingPollTask extends ManagedContextRunnable implements BackgroundPollTask {
@Override
protected void runInContext() {
try {
if (LOG.isTraceEnabled()) {
LOG.trace("HA fencing task is running...");
}
final List<HAConfig> haConfigList = new ArrayList<HAConfig>(haConfigDao.listAll());
for (final HAConfig haConfig : haConfigList) {
if (!checkHAOwnership(haConfig)) {
continue;
}
final HAResource resource = validateAndFindHAResource(haConfig);
if (resource == null) {
continue;
}
final HAProvider<HAResource> haProvider = validateAndFindHAProvider(haConfig, resource);
if (haProvider == null) {
continue;
}
final HAResourceCounter counter = getHACounter(haConfig.getResourceId(), haConfig.getResourceType());
if (counter.lastFencingCompleted()) {
if (haConfig.getState() == HAConfig.HAState.Fencing) {
final FenceTask task = ComponentContext.inject(new FenceTask(resource, haProvider, haConfig,
HAProviderConfig.FenceTimeout, fenceExecutor));
final Future<Boolean> fenceFuture = fenceExecutor.submit(task);
counter.setFenceFuture(fenceFuture);
}
}
}
} catch (Throwable t) {
LOG.error("Error trying to perform fencing operation in HA manager", t);
}
}
@Override
public Long getDelay() {
return null;
}
}
}

View File

@ -41,7 +41,6 @@ public final class HAResourceCounter {
}
public synchronized void incrActivityCounter(final boolean isFailure) {
lastActivityCheckTimestamp = System.currentTimeMillis();
activityCheckCounter.incrementAndGet();
if (isFailure) {
activityCheckFailureCounter.incrementAndGet();
@ -71,8 +70,12 @@ public final class HAResourceCounter {
return activityCheckFailureCounter.get() > (activityCheckCounter.get() * failureRatio);
}
public boolean canPerformActivityCheck(final Long activityCheckInterval) {
return lastActivityCheckTimestamp == null || (System.currentTimeMillis() - lastActivityCheckTimestamp) > (activityCheckInterval * 1000);
/**
 * Decides whether an activity check may be performed now and, if so, records the current
 * time as the last activity check timestamp.
 *
 * @param activityCheckInterval minimum gap between checks; it is multiplied by 1000 before
 *                              being compared with a millisecond difference
 * @return true when no timestamp has been recorded yet or more than the interval has elapsed
 *         since the recorded one (the timestamp is then updated); false otherwise, leaving
 *         the timestamp unchanged
 */
public synchronized boolean canPerformActivityCheck(final Long activityCheckInterval) {
if (lastActivityCheckTimestamp == null || (System.currentTimeMillis() - lastActivityCheckTimestamp) > (activityCheckInterval * 1000)) {
lastActivityCheckTimestamp = System.currentTimeMillis();
return true;
}
return false;
}
public boolean canRecheckActivity(final Long maxDegradedPeriod) {
@ -121,7 +124,7 @@ public final class HAResourceCounter {
fenceFuture = future;
}
public boolean lastFencingCompleted() {
/**
 * @return true when no fence future has been set or the stored fence future is done
 */
public boolean canAttemptFencing() {
return fenceFuture == null || fenceFuture.isDone();
}

View File

@ -17,12 +17,11 @@
package org.apache.cloudstack.ha.provider;
import com.cloud.utils.component.Adapter;
import org.apache.cloudstack.ha.HAConfig;
import org.apache.cloudstack.ha.HAResource;
import org.joda.time.DateTime;
import org.apache.cloudstack.ha.HAResource;
import com.cloud.utils.component.Adapter;
public interface HAProvider<R extends HAResource> extends Adapter {
@ -57,7 +56,9 @@ public interface HAProvider<R extends HAResource> extends Adapter {
boolean fence(R r) throws HAFenceException;
void setFenced(R r);
void fenceSubResources(R r);
void enableMaintenance(R r);
void sendAlert(R r, HAConfig.HAState nextState);

View File

@ -71,7 +71,7 @@ public abstract class HAAbstractHostProvider extends AdapterBase implements HAPr
}
@Override
public void setFenced(final Host r) {
public void fenceSubResources(final Host r) {
if (r.getState() != Status.Down) {
try {
LOG.debug("Trying to disconnect the host without investigation and scheduling HA for the VMs on host id=" + r.getId());
@ -80,11 +80,15 @@ public abstract class HAAbstractHostProvider extends AdapterBase implements HAPr
} catch (Exception e) {
LOG.error("Failed to disconnect host and schedule HA restart of VMs after fencing the host: ", e);
}
try {
resourceManager.resourceStateTransitTo(r, ResourceState.Event.InternalEnterMaintenance, ManagementServerNode.getManagementServerId());
} catch (NoTransitionException e) {
LOG.error("Failed to put host in maintenance mode after host-ha fencing and scheduling VM-HA: ", e);
}
}
}
/**
 * Requests the {@code InternalEnterMaintenance} resource-state transition for the given host
 * through the resource manager. A {@code NoTransitionException} is logged and not rethrown.
 *
 * @param r the host to transition
 */
@Override
public void enableMaintenance(final Host r) {
try {
resourceManager.resourceStateTransitTo(r, ResourceState.Event.InternalEnterMaintenance, ManagementServerNode.getManagementServerId());
} catch (NoTransitionException e) {
LOG.error("Failed to put host in maintenance mode after host-ha fencing and scheduling VM-HA: ", e);
}
}

View File

@ -17,6 +17,10 @@
package org.apache.cloudstack.ha.task;
import java.util.concurrent.ExecutorService;
import javax.inject.Inject;
import org.apache.cloudstack.ha.HAConfig;
import org.apache.cloudstack.ha.HAManager;
import org.apache.cloudstack.ha.HAResource;
@ -25,11 +29,7 @@ import org.apache.cloudstack.ha.provider.HACheckerException;
import org.apache.cloudstack.ha.provider.HAProvider;
import org.apache.cloudstack.ha.provider.HAProvider.HAProviderConfig;
import org.apache.log4j.Logger;
import javax.inject.Inject;
import org.joda.time.DateTime;
import java.util.concurrent.ExecutorService;
public class ActivityCheckTask extends BaseHATask {
@ -38,22 +38,24 @@ public class ActivityCheckTask extends BaseHATask {
@Inject
private HAManager haManager;
private final long disconnectTime;
private long disconnectTime;
private long maxActivityChecks;
private double activityCheckFailureRatio;
/**
 * Creates an activity check task. Besides delegating to the base task constructor, it stores
 * {@code disconnectTime} and reads the MaxActivityChecks and ActivityCheckFailureRatio
 * provider settings for the resource once, at construction time.
 *
 * @param disconnectTime timestamp later wrapped in a {@code DateTime} and handed to the
 *                       provider's activity check
 */
public ActivityCheckTask(final HAResource resource, final HAProvider<HAResource> haProvider, final HAConfig haConfig, final HAProvider.HAProviderConfig haProviderConfig,
final ExecutorService executor, final long disconnectTime) {
super(resource, haProvider, haConfig, haProviderConfig, executor);
this.disconnectTime = disconnectTime;
this.maxActivityChecks = (Long)haProvider.getConfigValue(HAProviderConfig.MaxActivityChecks, resource);
this.activityCheckFailureRatio = (Double)haProvider.getConfigValue(HAProviderConfig.ActivityCheckFailureRatio, resource);
}
/**
 * Delegates to the HA provider's {@code hasActivity} for this task's resource, passing
 * {@code disconnectTime} wrapped in a {@code DateTime}.
 *
 * @return whatever the provider's {@code hasActivity} returns
 * @throws HACheckerException declared by this method; presumably raised by the provider — confirm
 */
public boolean performAction() throws HACheckerException {
return getHaProvider().hasActivity(getResource(), new DateTime(disconnectTime));
}
public void processResult(boolean result, Throwable t) {
public synchronized void processResult(boolean result, Throwable t) {
final HAConfig haConfig = getHaConfig();
final HAProvider<HAResource> haProvider = getHaProvider();
final HAResource resource = getResource();
final HAResourceCounter counter = haManager.getHACounter(haConfig.getResourceId(), haConfig.getResourceType());
if (t != null && t instanceof HACheckerException) {
@ -64,18 +66,17 @@ public class ActivityCheckTask extends BaseHATask {
counter.incrActivityCounter(!result);
long maxActivityChecks = (Long)haProvider.getConfigValue(HAProviderConfig.MaxActivityChecks, resource);
if (counter.getActivityCheckCounter() < maxActivityChecks) {
haManager.transitionHAState(HAConfig.Event.TooFewActivityCheckSamples, haConfig);
return;
}
double activityCheckFailureRatio = (Double)haProvider.getConfigValue(HAProviderConfig.ActivityCheckFailureRatio, resource);
if (counter.hasActivityThresholdExceeded(activityCheckFailureRatio)) {
haManager.transitionHAState(HAConfig.Event.ActivityCheckFailureOverThresholdRatio, haConfig);
} else {
haManager.transitionHAState(HAConfig.Event.ActivityCheckFailureUnderThresholdRatio, haConfig);
counter.markResourceDegraded();
if (haManager.transitionHAState(HAConfig.Event.ActivityCheckFailureUnderThresholdRatio, haConfig)) {
counter.markResourceDegraded();
}
}
counter.resetActivityCounter();
}

View File

@ -17,6 +17,13 @@
package org.apache.cloudstack.ha.task;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import org.apache.cloudstack.ha.HAConfig;
import org.apache.cloudstack.ha.HAResource;
import org.apache.cloudstack.ha.provider.HACheckerException;
@ -24,13 +31,7 @@ import org.apache.cloudstack.ha.provider.HAFenceException;
import org.apache.cloudstack.ha.provider.HAProvider;
import org.apache.cloudstack.ha.provider.HARecoveryException;
import org.apache.log4j.Logger;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import org.joda.time.DateTime;
public abstract class BaseHATask implements Callable<Boolean> {
public static final Logger LOG = Logger.getLogger(BaseHATask.class);
@ -40,6 +41,7 @@ public abstract class BaseHATask implements Callable<Boolean> {
private final HAConfig haConfig;
private final ExecutorService executor;
private Long timeout;
private DateTime created;
public BaseHATask(final HAResource resource, final HAProvider<HAResource> haProvider, final HAConfig haConfig, final HAProvider.HAProviderConfig haProviderConfig,
final ExecutorService executor) {
@ -48,6 +50,7 @@ public abstract class BaseHATask implements Callable<Boolean> {
this.haConfig = haConfig;
this.executor = executor;
this.timeout = (Long)haProvider.getConfigValue(haProviderConfig, resource);
this.created = new DateTime();
}
public HAProvider<HAResource> getHaProvider() {
@ -74,6 +77,9 @@ public abstract class BaseHATask implements Callable<Boolean> {
@Override
public Boolean call() {
if (new DateTime().minusHours(1).isAfter(getCreated())) {
return false;
}
final Future<Boolean> future = executor.submit(new Callable<Boolean>() {
@Override
public Boolean call() throws HACheckerException, HAFenceException, HARecoveryException {
@ -99,4 +105,7 @@ public abstract class BaseHATask implements Callable<Boolean> {
return result;
}
/**
 * @return the {@code created} timestamp held by this task
 */
public DateTime getCreated() {
return created;
}
}

View File

@ -48,7 +48,8 @@ public class FenceTask extends BaseHATask {
if (result) {
counter.resetRecoveryCounter();
haManager.transitionHAState(HAConfig.Event.Fenced, haConfig);
getHaProvider().setFenced(getResource());
getHaProvider().fenceSubResources(getResource());
getHaProvider().enableMaintenance(getResource());
}
getHaProvider().sendAlert(getResource(), HAConfig.HAState.Fencing);
}

View File

@ -17,16 +17,18 @@
package org.apache.cloudstack.ha.task;
import java.util.concurrent.ExecutorService;
import javax.inject.Inject;
import org.apache.cloudstack.ha.HAConfig;
import org.apache.cloudstack.ha.HAManager;
import org.apache.cloudstack.ha.HAResource;
import org.apache.cloudstack.ha.HAResourceCounter;
import org.apache.cloudstack.ha.provider.HACheckerException;
import org.apache.cloudstack.ha.provider.HAProvider;
import org.apache.cloudstack.ha.provider.HARecoveryException;
import javax.inject.Inject;
import java.util.concurrent.ExecutorService;
public class RecoveryTask extends BaseHATask {
@Inject
@ -43,8 +45,13 @@ public class RecoveryTask extends BaseHATask {
public void processResult(boolean result, Throwable e) {
final HAConfig haConfig = getHaConfig();
final HAResourceCounter counter = haManager.getHACounter(haConfig.getResourceId(), haConfig.getResourceType());
counter.incrRecoveryCounter();
counter.resetActivityCounter();
if (result) {
haManager.transitionHAState(HAConfig.Event.Recovered, haConfig);
getHaProvider().fenceSubResources(getResource());
}
getHaProvider().sendAlert(getResource(), HAConfig.HAState.Recovering);
}

View File

@ -263,7 +263,7 @@ public class OutOfBandManagementServiceImpl extends ManagerBase implements OutOf
}
public boolean isOutOfBandManagementEnabled(final Host host) {
return isOutOfBandManagementEnabledForZone(host.getDataCenterId())
return host != null && isOutOfBandManagementEnabledForZone(host.getDataCenterId())
&& isOutOfBandManagementEnabledForCluster(host.getClusterId())
&& isOutOfBandManagementEnabledForHost(host.getId());
}

View File

@ -1,247 +0,0 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from marvin.cloudstackTestCase import *
from marvin.cloudstackAPI import *
from marvin.lib.utils import *
from marvin.lib.common import *
from nose.plugins.attrib import attr
import cmd
from cmd import Cmd
class TestHaForHost(cloudstackTestCase):
""" Test cases for configuring HA for Host
"""
def setUp(self):
testClient = super(TestHaForHost, self).getClsTestClient()
self.apiclient = testClient.getApiClient()
self.dbclient = testClient.getDbConnection()
self.services = testClient.getParsedTestDataConfig()
self.zone = get_zone(self.apiclient, testClient.getZoneForTests())
self.host = None
self.server = None
self.cleanup = []
def tearDown(self):
try:
self.dbclient.execute("delete from ha_config where resource_type='Host'")
cleanup_resources(self.apiclient, self.cleanup)
except Exception as e:
raise Exception("Warning: Exception during cleanup : %s" % e)
def getHost(self, hostId=None):
    """Return a routing host of the test zone, caching it on ``self.host``.

    When ``hostId`` is None and a host is already cached, the cached host is
    returned without calling the API. Otherwise the hosts are listed
    (filtered by ``hostId`` when given) and the first entry is cached and
    returned. The test is skipped when nothing is found.
    """
    if self.host and hostId is None:
        return self.host

    response = list_hosts(
        self.apiclient,
        zoneid=self.zone.id,
        type='Routing',
        id=hostId
    )
    # Truthiness covers both an empty list and a None response, so a missing
    # host ends in a skip instead of a TypeError from len(None).
    if response:
        self.host = response[0]
        return self.host

    # skipTest() raises on its own; no explicit ``raise`` is needed.
    self.skipTest("No hosts found, skipping HA for Host test")
def getHaProvider(self, host):
cmd = listHostHAProviders.listHostHAProvidersCmd()
cmd.hypervisor = host.hypervisor
response = self.apiclient.listHostHAProviders(cmd)
return response[0].haprovider
def configureHaProvider(self):
cmd = configureHAForHost.configureHAForHostCmd()
cmd.hostid = self.getHost().id
cmd.provider = self.getHaProvider(self.getHost())
return self.apiclient.configureHAForHost(cmd)
def getHaForHostEnableCmd(self):
cmd = enableHAForHost.enableHAForHostCmd()
cmd.hostid = self.getHost().id
return cmd
def getHaForHostDisableCmd(self):
cmd = disableHAForHost.disableHAForHostCmd()
cmd.hostid = self.getHost().id
return cmd
def getListHostHAResources(self):
cmd = listHostHAResources.listHostHAResourcesCmd()
cmd.hostid = self.getHost().id
return cmd
@attr(tags=["advanced",
"advancedns",
"smoke",
"basic",
"sg"],
required_hardware="false")
def test_enable_ha_for_host(self):
"""
This test enables HA for a host
"""
self.configureHaProvider()
cmd = self.getHaForHostEnableCmd()
response = self.apiclient.enableHAForHost(cmd)
self.assertEqual(response.hostid, cmd.hostid)
self.assertEqual(response.haenable, True)
@attr(tags=["advanced",
"advancedns",
"smoke",
"basic",
"sg"],
required_hardware="false")
def test_enable_ha_for_host_invalid(self):
"""
This is a negative test for enable HA for a host
"""
self.configureHaProvider()
cmd = self.getHaForHostEnableCmd()
cmd.hostid = -1
try:
response = self.apiclient.enableHAForHost(cmd)
except Exception:
pass
else:
self.fail("Expected an exception to be thrown, failing")
@attr(tags=["advanced",
            "advancedns",
            "smoke",
            "basic",
            "sg"],
      required_hardware="false")
def test_disable_ha_for_host(self):
    """
    This test disables HA for a host
    """
    self.configureHaProvider()
    cmd = self.getHaForHostDisableCmd()

    response = self.apiclient.disableHAForHost(cmd)
    # assertEqual actually compares the ids; assertTrue(a, b) would only
    # check that ``a`` is truthy and use ``b`` as the failure message.
    self.assertEqual(response.hostid, cmd.hostid)
    self.assertEqual(response.haenable, False)

    # The host listing must also report the HA state as Disabled.
    response = self.getHost(cmd.hostid)
    self.assertEqual(response.hostha.hastate, "Disabled")
@attr(tags=["advanced",
"advancedns",
"smoke",
"basic",
"sg"],
required_hardware="false")
def test_disable_ha_for_host_invalid(self):
"""
This is a negative test for disable HA for a host
"""
self.configureHaProvider()
cmd = self.getHaForHostDisableCmd()
cmd.hostid = -1
try:
response = self.apiclient.disableHAForHost(cmd)
except Exception:
pass
else:
self.fail("Expected an exception to be thrown, failing")
@attr(tags=["advanced",
"advancedns",
"smoke",
"basic",
"sg"],
required_hardware="false")
def test_list_ha_for_host(self):
"""
Test that verifies the listHAForHost API
"""
self.configureHaProvider()
db_count = self.dbclient.execute("SELECT count(*) FROM cloud.ha_config")
cmd = self.getListHostHAResources()
del cmd.hostid
response = self.apiclient.listHostHAResources(cmd)
self.assertEqual(db_count[0][0], len(response))
@attr(tags=["advanced",
"advancedns",
"smoke",
"basic",
"sg"],
required_hardware="false")
def test_list_ha_for_host_valid(self):
"""
Valid test for listing a specific host HA resources
"""
self.configureHaProvider()
cmd = self.getListHostHAResources()
response = self.apiclient.listHostHAResources(cmd)
self.assertEqual(response[0].hostid, cmd.hostid)
@attr(tags=["advanced",
"advancedns",
"smoke",
"basic",
"sg"],
required_hardware="false")
def test_list_ha_for_host_invalid(self):
"""
Test that listHostHAResources is returning exception when called with invalid data
"""
self.configureHaProvider()
cmd = self.getListHostHAResources()
cmd.hostid = "someinvalidvalue"
try:
response = self.apiclient.listHostHAResources(cmd)
except Exception:
pass
else:
self.fail("Expected an exception to be thrown, failing")

View File

@ -1,535 +0,0 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from marvin.cloudstackTestCase import *
from marvin.lib.utils import *
from marvin.lib.base import *
from marvin.lib.common import *
from nose.plugins.attrib import attr
from ipmisim.ipmisim import IpmiServerContext, IpmiServer, ThreadedIpmiServer
import random
import socket
import thread
class TestHaKVMAgent(cloudstackTestCase):
""" Test cases for out of band management
"""
def setUp(self):
testClient = super(TestHaKVMAgent, self).getClsTestClient()
self.apiClient = testClient.getApiClient()
self.dbclient = testClient.getDbConnection()
self.services = testClient.getParsedTestDataConfig()
self.zone = get_zone(self.apiClient, testClient.getZoneForTests())
self.host = self.getHost()
self.cluster_id = self.host.clusterid
self.server = None
self.hypervisor = self.testClient.getHypervisorInfo()
self.mgtSvrDetails = self.config.__dict__["mgtSvr"][0].__dict__
self.hostConfig = self.config.__dict__["zones"][0].__dict__["pods"][0].__dict__["clusters"][0].__dict__["hosts"][0].__dict__
self.fakeMsId = random.randint(10000, 99999) * random.randint(10, 20)
# Cleanup any existing configs
self.dbclient.execute("delete from ha_config where resource_type='Host'")
# use random port for ipmisim
s = socket.socket()
s.bind(('', 0))
self.serverPort = s.getsockname()[1]
s.close()
# Set Cluster-level setting in order to run tests faster
self.update_configuration("kvm.ha.activity.check.failure.ratio", "0.7")
self.update_configuration("kvm.ha.activity.check.interval", "10")
self.update_configuration("kvm.ha.activity.check.max.attempts", "5")
self.update_configuration("kvm.ha.activity.check.timeout", "60")
self.update_configuration("kvm.ha.degraded.max.period", "30")
self.update_configuration("kvm.ha.fence.timeout", "60")
self.update_configuration("kvm.ha.health.check.timeout", "10")
self.update_configuration("kvm.ha.recover.failure.threshold", "1")
self.update_configuration("kvm.ha.recover.timeout", "120")
self.update_configuration("kvm.ha.recover.wait.period", "60")
self.service_offering = ServiceOffering.create(
self.apiClient,
self.services["service_offerings"]
)
self.template = get_template(
self.apiClient,
self.zone.id,
self.services["ostype"]
)
self.cleanup = [self.service_offering]
def tearDown(self):
    """Remove the DB rows and resources created by a test and stop ipmisim.

    Any failure is re-raised as a generic Exception carrying the original
    message. The ipmisim server, when one was started, is shut down even if
    a cleanup statement fails, so its port and thread are not leaked into
    the next test.
    """
    try:
        try:
            run_id = self.getFakeMsRunId()
            # Each statement is issued once; the deletes are idempotent, so
            # repeating the mshost/mshost_peer ones adds nothing.
            self.dbclient.execute("delete from mshost_peer where peer_runid=%s" % run_id)
            self.dbclient.execute("delete from mshost where runid=%s" % run_id)
            self.dbclient.execute("delete from cluster_details where name='resourceHAEnabled'")
            self.dbclient.execute("delete from data_center_details where name='resourceHAEnabled'")
            self.dbclient.execute("delete from ha_config where resource_type='Host'")
            self.dbclient.execute("delete from oobm where port=%d" % self.getIpmiServerPort())
            self.dbclient.execute("delete from cluster_details where name='outOfBandManagementEnabled'")
            self.dbclient.execute("delete from data_center_details where name='outOfBandManagementEnabled'")
            cleanup_resources(self.apiClient, self.cleanup)
        finally:
            # Always release the simulator, even when cleanup above raised.
            if self.server:
                self.server.shutdown()
                self.server.server_close()
    except Exception as e:
        raise Exception("Warning: Exception during cleanup : %s" % e)
def getFakeMsId(self):
return self.fakeMsId
def getFakeMsRunId(self):
return self.fakeMsId * 1000
def getHostHaConfigCmd(self, provider='kvmhaprovider'):
cmd = configureHAForHost.configureHAForHostCmd()
cmd.provider = provider
cmd.hostid = self.host.id
return cmd
def getHostHaEnableCmd(self):
cmd = enableHAForHost.enableHAForHostCmd()
cmd.hostid = self.host.id
return cmd
def getHost(self, hostId=None):
    """Fetch a routing host of the test zone and store it on ``self.host``.

    The hosts are listed on every call (filtered by ``hostId`` when given);
    the first entry is stored and returned. The test is skipped when no
    host is found.
    """
    response = list_hosts(
        self.apiClient,
        zoneid=self.zone.id,
        type='Routing',
        id=hostId
    )
    # Truthiness covers both an empty list and a None response, so a missing
    # host ends in a skip instead of a TypeError from len(None).
    if response:
        self.host = response[0]
        return self.host

    # skipTest() raises on its own; no explicit ``raise`` is needed.
    self.skipTest("No hosts found, skipping out-of-band management test")
def getIpmiServerIp(self):
    """Return the local IP address used to reach the management server.

    A datagram socket is connected to the configured management server
    address and the local end of that socket is read back. The socket is
    always closed afterwards so the descriptor is not leaked.
    """
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        s.connect((self.mgtSvrDetails["mgtSvrIp"], self.mgtSvrDetails["port"]))
        return s.getsockname()[0]
    finally:
        s.close()
def getIpmiServerPort(self):
return self.serverPort
def getOobmConfigCmd(self):
cmd = configureOutOfBandManagement.configureOutOfBandManagementCmd()
cmd.driver = 'ipmitool' # The default available driver
cmd.address = self.getIpmiServerIp()
cmd.port = self.getIpmiServerPort()
cmd.username = 'admin'
cmd.password = 'password'
cmd.hostid = self.host.id
return cmd
def getOobmEnableCmd(self):
cmd = enableOutOfBandManagementForHost.enableOutOfBandManagementForHostCmd()
cmd.hostid = self.host.id
return cmd
def getOobmDisableCmd(self):
cmd = disableOutOfBandManagementForHost.disableOutOfBandManagementForHostCmd()
cmd.hostid = self.host.id
return cmd
def getOobmIssueActionCmd(self):
cmd = issueOutOfBandManagementPowerAction.issueOutOfBandManagementPowerActionCmd()
cmd.hostid = self.host.id
cmd.action = 'STATUS'
return cmd
def issue_power_action_cmd(self, action, timeout=None):
    """Issue an out-of-band management power action for the test host.

    :param action: power action to set on the command (e.g. 'ON', 'STATUS')
    :param timeout: optional timeout set on the command when truthy
    :return: the API response of issueOutOfBandManagementPowerAction

    The test is skipped when the API error carries the known ipmitool
    session-id message; every other error is re-raised unchanged.
    """
    cmd = self.getOobmIssueActionCmd()
    cmd.action = action
    if timeout:
        cmd.timeout = timeout

    try:
        return self.apiClient.issueOutOfBandManagementPowerAction(cmd)
    except Exception as e:
        if "packet session id 0x0 does not match active session" in str(e):
            # skipTest() raises on its own; no explicit ``raise`` is needed.
            self.skipTest("Known ipmitool issue hit, skipping test")
        # Bare raise keeps the original traceback intact.
        raise
def configure_and_enable_oobm(self):
self.apiClient.configureOutOfBandManagement(self.getOobmConfigCmd())
response = self.apiClient.enableOutOfBandManagementForHost(self.getOobmEnableCmd())
self.assertEqual(response.enabled, True)
def start_ipmi_server(self):
def startIpmiServer(tname, server):
self.debug("Starting ipmisim server")
try:
server.serve_forever()
except Exception: pass
IpmiServerContext('reset')
ThreadedIpmiServer.allow_reuse_address = False
server = ThreadedIpmiServer(('0.0.0.0', self.getIpmiServerPort()), IpmiServer)
thread.start_new_thread(startIpmiServer, ("ipmi-server", server,))
self.server = server
def checkSyncToState(self, state, interval):
def checkForStateSync(expectedState):
response = self.getHost(hostId=self.host.id).outofbandmanagement
return response.powerstate == expectedState, None
sync_interval = 1 + int(interval)/1000
res, _ = wait_until(sync_interval, 10, checkForStateSync, state)
if not res:
self.fail("Failed to get host.powerstate synced to expected state:" + state)
response = self.getHost(hostId=self.host.id).outofbandmanagement
self.assertEqual(response.powerstate, state)
def get_host_in_available_state(self):
    """Bring the test host to the ``Available`` HA state.

    Configures out-of-band management and starts ipmisim, powers the host
    on, configures and enables host HA, waits for the transition and
    finally asserts that the reported HA state is ``Available``.
    """
    self.configure_and_start_ipmi_server()
    self.assert_issue_command_state('ON', 'On')
    self.configureAndEnableHostHa()
    self.check_host_transition_to_available()

    response = self.getHost()
    # Compare by value: ``is not`` on a string literal tests object identity
    # and can be true even when the state text is "Available".
    if response.hostha.hastate != "Available":
        # Dump the host for diagnosis before the assertion fails.
        print(response)
    self.assertEqual(response.hostha.hastate, "Available")
def configureAndEnableHostHa(self):
self.apiClient.configureHAForHost(self.getHostHaConfigCmd())
response = self.apiClient.enableHAForHost(self.getHostHaEnableCmd())
self.assertEqual(response.haenable, True)
def configure_and_start_ipmi_server(self, power_state=None):
"""
Setup ipmisim and enable out-of-band management for host
"""
self.configure_and_enable_oobm()
self.start_ipmi_server()
if power_state:
bmc = IpmiServerContext().bmc
bmc.powerstate = power_state
def assert_issue_command_state(self, command, expected):
"""
Asserts power action result for a given power command
"""
if command != 'STATUS':
self.issue_power_action_cmd(command)
response = self.issue_power_action_cmd('STATUS')
self.assertEqual(response.powerstate, expected)
def kill_agent(self):
    """Kill the cloudstack-agent process on the test host over SSH.

    The SSH command is retried for up to 90 seconds; the test fails when
    no attempt succeeds within that window.
    """
    t_end = time.time() + 90
    while time.time() < t_end:
        try:
            ssh = SshClient(self.host.ipaddress, port=22, user=self.hostConfig["username"],
                            passwd=self.hostConfig["password"])
            ssh.execute("kill $(ps aux | grep 'cloudstack-agent' | awk '{print $2}')")
            return
        except Exception:
            print("Cannot ssh into: " + self.host.ipaddress)
            # Back off briefly instead of retrying in a tight loop.
            time.sleep(1)
    self.fail("Could not kill cloudstack-agent on %s over SSH within 90 seconds" % self.host.ipaddress)
def set_host_to_alert(self):
self.dbclient.execute("update host set host.status = 'Alert' where host.uuid = '%s'" % self.host.id)
def check_host_transitioned_to_degraded(self):
    """Poll the host until its HA state is ``Degraded``; fail after 120 seconds."""
    t_end = time.time() + 120
    while time.time() < t_end:
        host = self.getHost()
        # Exact comparison: a substring test (``in "Degraded"``) would also
        # accept an empty or partial state string.
        if host.hostha.hastate == "Degraded":
            return
        # Pause between polls instead of hammering the API in a tight loop.
        time.sleep(1)
    self.fail("Host did not reach HA state 'Degraded' within 120 seconds")
def wait_util_host_is_fencing(self):
    """Poll the host until its HA state is ``Fencing``; fail after 120 seconds."""
    t_end = time.time() + 120
    while time.time() < t_end:
        host = self.getHost()
        # Exact comparison: a substring test (``in "Fencing"``) would also
        # accept an empty or partial state string.
        if host.hostha.hastate == "Fencing":
            return
        # Pause between polls instead of hammering the API in a tight loop.
        time.sleep(1)
    self.fail("Host did not reach HA state 'Fencing' within 120 seconds")
def check_host_transitioned_to_suspect(self):
    """Poll the host until its HA state is ``Suspect``; fail after 120 seconds."""
    t_end = time.time() + 120
    while time.time() < t_end:
        host = self.getHost()
        # Exact comparison: a substring test (``in "Suspect"``) would also
        # accept an empty or partial state string.
        if host.hostha.hastate == "Suspect":
            return
        # Pause between polls instead of hammering the API in a tight loop.
        time.sleep(1)
    self.fail("Host did not reach HA state 'Suspect' within 120 seconds")
def check_host_transitioned_to_checking(self):
    """Poll the host until its HA state is ``Checking``; fail after 120 seconds."""
    t_end = time.time() + 120
    while time.time() < t_end:
        host = self.getHost()
        # Exact comparison: a substring test (``in "Checking"``) would also
        # accept an empty or partial state string.
        if host.hostha.hastate == "Checking":
            return
        # Pause between polls instead of hammering the API in a tight loop.
        time.sleep(1)
    self.fail("Host did not reach HA state 'Checking' within 120 seconds")
def wait_util_host_is_fenced(self):
    """Poll the host until its HA state is ``Fenced``; fail after 120 seconds."""
    t_end = time.time() + 120
    while time.time() < t_end:
        host = self.getHost()
        # Exact comparison: a substring test (``in "Fenced"``) would also
        # accept an empty or partial state string.
        if host.hostha.hastate == "Fenced":
            return
        # Pause between polls instead of hammering the API in a tight loop.
        time.sleep(1)
    self.fail("Host did not reach HA state 'Fenced' within 120 seconds")
def wait_util_host_is_up(self):
    """Poll the host until its state is ``Up``; fail after 120 seconds."""
    t_end = time.time() + 120
    while time.time() < t_end:
        host = self.getHost()
        # Exact comparison: a substring test (``in "Up"``) would also accept
        # an empty or partial state string.
        if host.state == "Up":
            return
        # Pause between polls instead of hammering the API in a tight loop.
        time.sleep(1)
    self.fail("Host did not reach state 'Up' within 120 seconds")
def stop_agent(self):
SshClient(self.host.ipaddress, port=22, user=self.hostConfig["username"], passwd=self.hostConfig["password"]).execute\
("service cloudstack-agent stop")
def start_agent(self):
self.ssh_and_restart_agent()
self.check_host_transition_to_available()
def ssh_and_restart_agent(self):
    """Restart the cloudstack-agent service on the test host over SSH.

    The SSH command is retried for up to 90 seconds; the test fails when
    no attempt succeeds within that window.
    """
    t_end = time.time() + 90
    while time.time() < t_end:
        try:
            ssh = SshClient(self.host.ipaddress, port=22, user=self.hostConfig["username"],
                            passwd=self.hostConfig["password"])
            ssh.execute("service cloudstack-agent restart")
            return
        except Exception:
            print("Cannot ssh into: " + self.host.ipaddress)
            # Back off briefly instead of retrying in a tight loop.
            time.sleep(1)
    self.fail("Could not restart cloudstack-agent on %s over SSH within 90 seconds" % self.host.ipaddress)
def check_host_transition_to_available(self):
    """Poll the host until its HA state is ``Available``; fail after 90 seconds."""
    t_end = time.time() + 90
    while time.time() < t_end:
        host = self.getHost()
        if host.hostha.hastate == "Available":
            return
        # Pause between polls instead of hammering the API in a tight loop.
        time.sleep(1)
    self.fail("Host did not reach HA state 'Available' within 90 seconds")
def wait_util_host_is_recovered(self):
    """Poll the host until its HA state is ``Recovered``; fail after 180 seconds."""
    t_end = time.time() + 180
    while time.time() < t_end:
        host = self.getHost()
        # Exact comparison: a substring test (``in "Recovered"``) would also
        # accept an empty or partial state string.
        if host.hostha.hastate == "Recovered":
            return
        # Pause between polls instead of hammering the API in a tight loop.
        time.sleep(1)
    self.fail("Host did not reach HA state 'Recovered' within 180 seconds")
def reset_host(self):
SshClient(self.host.ipaddress, port=22, user=self.hostConfig["username"],
passwd=self.hostConfig["password"]).execute \
("reboot")
def deploy_vm(self):
vm = VirtualMachine.create(
self.apiClient,
services=self.services["virtual_machine"],
serviceofferingid=self.service_offering.id,
templateid=self.template.id,
zoneid=self.zone.id,
hostid = self.host.id,
method="POST"
)
self.cleanup.append(vm)
def update_configuration(self, name, value):
update_configuration_cmd = updateConfiguration.updateConfigurationCmd()
update_configuration_cmd.name = name
update_configuration_cmd.value = value
update_configuration_cmd.clusterid = self.cluster_id
self.apiClient.updateConfiguration(update_configuration_cmd)
@attr(tags = ["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
def test_ha_stop_agent_host_is_degraded(self):
"""
Tests HA state turns Degraded when agent is stopped
"""
self.deploy_vm()
# Configure and Enable OOBM, Set HA Provider and Enable HA. At the end checks if HA State is Available
self.get_host_in_available_state()
# SSH into the KVM Host and executes kill -9 of the agent
self.stop_agent()
# Checks if the host would turn into Degraded in the next 120 seconds
try:
self.check_host_transitioned_to_degraded()
except Exception as e:
self.start_agent()
raise Exception("Warning: Exception during test execution : %s" % e)
# Enable Host
self.start_agent()
#@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
def test_ha_recovering_start_agent_host_is_available(self):
"""
Tests HA state turns Recovered when agent is stopped and host is reset
"""
# Configure and Enable OOBM, Set HA Provider and Enable HA. At the end checks if HA State is Available
# Then kills the agent and wait untill the state is Degraded
self.deploy_vm()
# Configure and Enable OOBM, Set HA Provider and Enable HA. At the end checks if HA State is Available
self.get_host_in_available_state()
# SSH into the KVM Host and executes kill -9 of the agent
self.kill_agent()
# Checks if the host would turn into Degraded in the next 120 seconds
try:
self.check_host_transitioned_to_degraded()
except Exception as e:
self.start_agent()
raise Exception("Warning: Exception during test execution : %s" % e)
# Reset host so a shut down could be emulated. During the bootup host should transition into recovered state
self.reset_host()
# Waits until Degraded host turns into Recovered for 180 seconds,
# if it fails it tries to revert host back to Available
try:
self.wait_util_host_is_recovered()
except Exception as e:
self.start_agent()
raise Exception("Warning: Exception during test execution : %s" % e)
# SSH into the KVM Host and executes service cloudstack-agent restart of the agent
self.start_agent()
#@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
def test_ha_fencing_host(self):
"""
Tests HA state turns Recovered when agent is stopped and host is reset,
then configure incorrect OOBM configuration, so that Recover command would fail
and host would transition into Fenced state.
"""
self.deploy_vm()
# Configure and Enable OOBM, Set HA Provider and Enable HA. At the end checks if HA State is Available
self.get_host_in_available_state()
# SSH into the KVM Host and executes kill -9 of the agent
self.kill_agent()
# Checks if the host would turn into Degraded in the next 120 seconds
try:
self.check_host_transitioned_to_degraded()
except Exception as e:
self.start_agent()
raise Exception("Warning: Exception during test execution : %s" % e)
# Change OOBM Configuration to invalid so it would fail the recover operations.
cmd = self.getOobmConfigCmd()
cmd.address = "1.1.1.1"
self.apiClient.configureOutOfBandManagement(cmd)
# Reset host so a shut down could be emulated. During the bootup host should transition into recovered state
self.reset_host()
self.kill_agent()
# Waits until Recovering host turns into Fencing for 180 seconds,
# if it fails it tries to revert host back to Up
try:
self.wait_util_host_is_fencing()
except Exception as e:
self.ssh_and_restart_agent()
raise Exception("Warning: Exception during test execution : %s" % e)
# Configure correct OOBM configuration so that the Fencing operation would succeed
self.apiClient.configureOutOfBandManagement(self.getOobmConfigCmd())
# Waits until Fencing host turns into Fenced for 180 seconds,
# if it fails it tries to revert host back to Up
try:
self.wait_util_host_is_fenced()
except Exception as e:
self.ssh_and_restart_agent()
raise Exception("Warning: Exception during test execution : %s" % e)
# SSH into the KVM Host and executes service cloudstack-agent restart of the agent
self.ssh_and_restart_agent()
# Waits until state is Up so that cleanup would be successful
self.wait_util_host_is_up()
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
def test_ha_kill_agent_host_is_degraded(self):
"""
Tests HA state turns Suspect/Checking when some activity/health checks fail
Configures HA, Logs into to a host and restarts the service
Then it confirms the ha state jumps through Suspect -> Checking -> Available
"""
# Configure and Enable OOBM, Set HA Provider and Enable HA. At the end checks if HA State is Available
self.get_host_in_available_state()
# SSH into the KVM Host and executes kill -9 of the agent
self.ssh_and_restart_agent()
# Checks if the host would turn into Suspect in the next 120 seconds
try:
self.check_host_transitioned_to_suspect()
except Exception as e:
self.start_agent()
raise Exception("Warning: Exception during test execution : %s" % e)
# Checks if the host would turn into Degraded in the next 120 seconds
try:
self.check_host_transitioned_to_checking()
except Exception as e:
self.start_agent()
raise Exception("Warning: Exception during test execution : %s" % e)
# Enable Host
self.check_host_transition_to_available()

View File

@ -39,25 +39,62 @@ class TestHAKVM(cloudstackTestCase):
"""
def setUp(self):
self.testClient = super(TestHAKVM, self).getClsTestClient()
self.apiclient = self.testClient.getApiClient()
self.hypervisor = self.testClient.getHypervisorInfo()
self.dbclient = self.testClient.getDbConnection()
self.services = self.testClient.getParsedTestDataConfig()
self.logger = logging.getLogger('TestHAKVM')
#Get Zone specifics
self.zone = get_zone(self.apiclient, self.testClient.getZoneForTests())
self.hypervisor = self.testClient.getHypervisorInfo()
self.host = self.getHost()
self.hostConfig = self.config.__dict__["zones"][0].__dict__["pods"][0].__dict__["clusters"][0].__dict__["hosts"][0].__dict__
self.mgtSvrDetails = self.config.__dict__["mgtSvr"][0].__dict__
self.fakeMsId = random.randint(10000, 99999) * random.randint(10, 20)
self.cluster_id = self.host.clusterid
# Cleanup any existing configs
self.dbclient.execute("delete from ha_config where resource_type='Host'")
self.host = self.getHost()
# use random port for ipmisim
self.fakeMsId = random.randint(10000, 99999) * random.randint(10, 20)
s = socket.socket()
s.bind(('', 0))
self.serverPort = s.getsockname()[1]
s.close()
self.cleanup = []
# Set Cluster-level setting in order to run tests faster
self.updateConfiguration("kvm.ha.activity.check.failure.ratio", "0.6")
self.updateConfiguration("kvm.ha.activity.check.interval", "8")
self.updateConfiguration("kvm.ha.activity.check.max.attempts", "5")
self.updateConfiguration("kvm.ha.activity.check.timeout", "30")
self.updateConfiguration("kvm.ha.degraded.max.period", "30")
self.updateConfiguration("kvm.ha.fence.timeout", "30")
self.updateConfiguration("kvm.ha.health.check.timeout", "30")
self.updateConfiguration("kvm.ha.recover.failure.threshold", "2")
self.updateConfiguration("kvm.ha.recover.timeout", "30")
self.updateConfiguration("kvm.ha.recover.wait.period", "30")
self.service_offering = ServiceOffering.create(
self.apiclient,
self.services["service_offerings"]["hasmall"]
)
self.template = get_template(
self.apiclient,
self.zone.id,
self.services["ostype"]
)
self.configureAndDisableHostHa()
self.cleanup = [self.service_offering]
def updateConfiguration(self, name, value):
cmd = updateConfiguration.updateConfigurationCmd()
cmd.name = name
cmd.value = value
cmd.clusterid = self.cluster_id
self.apiclient.updateConfiguration(cmd)
def getFakeMsId(self):
return self.fakeMsId
@ -66,6 +103,8 @@ class TestHAKVM(cloudstackTestCase):
return self.fakeMsId * 1000
def tearDown(self):
self.configureAndDisableHostHa()
self.host = None
try:
self.dbclient.execute("delete from mshost_peer where peer_runid=%s" % self.getFakeMsRunId())
self.dbclient.execute("delete from mshost where runid=%s" % self.getFakeMsRunId())
@ -83,70 +122,43 @@ class TestHAKVM(cloudstackTestCase):
def getHostHaEnableCmd(self):
cmd = enableHAForHost.enableHAForHostCmd()
cmd.hostid = self.getHost().id
cmd.hostid = self.host.id
return cmd
def check_host_transition_to_available(self):
t_end = time.time() + 90
while time.time() < t_end:
host = self.getHost()
if host.hostha.hastate == "Available":
return
else:
continue
self.fail(self)
def getHost(self):
response = list_hosts(
self.apiclient,
type='Routing',
resourcestate='Enabled'
)
if response and len(response) > 0:
self.host = response[0]
return self.host
raise self.skipTest("No KVM hosts found, skipping host-ha test")
def getHost(self, hostId=None):
response = list_hosts(
self.apiclient,
type='Routing',
hypervisor='kvm',
id=hostId
)
# Check if more than one kvm hosts are available in order to successfully configure host-ha
if response and len(response) > 0:
self.host = response[0]
return self.host
raise self.skipTest("No KVM hosts found, skipping host-ha test")
raise self.skipTest("Not enough KVM hosts found, skipping host-ha test")
def getHostHaConfigCmd(self, provider='kvmhaprovider'):
cmd = configureHAForHost.configureHAForHostCmd()
cmd.provider = provider
cmd.hostid = self.getHost().id
return cmd
def getHostHaEnableCmd(self):
cmd = enableHAForHost.enableHAForHostCmd()
cmd.hostid = self.getHost().id
cmd.hostid = self.host.id
return cmd
def getHostHaDisableCmd(self):
cmd = disableHAForHost.disableHAForHostCmd()
cmd.hostid = self.getHost().id
cmd.hostid = self.host.id
return cmd
def configureAndEnableHostHa(self, initialize=True):
def configureAndEnableHostHa(self):
#Adding sleep between configuring and enabling
self.apiclient.configureHAForHost(self.getHostHaConfigCmd())
response = self.apiclient.enableHAForHost(self.getHostHaEnableCmd())
self.assertEqual(response.haenable, True)
if initialize:
self.configureKVMHAProviderState(True, True, True, False)
def configureAndDisableHostHa(self, hostId):
def configureAndDisableHostHa(self):
self.apiclient.configureHAForHost(self.getHostHaConfigCmd())
cmd = self.getHostHaDisableCmd()
cmd.hostid = hostId
cmd.hostid = self.host.id
response = self.apiclient.disableHAForHost(cmd)
self.assertEqual(response.hostid, cmd.hostid)
self.assertEqual(response.haenable, False)
@ -159,301 +171,95 @@ class TestHAKVM(cloudstackTestCase):
self.assertEqual(response.haenable, True)
return response
def configureKVMHAProviderState(self, health, activity, recover, fence):
cmd = configureHAForHost.configureHAForHostCmd()
cmd.hostid = self.getHost().id
cmd.health = health
cmd.activity = activity
cmd.recover = recover
cmd.fence = fence
response = self.apiclient.configureKVMHAProviderState(cmd)
self.assertEqual(response.success, 'true')
def disableAgent(self):
SshClient(self.host.ipaddress, port=22, user=self.hostConfig["username"], passwd=self.hostConfig["password"]).execute\
("systemctl disable cloudstack-agent || chkconfig cloudstack-agent off")
def checkSyncToState(self, state, interval=5000):
def checkForStateSync(expectedState):
response = self.getHost(hostId=self.getHost().id).hostha
return response.hastate == expectedState, None
def resetHost(self):
SshClient(self.host.ipaddress, port=22, user=self.hostConfig["username"],
passwd=self.hostConfig["password"]).execute \
("reboot")
sync_interval = 1 + int(interval) / 1000
res, _ = wait_until(sync_interval, 10, checkForStateSync, state)
def enableAgent(self):
SshClient(self.host.ipaddress, port=22, user=self.hostConfig["username"], passwd=self.hostConfig["password"]).execute\
("systemctl enable cloudstack-agent || chkconfig cloudstack-agent on")
def waitUntilHostInState(self, state="Available", interval=3):
def checkForState(expectedState):
response = self.getHost(self.host.id)
return response.hostha.hastate == expectedState, None
res, _ = wait_until(interval, 200, checkForState, state)
if not res:
self.fail("Failed to get host.hastate synced to expected state:" + state)
response = self.getHost(hostId=self.getHost().id).hostha
self.assertEqual(response.hastate, state)
self.fail("Failed to see host ha state in :" + state)
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
def test_hostha_configure_invalid_provider(self):
"""
Tests host-ha configuration with invalid driver
"""
cmd = self.getHostHaConfigCmd()
cmd.provider = 'randomDriverThatDoesNotExist'
try:
response = self.apiclient.configureHAForHost(cmd)
except Exception:
pass
else:
self.fail("Expected an exception to be thrown, failing")
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
def test_hostha_configure_default_driver(self):
"""
Tests host-ha configuration with valid data
"""
cmd = self.getHostHaConfigCmd()
response = self.apiclient.configureHAForHost(cmd)
self.assertEqual(response.hostid, cmd.hostid)
self.assertEqual(response.haprovider, cmd.provider.lower())
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
def test_ha_enable_feature_invalid(self):
"""
Tests ha feature enable command with invalid options
"""
cmd = self.getHostHaEnableCmd()
cmd.hostid = -1
try:
response = self.apiclient.enableHAForHost(cmd)
except Exception:
pass
else:
self.fail("Expected an exception to be thrown, failing")
try:
cmd = enableHAForCluster.enableHAForClusterCmd()
response = self.apiclient.enableHAForCluster(cmd)
except Exception:
pass
else:
self.fail("Expected an exception to be thrown, failing")
try:
cmd = enableHAForZone.enableHAForZoneCmd()
response = self.apiclient.enableHAForZone(cmd)
except Exception:
pass
else:
self.fail("Expected an exception to be thrown, failing")
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
def test_ha_disable_feature_invalid(self):
"""
Tests ha feature disable command with invalid options
"""
cmd = self.getHostHaDisableCmd()
cmd.hostid = -1
def deployVM(self):
try:
response = self.apiclient.disableHAForHost(cmd)
except Exception:
pass
else:
self.fail("Expected an exception to be thrown, failing")
try:
cmd = disableHAForCluster.disableHAForClusterCmd()
response = self.apiclient.disableHAForCluster(cmd)
except Exception:
pass
else:
self.fail("Expected an exception to be thrown, failing")
try:
cmd = disableHAForZone.disableHAForZoneCmd()
response = self.apiclient.disableHAForZone(cmd)
except Exception:
pass
else:
self.fail("Expected an exception to be thrown, failing")
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
def test_hostha_enable_feature_valid(self):
"""
Tests host-ha enable feature with valid options
"""
self.apiclient.configureHAForHost(self.getHostHaConfigCmd())
cmd = self.getHostHaEnableCmd()
response = self.apiclient.enableHAForHost(cmd)
self.assertEqual(response.hostid, cmd.hostid)
self.assertEqual(response.haenable, True)
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
def test_hostha_disable_feature_valid(self):
"""
Tests host-ha disable feature with valid options
"""
self.apiclient.configureHAForHost(self.getHostHaConfigCmd())
cmd = self.getHostHaDisableCmd()
response = self.apiclient.disableHAForHost(cmd)
self.assertEqual(response.hostid, cmd.hostid)
self.assertEqual(response.haenable, False)
response = self.getHost(hostId=cmd.hostid).hostha
self.assertEqual(response.hastate, 'Disabled')
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
def test_configure_ha_provider_invalid(self):
"""
Tests configure HA Provider with invalid provider options
"""
# Enable ha for host
self.apiclient.configureHAForHost(self.getHostHaConfigCmd())
cmd = self.getHostHaEnableCmd()
response = self.apiclient.enableHAForHost(cmd)
self.assertEqual(response.hostid, cmd.hostid)
self.assertEqual(response.haenable, True)
host = self.getHost(response.hostid)
# Setup wrong configuration for the host
conf_ha_cmd = configureHAForHost.configureHAForHostCmd()
if host.hypervisor.lower() in "simulator":
conf_ha_cmd.provider = "kvmhaprovider"
if host.hypervisor.lower() in "kvm":
conf_ha_cmd.provider = "simulatorhaprovider"
conf_ha_cmd.hostid = cmd.hostid
# Call the configure HA provider API with not supported provider for HA
try:
self.apiclient.configureHAForHost(conf_ha_cmd)
except Exception:
pass
else:
self.fail("Expected an exception to be thrown, failing")
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
def test_configure_ha_provider_valid(self):
"""
Tests configure HA Provider with valid provider options
"""
# Enable ha for host
self.apiclient.configureHAForHost(self.getHostHaConfigCmd())
cmd = self.getHostHaEnableCmd()
response = self.apiclient.enableHAForHost(cmd)
self.assertEqual(response.hostid, cmd.hostid)
self.assertEqual(response.haenable, True)
host = self.getHost(response.hostid)
# Setup configuration for the host
conf_ha_cmd = configureHAForHost.configureHAForHostCmd()
if host.hypervisor.lower() in "kvm":
conf_ha_cmd.provider = "kvmhaprovider"
if host.hypervisor.lower() in "simulator":
conf_ha_cmd.provider = "simulatorhaprovider"
conf_ha_cmd.hostid = cmd.hostid
# Call the configure HA provider API with not supported provider for HA
response = self.apiclient.configureHAForHost(conf_ha_cmd)
# Check the response contains the set provider and hostID
self.assertEqual(response.haprovider, conf_ha_cmd.provider)
self.assertEqual(response.hostid, conf_ha_cmd.hostid)
vm = VirtualMachine.create(
self.apiclient,
services=self.services["virtual_machine"],
serviceofferingid=self.service_offering.id,
templateid=self.template.id,
zoneid=self.zone.id,
hostid = self.host.id,
method="POST"
)
self.cleanup.append(vm)
except Exception as e:
raise self.skipTest("Failed to deploy VM, skipping kvm host-ha test case")
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
def test_disable_oobm_ha_state_ineligible(self):
"""
Tests that when HA is enabled for a host, if oobm is disabled HA State should turn into Ineligible
"""
self.logger.debug("Starting test_disable_oobm_ha_state_ineligible")
# Enable ha for host
self.apiclient.configureHAForHost(self.getHostHaConfigCmd())
cmd = self.getHostHaEnableCmd()
response = self.apiclient.enableHAForHost(cmd)
self.assertEqual(response.hostid, cmd.hostid)
self.assertEqual(response.haenable, True)
self.configureAndEnableHostHa()
# Disable OOBM
self.apiclient.configureOutOfBandManagement(self.getOobmConfigCmd())
oobm_cmd = self.getOobmDisableCmd()
oobm_cmd.hostid = cmd.hostid
oobm_cmd.hostid = self.host.id
response = self.apiclient.disableOutOfBandManagementForHost(oobm_cmd)
self.assertEqual(response.hostid, oobm_cmd.hostid)
self.assertEqual(response.enabled, False)
response = self.getHost(hostId=cmd.hostid).outofbandmanagement
response = self.getHost(hostId=self.host.id).outofbandmanagement
self.assertEqual(response.powerstate, 'Disabled')
# Verify HA State is Ineligeble
response = self.getHost(hostId=cmd.hostid).hostha
self.assertEqual(response.hastate, "Ineligible")
self.waitUntilHostInState("Ineligible")
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
def test_hostha_configure_default_driver(self):
"""
Tests host-ha configuration with valid data
"""
self.logger.debug("Starting test_hostha_configure_default_driver")
cmd = self.getHostHaConfigCmd()
response = self.apiclient.configureHAForHost(cmd)
self.assertEqual(response.hostid, cmd.hostid)
self.assertEqual(response.haprovider, cmd.provider.lower())
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
def test_enable_ha_when_host_powerstate_on(self):
"""
Tests that when HA is enabled for a host, if oobm state is on HA State should turn into Available
"""
self.configureAndStartIpmiServer()
self.assertIssueCommandState('ON', 'On')
self.apiclient.configureHAForHost(self.getHostHaConfigCmd())
cmd = self.getHostHaEnableCmd()
response = self.apiclient.enableHAForHost(cmd)
self.assertEqual(response.hostid, cmd.hostid)
self.assertEqual(response.haenable, True)
# Verify HA State is Available
self.check_host_transition_to_available()
response = self.getHost()
if response.hostha.hastate is not "Available":
print response
self.assertEqual(response.hostha.hastate, "Available")
self.stopIpmiServer()
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
def test_hostha_enable_feature_without_setting_provider(self):
"""
Tests Enable HA without setting the provider, Exception is thrown
"""
host = self.get_non_configured_ha_host()
cmd = self.getHostHaEnableCmd()
cmd.hostid = host.id
try:
self.apiclient.enableHAForHost(cmd)
except Exception as e:
pass
else:
self.fail("Expected an exception to be thrown, failing")
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="treu")
def test_hostha_enable_ha_when_host_disabled(self):
"""
Tests Enable HA when host is disconnected, should be Ineligible
"""
self.logger.debug("Starting test_hostha_enable_ha_when_host_disabled")
# Enable HA
self.apiclient.configureHAForHost(self.getHostHaConfigCmd())
cmd = self.getHostHaEnableCmd()
cmd.hostid = self.host.id
enable = self.apiclient.enableHAForHost(cmd)
self.assertEqual(enable.hostid, cmd.hostid)
self.assertEqual(enable.haenable, True)
self.configureAndEnableHostHa()
# Disable Host
self.disableHost(self.host.id)
# Check HA State
try:
response = self.getHost(self.host.id)
self.assertEqual(response.hostha.hastate, "Ineligible")
self.waitUntilHostInState("Ineligible")
except Exception as e:
self.enableHost(self.host.id)
self.fail(e)
@ -462,46 +268,39 @@ class TestHAKVM(cloudstackTestCase):
self.enableHost(self.host.id)
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
def test_hostha_enable_ha_when_host_inMaintenance(self):
def test_hostha_enable_ha_when_host_in_maintenance(self):
"""
Tests Enable HA when host is in Maintenance mode, should be Ineligible
"""
host = self.getHost()
self.logger.debug("Starting test_hostha_enable_ha_when_host_in_maintenance")
# Enable HA
self.apiclient.configureHAForHost(self.getHostHaConfigCmd())
cmd = self.getHostHaEnableCmd()
cmd.hostid = host.id
enable = self.apiclient.enableHAForHost(cmd)
self.assertEqual(enable.hostid, cmd.hostid)
self.assertEqual(enable.haenable, True)
self.configureAndEnableHostHa()
# Prepare for maintenance Host
self.setHostToMaintanance(host.id)
self.setHostToMaintanance(self.host.id)
# Check HA State
try:
response = self.getHost(host.id)
self.assertEqual(response.hostha.hastate, "Ineligible")
self.waitUntilHostInState("Ineligible")
except Exception as e:
self.cancelMaintenance(host.id)
self.cancelMaintenance()
self.fail(e)
# Enable Host
self.cancelMaintenance(host.id)
self.cancelMaintenance()
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
def test_hostha_enable_ha_when_host_disconected(self):
"""
Tests Enable HA when host is disconnected, should be Ineligible
"""
host = self.getHost()
self.logger.debug("Starting test_hostha_enable_ha_when_host_disconected")
# Enable HA
self.apiclient.configureHAForHost(self.getHostHaConfigCmd())
cmd = self.getHostHaEnableCmd()
cmd.hostid = host.id
cmd.hostid = self.host.id
enable = self.apiclient.enableHAForHost(cmd)
self.assertEqual(enable.hostid, cmd.hostid)
self.assertEqual(enable.haenable, True)
@ -511,9 +310,7 @@ class TestHAKVM(cloudstackTestCase):
# Check HA State
try:
time.sleep(1)
response = self.getHost(self.host.id)
self.assertEqual(response.hostha.hastate, "Ineligible")
self.waitUntilHostInState("Ineligible")
except Exception as e:
self.startAgent()
self.fail(e)
@ -526,13 +323,13 @@ class TestHAKVM(cloudstackTestCase):
"""
Tests HA Provider should be possible to be removed when HA is enabled
"""
self.logger.debug("Starting test_remove_ha_provider_not_possible")
host = self.getHost()
# Enable HA
self.apiclient.configureHAForHost(self.getHostHaConfigCmd())
cmd = self.getHostHaEnableCmd()
cmd.hostid = host.id
cmd.hostid = self.host.id
enable = self.apiclient.enableHAForHost(cmd)
self.assertEqual(enable.hostid, cmd.hostid)
self.assertEqual(enable.haenable, True)
@ -544,6 +341,136 @@ class TestHAKVM(cloudstackTestCase):
else:
self.fail("Expected an exception to be thrown, failing")
@attr(tags = ["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
def test_hostha_kvm_host_degraded(self):
"""
Tests degraded HA state when agent is stopped/killed
"""
self.configureAndStartIpmiServer()
self.assertIssueCommandState('ON', 'On')
self.configureAndEnableHostHa()
self.deployVM()
# Start with the available state
self.waitUntilHostInState("Available")
# SSH into the KVM Host and executes kill -9 of the agent
self.stopAgent()
# Check if host would go into Suspect state
try:
self.waitUntilHostInState("Suspect")
except Exception as e:
self.startAgent()
raise Exception("Warning: Exception during test execution : %s" % e)
# Checks if the host would turn into Degraded
try:
self.waitUntilHostInState("Degraded")
except Exception as e:
self.startAgent()
raise Exception("Warning: Exception during test execution : %s" % e)
self.startAgent()
self.waitUntilHostInState("Available")
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
def test_hostha_kvm_host_recovering(self):
"""
Tests recovery and fencing HA state transitions
"""
self.configureAndStartIpmiServer()
self.assertIssueCommandState('ON', 'On')
self.configureAndEnableHostHa()
self.deployVM()
# Start with the available state
self.waitUntilHostInState("Available")
# Kill host by triggering a fault
self.killAgent()
self.disableAgent()
self.resetHost()
# Check if host would go into Suspect state
try:
self.waitUntilHostInState("Suspect")
except Exception as e:
self.startAgent()
raise Exception("Warning: Exception during test execution : %s" % e)
# Checks if the host would turn into Recovered
try:
self.waitUntilHostInState("Recovered")
except Exception as e:
self.startAgent()
raise Exception("Warning: Exception during test execution : %s" % e)
self.enableAgent()
self.startAgent()
self.waitUntilHostInState("Available")
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
def test_hostha_kvm_host_fencing(self):
"""
Tests fencing/fenced HA state when host crashes
"""
self.logger.debug("Starting test_ha_kvm_host_fencing")
self.configureAndStartIpmiServer()
self.assertIssueCommandState('ON', 'On')
self.configureAndEnableHostHa()
self.deployVM()
# Start with the available state
self.waitUntilHostInState("Available")
# Fail oobm commands
cmd = self.getOobmConfigCmd()
cmd.address = "1.1.1.1"
self.apiclient.configureOutOfBandManagement(cmd)
# Kill host by triggering a fault
self.killAgent()
self.disableAgent()
self.resetHost()
# Check if host would go into Suspect state
try:
self.waitUntilHostInState("Suspect")
except Exception as e:
self.startAgent()
raise Exception("Warning: Exception during test execution : %s" % e)
# Checks if the host would turn into Fencing
try:
self.waitUntilHostInState("Fencing")
except Exception as e:
self.startAgent()
raise Exception("Warning: Exception during test execution : %s" % e)
# Allow oobm commands to work now
self.configureAndEnableOobm()
# Checks if the host would turn into Fenced
try:
self.waitUntilHostInState("Fenced")
except Exception as e:
self.startAgent()
raise Exception("Warning: Exception during test execution : %s" % e)
self.enableAgent()
self.startAgent()
self.cancelMaintenance()
self.waitUntilHostInState("Available")
def configureAndStartIpmiServer(self, power_state=None):
"""
Setup ipmisim and enable out-of-band management for host
@ -587,7 +514,7 @@ class TestHAKVM(cloudstackTestCase):
def getOobmIssueActionCmd(self):
cmd = issueOutOfBandManagementPowerAction.issueOutOfBandManagementPowerActionCmd()
cmd.hostid = self.getHost().id
cmd.hostid = self.host.id
cmd.action = 'STATUS'
return cmd
@ -606,12 +533,12 @@ class TestHAKVM(cloudstackTestCase):
def getOobmEnableCmd(self):
cmd = enableOutOfBandManagementForHost.enableOutOfBandManagementForHostCmd()
cmd.hostid = self.getHost().id
cmd.hostid = self.host.id
return cmd
def getOobmDisableCmd(self):
cmd = disableOutOfBandManagementForHost.disableOutOfBandManagementForHostCmd()
cmd.hostid = self.getHost().id
cmd.hostid = self.host.id
return cmd
def getIpmiServerPort(self):
@ -624,7 +551,7 @@ class TestHAKVM(cloudstackTestCase):
cmd.port = self.getIpmiServerPort()
cmd.username = 'admin'
cmd.password = 'password'
cmd.hostid = self.getHost().id
cmd.hostid = self.host.id
return cmd
def getIpmiServerIp(self):
@ -655,28 +582,31 @@ class TestHAKVM(cloudstackTestCase):
return response[0]
def startAgent(self):
host = self.getHost()
SshClient(host=host.ipaddress, port=22, user=self.hostConfig["username"],
SshClient(host=self.host.ipaddress, port=22, user=self.hostConfig["username"],
passwd=self.hostConfig["password"]).execute \
("service cloudstack-agent start")
("systemctl start cloudstack-agent || service cloudstack-agent start")
def stopAgent(self):
SshClient(host=self.host.ipaddress, port=22, user=self.hostConfig["username"],
passwd=self.hostConfig["password"]).execute \
("systemctl stop cloudstack-agent || service cloudstack-agent stop")
def killAgent(self):
SshClient(host=self.host.ipaddress, port=22, user=self.hostConfig["username"], passwd=self.hostConfig["password"]).execute\
("kill -9 $(ps aux | grep 'cloudstack-agent' | awk '{print $2}')")
def disableHost(self, id):
cmd = updateHost.updateHostCmd()
cmd.id = id
cmd.allocationstate = "Disable"
response = self.apiclient.updateHost(cmd)
self.assertEqual(response.resourcestate, "Disabled")
def enableHost(self, id):
cmd = updateHost.updateHostCmd()
cmd.id = id
cmd.allocationstate = "Enable"
response = self.apiclient.updateHost(cmd)
self.assertEqual(response.resourcestate, "Enabled")
def setHostToMaintanance(self, id):
@ -687,15 +617,9 @@ class TestHAKVM(cloudstackTestCase):
self.assertEqual(response.resourcestate, "PrepareForMaintenance")
def cancelMaintenance(self, id):
def cancelMaintenance(self):
cmd = cancelHostMaintenance.cancelHostMaintenanceCmd()
cmd.id = id
cmd.id = self.host.id
response = self.apiclient.cancelHostMaintenance(cmd)
self.assertEqual(response.resourcestate, "Enabled")
def killAgent(self):
host = self.getHost()
SshClient(host=host.ipaddress, port=22, user=self.hostConfig["username"], passwd=self.hostConfig["password"]).execute\
("kill $(ps aux | grep 'cloudstack-agent' | awk '{print $2}')")

View File

@ -23,8 +23,6 @@ from marvin.lib.base import *
from marvin.lib.common import *
from nose.plugins.attrib import attr
import random
from ipmisim.ipmisim import IpmiServerContext, IpmiServer, ThreadedIpmiServer
import random
@ -35,7 +33,7 @@ import time
class TestHostHA(cloudstackTestCase):
""" Test cases for host HA using Simulator host(s)
""" Test host-ha business logic using Simulator
"""
def setUp(self):
@ -45,10 +43,10 @@ class TestHostHA(cloudstackTestCase):
self.services = self.testClient.getParsedTestDataConfig()
self.mgtSvrDetails = self.config.__dict__["mgtSvr"][0].__dict__
self.fakeMsId = random.randint(10000, 99999) * random.randint(10, 20)
self.host = None
# Cleanup any existing configs
self.dbclient.execute("delete from ha_config where resource_type='Host'")
self.host = None
# use random port for ipmisim
s = socket.socket()
@ -56,10 +54,17 @@ class TestHostHA(cloudstackTestCase):
self.serverPort = s.getsockname()[1]
s.close()
# Get a host to run tests against
self.host = self.getHost()
self.cleanup = []
def tearDown(self):
try:
host = self.getHost()
self.configureAndDisableHostHa(host.id)
self.host = None
self.dbclient.execute("delete from mshost_peer where peer_runid=%s" % self.getFakeMsRunId())
self.dbclient.execute("delete from mshost where runid=%s" % self.getFakeMsRunId())
self.dbclient.execute("delete from cluster_details where name='resourceHAEnabled'")
@ -70,12 +75,15 @@ class TestHostHA(cloudstackTestCase):
except Exception as e:
raise Exception("Warning: Exception during cleanup : %s" % e)
def getFakeMsId(self):
return self.fakeMsId
def getFakeMsRunId(self):
return self.fakeMsId * 1000
def getHost(self, hostId=None):
if self.host and hostId is None:
return self.host
@ -87,10 +95,13 @@ class TestHostHA(cloudstackTestCase):
resourcestate='Enabled',
id=hostId
)
if response and len(response) > 0:
random.shuffle(response)
self.host = response[0]
return self.host
raise self.skipTest("No simulator hosts found, skipping host-ha test")
raise self.skipTest("No suitable hosts found, skipping host-ha test")
def getHostHaConfigCmd(self, provider='simulatorhaprovider'):
cmd = configureHAForHost.configureHAForHostCmd()
@ -98,16 +109,25 @@ class TestHostHA(cloudstackTestCase):
cmd.hostid = self.getHost().id
return cmd
def getHostHaEnableCmd(self):
cmd = enableHAForHost.enableHAForHostCmd()
cmd.hostid = self.getHost().id
return cmd
def getHostHaDisableCmd(self):
cmd = disableHAForHost.disableHAForHostCmd()
cmd.hostid = self.getHost().id
return cmd
def getListHostHAResources(self):
cmd = listHostHAResources.listHostHAResourcesCmd()
cmd.hostid = self.getHost().id
return cmd
def configureAndEnableHostHa(self, initialize=True):
self.apiclient.configureHAForHost(self.getHostHaConfigCmd())
response = self.apiclient.enableHAForHost(self.getHostHaEnableCmd())
@ -115,6 +135,7 @@ class TestHostHA(cloudstackTestCase):
if initialize:
self.configureSimulatorHAProviderState(True, True, True, False)
def configureAndDisableHostHa(self, hostId):
self.apiclient.configureHAForHost(self.getHostHaConfigCmd())
cmd = self.getHostHaDisableCmd()
@ -123,6 +144,7 @@ class TestHostHA(cloudstackTestCase):
self.assertEqual(response.hostid, cmd.hostid)
self.assertEqual(response.haenable, False)
def enableHostHa(self, hostId):
cmd = self.getHostHaEnableCmd()
cmd.hostid = hostId
@ -130,6 +152,7 @@ class TestHostHA(cloudstackTestCase):
self.assertEqual(response.hostid, cmd.hostid)
self.assertEqual(response.haenable, True)
def configureSimulatorHAProviderState(self, health, activity, recover, fence):
cmd = configureSimulatorHAProviderState.configureSimulatorHAProviderStateCmd()
cmd.hostid = self.getHost().id
@ -140,24 +163,27 @@ class TestHostHA(cloudstackTestCase):
response = self.apiclient.configureSimulatorHAProviderState(cmd)
self.assertEqual(response.success, 'true')
def getSimulatorHAStateTransitions(self, hostId):
cmd = listSimulatorHAStateTransitions.listSimulatorHAStateTransitionsCmd()
cmd.hostid = hostId
return self.apiclient.listSimulatorHAStateTransitions(cmd)
def checkSyncToState(self, state, interval=5000):
def checkForStateSync(expectedState):
response = self.getHost(hostId=self.getHost().id).hostha
return response.hastate == expectedState, None
sync_interval = 1 + int(interval) / 1000
res, _ = wait_until(sync_interval, 10, checkForStateSync, state)
res, _ = wait_until(sync_interval, 100, checkForStateSync, state)
if not res:
self.fail("Failed to get host.hastate synced to expected state:" + state)
response = self.getHost(hostId=self.getHost().id).hostha
self.assertEqual(response.hastate, state)
def get_non_configured_ha_host(self):
def getNonConfiguredHaHost(self):
response = list_hosts(
self.apiclient,
type='Routing'
@ -168,12 +194,13 @@ class TestHostHA(cloudstackTestCase):
else:
return None
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="false")
def test_hostha_enable_feature_without_setting_provider(self):
"""
Tests Enable HA without setting the provider, Exception is thrown
"""
host = self.get_non_configured_ha_host()
host = self.getNonConfiguredHaHost()
if host is None:
cloudstackTestCase.skipTest(self, "There is no non configured hosts. Skipping test.")
@ -188,6 +215,7 @@ class TestHostHA(cloudstackTestCase):
else:
self.fail("Expected an exception to be thrown, failing")
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="false")
def test_ha_list_providers(self):
"""
@ -203,6 +231,7 @@ class TestHostHA(cloudstackTestCase):
response = self.apiclient.listHostHAProviders(cmd)[0]
self.assertEqual(response.haprovider, 'KVMHAProvider')
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="false")
def test_hostha_configure_invalid_provider(self):
"""
@ -217,6 +246,7 @@ class TestHostHA(cloudstackTestCase):
else:
self.fail("Expected an exception to be thrown, failing")
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="false")
def test_hostha_configure_default_driver(self):
"""
@ -227,6 +257,7 @@ class TestHostHA(cloudstackTestCase):
self.assertEqual(response.hostid, cmd.hostid)
self.assertEqual(response.haprovider, cmd.provider.lower())
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="false")
def test_ha_enable_feature_invalid(self):
"""
@ -255,6 +286,7 @@ class TestHostHA(cloudstackTestCase):
else:
self.fail("Expected an exception to be thrown, failing")
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="false")
def test_ha_disable_feature_invalid(self):
"""
@ -284,6 +316,7 @@ class TestHostHA(cloudstackTestCase):
else:
self.fail("Expected an exception to be thrown, failing")
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="false")
def test_hostha_enable_feature_valid(self):
"""
@ -295,6 +328,7 @@ class TestHostHA(cloudstackTestCase):
self.assertEqual(response.hostid, cmd.hostid)
self.assertEqual(response.haenable, True)
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="false")
def test_hostha_disable_feature_valid(self):
"""
@ -309,15 +343,16 @@ class TestHostHA(cloudstackTestCase):
response = self.getHost(hostId=cmd.hostid).hostha
self.assertEqual(response.hastate, 'Disabled')
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="false")
def test_ha_enabledisable_across_clusterzones(self):
def test_ha_configure_enabledisable_across_clusterzones(self):
"""
Tests ha enable/disable feature at cluster and zone level
Zone > Cluster > Host
"""
host = self.getHost()
self.configureAndEnableHostHa()
host = self.getHost()
self.checkSyncToState('Available')
response = self.getHost(hostId=host.id).hostha
self.assertTrue(response.hastate == 'Available')
@ -363,12 +398,16 @@ class TestHostHA(cloudstackTestCase):
# Check state sync
self.checkSyncToState('Available')
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="false")
def test_ha_multiple_mgmt_server_ownership(self):
"""
Tests ha resource ownership expiry across multi-mgmt server
"""
self.configureAndEnableHostHa()
host = self.getHost()
self.configureAndDisableHostHa(host.id)
self.configureSimulatorHAProviderState(True, True, True, False)
self.configureAndEnableHostHa(False)
cloudstackVersion = Configurations.listCapabilities(self.apiclient).cloudstackversion
@ -416,7 +455,7 @@ class TestHostHA(cloudstackTestCase):
retry_interval = 1 + (pingInterval * pingTimeout / 10)
res, _ = wait_until(retry_interval, 10, removeFakeMgmtServer, self.getFakeMsRunId())
res, _ = wait_until(retry_interval, 20, removeFakeMgmtServer, self.getFakeMsRunId())
if not res:
self.fail("Management server failed to turn down or remove fake mgmt server")
@ -432,23 +471,32 @@ class TestHostHA(cloudstackTestCase):
newOwnerId = result[0][0]
self.assertTrue(newOwnerId in currentMsHosts)
def checkFSMTransition(self, transition, event, haState, prevHaState, hasActiviyCounter, hasRecoveryCounter):
self.assertEqual(transition.event, event)
self.assertEqual(transition.hastate, haState)
self.assertEqual(transition.prevhastate, prevHaState)
if hasActiviyCounter:
if hasActiviyCounter is None:
pass
elif hasActiviyCounter:
self.assertTrue(transition.activitycounter > 0)
else:
self.assertEqual(transition.activitycounter, 0)
if hasRecoveryCounter:
if hasRecoveryCounter is None:
pass
elif hasRecoveryCounter:
self.assertTrue(transition.recoverycounter > 0)
else:
self.assertEqual(transition.recoverycounter, 0)
def findFSMTransitionToState(self, state, host):
transitions = self.getSimulatorHAStateTransitions(host.id)
if not transitions:
return False, (None, None, None)
previousTransition = None
stateTransition = None
nextTransition = None
@ -460,10 +508,12 @@ class TestHostHA(cloudstackTestCase):
stateTransition = transition
if not stateTransition:
previousTransition = transition
if stateTransition:
return True, (previousTransition, stateTransition, nextTransition,)
return False, (previousTransition, stateTransition, nextTransition,)
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="false")
def test_ha_verify_fsm_available(self):
"""
@ -472,15 +522,17 @@ class TestHostHA(cloudstackTestCase):
"""
host = self.getHost()
self.configureAndDisableHostHa(host.id)
self.configureSimulatorHAProviderState(True, True, True, False)
self.configureAndEnableHostHa(False)
res, (_, T, _) = wait_until(2, 20, self.findFSMTransitionToState, 'available', host)
res, (_, T, _) = wait_until(3, 20, self.findFSMTransitionToState, 'available', host)
if not res:
self.fail("FSM did not transition to available state")
self.checkFSMTransition(T, 'enabled', 'available', 'disabled', False, False)
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="false")
def test_ha_verify_fsm_degraded(self):
"""
@ -490,26 +542,26 @@ class TestHostHA(cloudstackTestCase):
Available->Suspect<->Checking->Degraded->Available
"""
host = self.getHost()
self.configureAndDisableHostHa(host.id)
self.configureSimulatorHAProviderState(False, True, True, False)
self.configureAndEnableHostHa(False)
# Initial health check failure
res, (_, T, _) = wait_until(2, 20, self.findFSMTransitionToState, 'suspect', host)
res, (_, T, _) = wait_until(3, 50, self.findFSMTransitionToState, 'suspect', host)
if not res:
self.fail("FSM did not transition to suspect state")
self.checkFSMTransition(T, 'healthcheckfailed', 'suspect', 'available', False, False)
# Check transition to Degraded
res, (prevT, T, nextT) = wait_until(2, 20, self.findFSMTransitionToState, 'degraded', host)
res, (prevT, T, _) = wait_until(3, 100, self.findFSMTransitionToState, 'degraded', host)
if not res:
self.fail("FSM did not transition to degraded state")
if prevT:
self.checkFSMTransition(prevT, 'performactivitycheck', 'checking', 'suspect', True, False)
self.checkFSMTransition(T, 'activitycheckfailureunderthresholdratio', 'degraded', 'checking', True, False)
if nextT:
self.checkFSMTransition(nextT, 'periodicrecheckresourceactivity', 'suspect', 'degraded', False, False)
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="false")
def test_ha_verify_fsm_recovering(self):
@ -520,36 +572,33 @@ class TestHostHA(cloudstackTestCase):
Available->Suspect<->Checking->Recovering->Recovered<-retry-loop->->Fencing
"""
host = self.getHost()
self.configureAndDisableHostHa(host.id)
self.configureSimulatorHAProviderState(False, False, True, False)
self.configureAndEnableHostHa(False)
# Initial health check failure
res, (_, T, _) = wait_until(2, 30, self.findFSMTransitionToState, 'suspect', host)
res, (_, T, _) = wait_until(3, 50, self.findFSMTransitionToState, 'suspect', host)
if not res:
self.fail("FSM did not transition to suspect state")
self.checkFSMTransition(T, 'healthcheckfailed', 'suspect', 'available', False, False)
# Check transition to recovering
res, (prevT, T, nextT) = wait_until(2, 60, self.findFSMTransitionToState, 'recovering', host)
res, (prevT, T, _) = wait_until(3, 100, self.findFSMTransitionToState, 'recovering', host)
if not res:
self.fail("FSM did not transition to recovering state")
if prevT:
self.checkFSMTransition(prevT, 'performactivitycheck', 'checking', 'suspect', True, False)
self.checkFSMTransition(T, 'activitycheckfailureoverthresholdratio', 'recovering', 'checking', True, False)
if nextT:
self.checkFSMTransition(nextT, 'recovered', 'recovered', 'recovering', False, True)
# Check transition to fencing due to recovery attempts exceeded
res, (prevT, T, nextT) = wait_until(2, 60, self.findFSMTransitionToState, 'fencing', host)
res, (_, T, _) = wait_until(3, 100, self.findFSMTransitionToState, 'fencing', host)
if not res:
self.fail("FSM did not transition to fencing state")
if prevT:
self.checkFSMTransition(prevT, 'activitycheckfailureoverthresholdratio', 'recovering', 'checking', True,
True)
self.checkFSMTransition(T, 'recoveryoperationthresholdexceeded', 'fencing', 'recovering', False, True)
self.checkFSMTransition(T, 'recoveryoperationthresholdexceeded', 'fencing', 'recovering', None, True)
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="false")
def test_ha_verify_fsm_fenced(self):
@ -559,19 +608,18 @@ class TestHostHA(cloudstackTestCase):
Available->Suspect<->Checking->Recovering<-fail recovery->->Fencing->Fenced
"""
host = self.getHost()
self.configureAndDisableHostHa(host.id)
self.configureSimulatorHAProviderState(False, False, False, True)
self.configureAndEnableHostHa(False)
# Check for transition to fenced
res, (prevT, T, _) = wait_until(2, 30, self.findFSMTransitionToState, 'fenced', host)
res, (prevT, T, _) = wait_until(3, 100, self.findFSMTransitionToState, 'fenced', host)
if not res:
self.fail("FSM did not transition to fenced state")
self.checkFSMTransition(prevT, 'recoveryoperationthresholdexceeded', 'fencing', 'recovering', False, True)
self.checkFSMTransition(T, 'fenced', 'fenced', 'fencing', False, False)
# TODO: add test case for HA vm reboot checks
# Simulate manual recovery of host and cancel maintenance mode
self.configureSimulatorHAProviderState(True, True, True, False)
cancelCmd = cancelHostMaintenance.cancelHostMaintenanceCmd()
@ -579,13 +627,13 @@ class TestHostHA(cloudstackTestCase):
self.apiclient.cancelHostMaintenance(cancelCmd)
# Check for transition to available after manual recovery
res, (prevT, T, _) = wait_until(2, 20, self.findFSMTransitionToState, 'available', host)
res, (prevT, T, _) = wait_until(3, 50, self.findFSMTransitionToState, 'available', host)
if not res:
self.fail("FSM did not transition to available state")
self.checkFSMTransition(prevT, 'healthcheckpassed', 'ineligible', 'fenced', False, False)
self.checkFSMTransition(T, 'eligible', 'available', 'ineligible', False, False)
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="false")
def test_configure_ha_provider_invalid(self):
"""
@ -618,6 +666,7 @@ class TestHostHA(cloudstackTestCase):
else:
self.fail("Expected an exception to be thrown, failing")
@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="false")
def test_configure_ha_provider_valid(self):
"""
@ -649,3 +698,75 @@ class TestHostHA(cloudstackTestCase):
# Check the response contains the set provider and hostID
self.assertEqual(response.haprovider, conf_ha_cmd.provider)
self.assertEqual(response.hostid, conf_ha_cmd.hostid)
def getHaProvider(self, host):
    """
        Returns the 'haprovider' of the first entry that the
        listHostHAProviders API reports for the hypervisor of the given host
    """
    listCmd = listHostHAProviders.listHostHAProvidersCmd()
    listCmd.hypervisor = host.hypervisor

    providers = self.apiclient.listHostHAProviders(listCmd)
    firstProvider = providers[0]
    return firstProvider.haprovider
def configureHaProvider(self):
    """
        Builds the host HA configuration command for the provider of the
        host returned by getHost() and submits it through configureHAForHost,
        returning the API response
    """
    # Bind the command builder first so the lookup/call order matches a nested call
    buildConfigCmd = self.getHostHaConfigCmd
    host = self.getHost()
    provider = self.getHaProvider(host)
    configureCmd = buildConfigCmd(provider)
    return self.apiclient.configureHAForHost(configureCmd)
@attr(tags=["advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="false")
def test_list_ha_for_host(self):
    """
        Checks that listHostHAResources, called without a hostid, returns
        as many entries as the row count of cloud.ha_config
    """
    self.configureHaProvider()

    countRows = self.dbclient.execute("SELECT count(*) FROM cloud.ha_config")

    # Drop the hostid so the listing is not restricted to a single host
    listCmd = self.getListHostHAResources()
    del listCmd.hostid
    haResources = self.apiclient.listHostHAResources(listCmd)

    self.assertEqual(countRows[0][0], len(haResources))
@attr(tags=["advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="false")
def test_list_ha_for_host_valid(self):
    """
        Lists the HA resources using the command from getListHostHAResources()
        and checks that the first entry carries the hostid set on that command
    """
    self.configureHaProvider()

    listCmd = self.getListHostHAResources()
    haResources = self.apiclient.listHostHAResources(listCmd)

    firstResource = haResources[0]
    self.assertEqual(firstResource.hostid, listCmd.hostid)
@attr(tags=["advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="false")
def test_list_ha_for_host_invalid(self):
    """
        Checks that listHostHAResources raises an exception when the
        command carries an invalid hostid
    """
    self.configureHaProvider()

    listCmd = self.getListHostHAResources()
    listCmd.hostid = "someinvalidvalue"

    # Record whether the API call raised; the failure is reported outside the
    # try block so that it is not caught by the except clause itself
    raised = False
    try:
        self.apiclient.listHostHAResources(listCmd)
    except Exception:
        raised = True

    if not raised:
        self.fail("Expected an exception to be thrown, failing")