mirror of https://github.com/apache/cloudstack.git
commit a4a52c9665
Merge branch '4.22'
@@ -26,17 +26,19 @@ public interface Investigator extends Adapter {
* Returns if the vm is still alive.
*
* @param vm to work on.
* @return true if vm is alive, otherwise false
*/
- public boolean isVmAlive(VirtualMachine vm, Host host) throws UnknownVM;
+ boolean isVmAlive(VirtualMachine vm, Host host) throws UnknownVM;

- public Status isAgentAlive(Host agent);
+ /**
+ * Returns the agent status of the host.
+ *
+ * @param host
+ * @return status of the host agent
+ */
+ Status getHostAgentStatus(Host host);

class UnknownVM extends Exception {

/**
 *
 */
private static final long serialVersionUID = 1L;

};
}
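As a quick orientation aid (not part of the commit), the sketch below shows how an implementation of the renamed contract typically derives isVmAlive from getHostAgentStatus, mirroring the HypervInvestigator change further down. All types here are simplified stand-ins, not the real CloudStack classes.

public class InvestigatorSketch {
    enum Status { Up, Down, Disconnected }

    static class Host {
        Status agentStatus;
        Host(Status agentStatus) { this.agentStatus = agentStatus; }
    }

    static class UnknownVM extends Exception {
        private static final long serialVersionUID = 1L;
    }

    // Renamed accessor: report the host agent status, or null when this
    // investigator cannot say anything about the host.
    static Status getHostAgentStatus(Host host) {
        return host.agentStatus;
    }

    // VM liveness is derived from the host agent status; "unknown" is signalled
    // with an exception rather than a null Boolean, matching the refactored interface.
    static boolean isVmAlive(Host host) throws UnknownVM {
        Status status = getHostAgentStatus(host);
        if (status == null) {
            throw new UnknownVM();
        }
        return status == Status.Up;
    }

    public static void main(String[] args) throws UnknownVM {
        System.out.println(isVmAlive(new Host(Status.Up)));   // true
        System.out.println(isVmAlive(new Host(Status.Down))); // false
    }
}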
@@ -87,6 +87,7 @@ public final class ConfigureHAForHostCmd extends BaseAsyncCmd {
final HostHAResponse response = new HostHAResponse();
response.setId(resourceUuid);
+ response.setProvider(getHaProvider().toLowerCase());
response.setStatus(result);
response.setResponseName(getCommandName());
setResponseObject(response);
}
@@ -38,6 +38,8 @@ public class CheckOnHostAnswer extends Answer {

public CheckOnHostAnswer(CheckOnHostCommand cmd, String details) {
super(cmd, false, details);
+ determined = false;
+ alive = false;
}

public boolean isDetermined() {
@@ -47,5 +49,4 @@ public class CheckOnHostAnswer extends Answer {
public boolean isAlive() {
return alive;
}

}
@@ -24,7 +24,7 @@ import com.cloud.host.Host;

public class CheckOnHostCommand extends Command {
HostTO host;
- boolean reportCheckFailureIfOneStorageIsDown;
+ boolean reportIfHeartBeatFailedForOneStoragePool;

protected CheckOnHostCommand() {
}
@@ -34,17 +34,17 @@ public class CheckOnHostCommand extends Command {
setWait(20);
}

- public CheckOnHostCommand(Host host, boolean reportCheckFailureIfOneStorageIsDown) {
+ public CheckOnHostCommand(Host host, boolean reportIfHeartBeatFailedForOneStoragePool) {
this(host);
- this.reportCheckFailureIfOneStorageIsDown = reportCheckFailureIfOneStorageIsDown;
+ this.reportIfHeartBeatFailedForOneStoragePool = reportIfHeartBeatFailedForOneStoragePool;
}

public HostTO getHost() {
return host;
}

- public boolean isCheckFailedOnOneStorage() {
- return reportCheckFailureIfOneStorageIsDown;
+ public boolean shouldReportIfHeartBeatFailedForOneStoragePool() {
+ return reportIfHeartBeatFailedForOneStoragePool;
}

@Override
@@ -75,10 +75,10 @@ public interface HighAvailabilityManager extends Manager {
+ " which are registered for the HA event that were successful and are now ready to be purged.",
true, Cluster);

- public static final ConfigKey<Boolean> KvmHAFenceHostIfHeartbeatFailsOnStorage = new ConfigKey<>("Advanced", Boolean.class, "kvm.ha.fence.on.storage.heartbeat.failure", "false",
+ ConfigKey<Boolean> KvmHAFenceHostIfHeartbeatFailsOnStorage = new ConfigKey<>("Advanced", Boolean.class, "kvm.ha.fence.on.storage.heartbeat.failure", "false",
"Proceed fencing the host even the heartbeat failed for only one storage pool", false, ConfigKey.Scope.Zone);

- public enum WorkType {
+ enum WorkType {
Migration, // Migrating VMs off of a host.
Stop, // Stops a VM for storage pool migration purposes. This should be obsolete now.
CheckStop, // Checks if a VM has been stopped.
@@ -395,9 +395,9 @@ public class VolumeServiceImpl implements VolumeService {
}

// Find out if the volume is at state of download_in_progress on secondary storage
- VolumeDataStoreVO volumeStore = _volumeStoreDao.findByVolume(volume.getId());
- if (volumeStore != null) {
- if (volumeStore.getDownloadState() == VMTemplateStorageResourceAssoc.Status.DOWNLOAD_IN_PROGRESS) {
+ VolumeDataStoreVO volumeOnImageStore = _volumeStoreDao.findByVolume(volume.getId());
+ if (volumeOnImageStore != null) {
+ if (volumeOnImageStore.getDownloadState() == VMTemplateStorageResourceAssoc.Status.DOWNLOAD_IN_PROGRESS) {
String msg = String.format("Volume: %s is currently being uploaded; can't delete it.", volume);
logger.debug(msg);
result.setSuccess(false);
@@ -416,10 +416,10 @@ public class VolumeServiceImpl implements VolumeService {

if (!volumeExistsOnPrimary(vol)) {
// not created on primary store
- if (volumeStore == null) {
+ if (volumeOnImageStore == null) {
// also not created on secondary store
if (logger.isDebugEnabled()) {
- logger.debug("Marking volume that was never created as destroyed: " + vol);
+ logger.debug("Marking volume that was never created as destroyed: {}", vol);
}
VMTemplateVO template = templateDao.findById(vol.getTemplateId());
if (template != null && !template.isDeployAsIs()) {
@@ -435,11 +435,21 @@ public class VolumeServiceImpl implements VolumeService {
if (volume.getDataStore().getRole() == DataStoreRole.Image) {
// no need to change state in volumes table
volume.processEventOnly(Event.DestroyRequested);
+ if (volumeOnImageStore == null) {
+ logger.debug("Volume {} doesn't exist on image store, no need to delete", vol);
+ future.complete(result);
+ return future;
+ }
} else if (volume.getDataStore().getRole() == DataStoreRole.Primary) {
if (vol.getState() == Volume.State.Expunging) {
logger.info("Volume {} is already in Expunging, retrying", volume);
}
volume.processEvent(Event.ExpungeRequested);
+ if (!volumeExistsOnPrimary(vol)) {
+ logger.debug("Volume {} doesn't exist on primary storage, no need to delete", vol);
+ future.complete(result);
+ return future;
+ }
}

DeleteVolumeContext<VolumeApiResult> context = new DeleteVolumeContext<>(null, vo, future);
@@ -460,13 +470,11 @@ public class VolumeServiceImpl implements VolumeService {

private boolean volumeExistsOnPrimary(VolumeVO vol) {
Long poolId = vol.getPoolId();

if (poolId == null) {
return false;
}

PrimaryDataStore primaryStore = dataStoreMgr.getPrimaryDataStore(poolId);

if (primaryStore == null) {
return false;
}
@@ -476,8 +484,7 @@ public class VolumeServiceImpl implements VolumeService {
}

String volumePath = vol.getPath();

- if (volumePath == null || volumePath.trim().isEmpty()) {
+ if (StringUtils.isBlank(volumePath)) {
return false;
}
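The early-return paths added above can be pictured with a small stand-alone sketch (hypothetical names, a plain CompletableFuture instead of CloudStack's AsyncCallFuture): when the volume is not actually present on the store being processed, the result future is completed right away and no delete command is issued.

import java.util.concurrent.CompletableFuture;

public class DeleteVolumeSketch {
    static class VolumeApiResult {
        boolean success = true;
    }

    // Models the added short-circuit: if the volume was never created on the
    // store we are asked to clean, report success immediately instead of
    // dispatching a delete operation.
    static CompletableFuture<VolumeApiResult> deleteOnStore(boolean existsOnStore) {
        CompletableFuture<VolumeApiResult> future = new CompletableFuture<>();
        VolumeApiResult result = new VolumeApiResult();
        if (!existsOnStore) {
            future.complete(result); // nothing to delete, complete right away
            return future;
        }
        // ... the real code would build a DeleteVolumeContext and send the delete
        // asynchronously; completed here only to keep the sketch runnable.
        future.complete(result);
        return future;
    }

    public static void main(String[] args) throws Exception {
        System.out.println(deleteOnStore(false).get().success); // true, no command sent
    }
}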
@@ -1663,14 +1663,14 @@ public final class ExtensionsManagerImpl extends ManagerBase implements ExtensionsMana
public List<String> getExtensionReservedResourceDetails(long extensionId) {
ExtensionDetailsVO detailsVO = extensionDetailsDao.findDetail(extensionId,
ApiConstants.RESERVED_RESOURCE_DETAILS);
- if (detailsVO == null || !StringUtils.isNotBlank(detailsVO.getValue())) {
- return Collections.emptyList();
- }
List<String> reservedDetails = new ArrayList<>();
- String[] parts = detailsVO.getValue().split(",");
- for (String part : parts) {
- if (StringUtils.isNotBlank(part)) {
- reservedDetails.add(part.trim());
+ if (detailsVO != null && StringUtils.isNotBlank(detailsVO.getValue())) {
+ String[] parts = detailsVO.getValue().split(",");
+ for (String part : parts) {
+ String trimmedPart = part.trim();
+ if (StringUtils.isNotBlank(trimmedPart)) {
+ reservedDetails.add(trimmedPart);
}
}
}
addInbuiltExtensionReservedResourceDetails(extensionId, reservedDetails);
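The rewritten parsing loop above reduces to a small, self-contained transformation: split the stored detail value on commas, trim each part, and keep only non-blank entries. A minimal illustration with plain JDK calls and no CloudStack types:

import java.util.ArrayList;
import java.util.List;

public class ReservedDetailsSketch {
    // Same shape as the refactored loop: null/blank input yields an empty list,
    // otherwise blank fragments are dropped and the rest are trimmed.
    static List<String> parseReservedDetails(String value) {
        List<String> reservedDetails = new ArrayList<>();
        if (value != null && !value.isBlank()) {
            for (String part : value.split(",")) {
                String trimmedPart = part.trim();
                if (!trimmedPart.isBlank()) {
                    reservedDetails.add(trimmedPart);
                }
            }
        }
        return reservedDetails;
    }

    public static void main(String[] args) {
        System.out.println(parseReservedDetails(" cpu , , memory ")); // [cpu, memory]
        System.out.println(parseReservedDetails(null));               // []
    }
}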
@@ -41,15 +41,15 @@ public class HypervInvestigator extends AdapterBase implements Investigator {

@Override
public boolean isVmAlive(com.cloud.vm.VirtualMachine vm, Host host) throws UnknownVM {
- Status status = isAgentAlive(host);
+ Status status = getHostAgentStatus(host);
if (status == null) {
throw new UnknownVM();
}
- return status == Status.Up ? true : null;
+ return status == Status.Up;
}

@Override
- public Status isAgentAlive(Host agent) {
+ public Status getHostAgentStatus(Host agent) {
if (agent.getHypervisorType() != Hypervisor.HypervisorType.Hyperv) {
return null;
}
@@ -19,10 +19,7 @@
package com.cloud.ha;

- import com.cloud.agent.AgentManager;
- import com.cloud.agent.api.Answer;
- import com.cloud.agent.api.CheckOnHostCommand;
import com.cloud.host.Host;
import com.cloud.host.HostVO;
import com.cloud.host.Status;
import com.cloud.host.dao.HostDao;
import com.cloud.hypervisor.Hypervisor;
@@ -34,11 +31,12 @@ import org.apache.cloudstack.engine.subsystem.api.storage.DataStoreProvider;
import org.apache.cloudstack.engine.subsystem.api.storage.DataStoreProviderManager;
import org.apache.cloudstack.engine.subsystem.api.storage.PrimaryDataStoreDriver;
import org.apache.cloudstack.ha.HAManager;
+ import org.apache.cloudstack.kvm.ha.KVMHostActivityChecker;
import org.apache.cloudstack.storage.datastore.db.PrimaryDataStoreDao;
import org.apache.cloudstack.storage.datastore.db.StoragePoolVO;

import javax.inject.Inject;
- import java.util.Arrays;
+ import java.util.Collections;
import java.util.List;

public class KVMInvestigator extends AdapterBase implements Investigator {
@@ -54,13 +52,15 @@ public class KVMInvestigator extends AdapterBase implements Investigator {
private HAManager haManager;
@Inject
private DataStoreProviderManager dataStoreProviderMgr;
+ @Inject
+ private KVMHostActivityChecker hostActivityChecker;

@Override
public boolean isVmAlive(com.cloud.vm.VirtualMachine vm, Host host) throws UnknownVM {
if (haManager.isHAEligible(host)) {
return haManager.isVMAliveOnHost(host);
}
- Status status = isAgentAlive(host);
+ Status status = getHostAgentStatus(host);
logger.debug("HA: HOST is ineligible legacy state {} for host {}", status, host);
if (status == null) {
throw new UnknownVM();
@@ -73,86 +73,41 @@ public class KVMInvestigator extends AdapterBase implements Investigator {
}

@Override
- public Status isAgentAlive(Host agent) {
- if (agent.getHypervisorType() != Hypervisor.HypervisorType.KVM && agent.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
+ public Status getHostAgentStatus(Host host) {
+ if (host.getHypervisorType() != Hypervisor.HypervisorType.KVM && host.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
return null;
}

- if (haManager.isHAEligible(agent)) {
- return haManager.getHostStatus(agent);
+ if (haManager.isHAEligible(host)) {
+ return haManager.getHostStatusFromHAConfig(host);
}

- List<StoragePoolVO> clusterPools = _storagePoolDao.findPoolsInClusters(Arrays.asList(agent.getClusterId()), null);
- boolean storageSupportHA = storageSupportHa(clusterPools);
- if (!storageSupportHA) {
- List<StoragePoolVO> zonePools = _storagePoolDao.findZoneWideStoragePoolsByHypervisor(agent.getDataCenterId(), agent.getHypervisorType());
- storageSupportHA = storageSupportHa(zonePools);
+ List<StoragePoolVO> clusterPools = _storagePoolDao.findPoolsInClusters(Collections.singletonList(host.getClusterId()), null);
+ boolean storageSupportsHA = storageSupportsHA(clusterPools);
+ if (!storageSupportsHA) {
+ List<StoragePoolVO> zonePools = _storagePoolDao.findZoneWideStoragePoolsByHypervisor(host.getDataCenterId(), host.getHypervisorType());
+ storageSupportsHA = storageSupportsHA(zonePools);
}
- if (!storageSupportHA) {
- logger.warn("Agent investigation was requested on host {}, but host does not support investigation because it has no NFS storage. Skipping investigation.", agent);
+ if (!storageSupportsHA) {
+ logger.warn("Agent investigation was requested on host {}, but host does not support investigation" +
+ " because it has no HA supported storage. Skipping investigation.", host);
return null;
}

- Status hostStatus = null;
- Status neighbourStatus = null;
- boolean reportFailureIfOneStorageIsDown = HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.value();
- CheckOnHostCommand cmd = new CheckOnHostCommand(agent, reportFailureIfOneStorageIsDown);

- try {
- Answer answer = _agentMgr.easySend(agent.getId(), cmd);
- if (answer != null) {
- hostStatus = answer.getResult() ? Status.Down : Status.Up;
- }
- } catch (Exception e) {
- logger.debug("Failed to send command to host: {}", agent);
- }
- if (hostStatus == null) {
- hostStatus = Status.Disconnected;
- }

- List<HostVO> neighbors = _resourceMgr.listHostsInClusterByStatus(agent.getClusterId(), Status.Up);
- for (HostVO neighbor : neighbors) {
- if (neighbor.getId() == agent.getId()
- || (neighbor.getHypervisorType() != Hypervisor.HypervisorType.KVM && neighbor.getHypervisorType() != Hypervisor.HypervisorType.LXC)) {
- continue;
- }
- logger.debug("Investigating host:{} via neighbouring host:{}", agent, neighbor);
- try {
- Answer answer = _agentMgr.easySend(neighbor.getId(), cmd);
- if (answer != null) {
- neighbourStatus = answer.getResult() ? Status.Down : Status.Up;
- logger.debug("Neighbouring host:{} returned status:{} for the investigated host:{}", neighbor, neighbourStatus, agent);
- if (neighbourStatus == Status.Up) {
- break;
- }
- }
- } catch (Exception e) {
- logger.debug("Failed to send command to host: {}", neighbor);
- }
- }
- if (neighbourStatus == Status.Up && (hostStatus == Status.Disconnected || hostStatus == Status.Down)) {
- hostStatus = Status.Disconnected;
- }
- if (neighbourStatus == Status.Down && (hostStatus == Status.Disconnected || hostStatus == Status.Down)) {
- hostStatus = Status.Down;
- }
- logger.debug("HA: HOST is ineligible legacy state {} for host {}", hostStatus, agent);
- return hostStatus;
+ return hostActivityChecker.getHostAgentStatus(host);
}

- private boolean storageSupportHa(List<StoragePoolVO> pools) {
- boolean storageSupportHA = false;
+ private boolean storageSupportsHA(List<StoragePoolVO> pools) {
for (StoragePoolVO pool : pools) {
DataStoreProvider storeProvider = dataStoreProviderMgr.getDataStoreProvider(pool.getStorageProviderName());
DataStoreDriver storeDriver = storeProvider.getDataStoreDriver();
if (storeDriver instanceof PrimaryDataStoreDriver) {
PrimaryDataStoreDriver primaryStoreDriver = (PrimaryDataStoreDriver)storeDriver;
if (primaryStoreDriver.isStorageSupportHA(pool.getPoolType())) {
- storageSupportHA = true;
- break;
+ return true;
}
}
}
- return storageSupportHA;
+ return false;
}
}
@@ -35,10 +35,9 @@ import com.cloud.agent.properties.AgentPropertiesFileHandler;
public class KVMHABase {
protected Logger logger = LogManager.getLogger(getClass());
private long _timeout = 60000; /* 1 minutes */
protected long _heartBeatUpdateTimeout = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.HEARTBEAT_UPDATE_TIMEOUT);
- protected long _heartBeatUpdateFreq = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_FREQUENCY);
+ protected long _heartBeatUpdateFreqInMs = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_FREQUENCY);
protected long _heartBeatUpdateMaxTries = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_MAX_TRIES);
- protected long _heartBeatUpdateRetrySleep = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_RETRY_SLEEP);
+ protected long _heartBeatUpdateRetrySleepInMs = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_RETRY_SLEEP);

public static enum PoolType {
PrimaryStorage, SecondaryStorage
@@ -138,7 +137,7 @@ public class KVMHABase {
/* Can't find the mount point? */
/* we need to mount it under poolName */
if (poolName != null) {
- Script mount = new Script("/bin/bash", 60000);
+ Script mount = new Script("/bin/bash", _timeout);
mount.add("-c");
mount.add("mount " + mountSource + " " + destPath);
String result = mount.execute();
@@ -154,7 +153,6 @@ public class KVMHABase {
}

protected String getMountPoint(HAStoragePool storagePool) {

StoragePool pool = null;
String poolName = null;
try {
@@ -171,7 +169,6 @@ public class KVMHABase {
}
poolName = pool.getName();
}

} catch (LibvirtException e) {
logger.debug("Ignoring libvirt error.", e);
} finally {
@@ -234,7 +231,7 @@ public class KVMHABase {
return result;
}

- public Boolean checkingHeartBeat() {
+ public Boolean hasHeartBeat() {
// TODO Auto-generated method stub
return null;
}
@@ -26,44 +26,43 @@ import com.cloud.agent.api.to.HostTO;
public class KVMHAChecker extends KVMHABase implements Callable<Boolean> {
private List<HAStoragePool> storagePools;
private HostTO host;
- private boolean reportFailureIfOneStorageIsDown;
+ private boolean reportIfHeartBeatFailedForOneStoragePool;

- public KVMHAChecker(List<HAStoragePool> pools, HostTO host, boolean reportFailureIfOneStorageIsDown) {
+ public KVMHAChecker(List<HAStoragePool> pools, HostTO host, boolean reportIfHeartBeatFailedForOneStoragePool) {
this.storagePools = pools;
this.host = host;
- this.reportFailureIfOneStorageIsDown = reportFailureIfOneStorageIsDown;
+ this.reportIfHeartBeatFailedForOneStoragePool = reportIfHeartBeatFailedForOneStoragePool;
}

/*
- * True means heartbeaing is on going, or we can't get it's status. False
- * means heartbeating is stopped definitely
+ * True means heart beating is on going, or we can't get it's status.
+ * False means heart beating is stopped definitely.
 */
@Override
- public Boolean checkingHeartBeat() {
- boolean validResult = false;
- String hostAndPools = String.format("host IP [%s] in pools [%s]", host.getPrivateNetwork().getIp(), storagePools.stream().map(pool -> pool.getPoolUUID()).collect(Collectors.joining(", ")));
- logger.debug(String.format("Checking heart beat with KVMHAChecker for %s", hostAndPools));
+ public Boolean hasHeartBeat() {
+ String hostAndPools = String.format("host IP [%s] in pools [%s]", host.getPrivateNetwork().getIp(),
+ storagePools.stream().map(pool -> pool.getPoolUUID()).collect(Collectors.joining(", ")));
+ logger.debug("Checking heart beat with KVMHAChecker for {}", hostAndPools);

+ boolean heartBeatCheckResult = false;
for (HAStoragePool pool : storagePools) {
- validResult = pool.getPool().checkingHeartBeat(pool, host);
- if (reportFailureIfOneStorageIsDown && !validResult) {
+ heartBeatCheckResult = pool.getPool().hasHeartBeat(pool, host);
+ if (reportIfHeartBeatFailedForOneStoragePool && !heartBeatCheckResult) {
break;
}
}

- if (!validResult) {
- logger.warn(String.format("All checks with KVMHAChecker for %s considered it as dead. It may cause a shutdown of the host.", hostAndPools));
+ if (!heartBeatCheckResult) {
+ logger.warn("All checks with KVMHAChecker for {} considered it as dead. It may cause a shutdown of the host.", hostAndPools);
}

- return validResult;
+ return heartBeatCheckResult;
}

@Override
public Boolean call() throws Exception {
// logger.addAppender(new org.apache.log4j.ConsoleAppender(new
// org.apache.log4j.PatternLayout(), "System.out"));
- return checkingHeartBeat();
+ return hasHeartBeat();
}
}
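The renamed hasHeartBeat loop boils down to: check each HA-enabled pool, and when reportIfHeartBeatFailedForOneStoragePool is set, stop at the first failing pool, so a single dead pool is enough to report the host as dead. A stand-alone sketch of just that control flow, using plain booleans in place of the storage pool objects:

import java.util.List;

public class HeartBeatCheckSketch {
    // poolHeartBeats holds the per-pool heartbeat result (true = heartbeat seen).
    static boolean hasHeartBeat(List<Boolean> poolHeartBeats, boolean reportIfHeartBeatFailedForOneStoragePool) {
        boolean heartBeatCheckResult = false;
        for (Boolean poolHeartBeat : poolHeartBeats) {
            heartBeatCheckResult = poolHeartBeat;
            // With the flag set, a single failing pool ends the check immediately.
            if (reportIfHeartBeatFailedForOneStoragePool && !heartBeatCheckResult) {
                break;
            }
        }
        return heartBeatCheckResult;
    }

    public static void main(String[] args) {
        List<Boolean> pools = List.of(true, false, true);
        System.out.println(hasHeartBeat(pools, true));  // false: stops at the failing pool
        System.out.println(hasHeartBeat(pools, false)); // true: the last pool's result wins, as in the loop above
    }
}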
@@ -34,53 +34,49 @@ import java.util.concurrent.ConcurrentHashMap;

public class KVMHAMonitor extends KVMHABase implements Runnable {

- private final Map<String, HAStoragePool> storagePool = new ConcurrentHashMap<>();
+ private final Map<String, HAStoragePool> haStoragePools = new ConcurrentHashMap<>();
private final boolean rebootHostAndAlertManagementOnHeartbeatTimeout;

private final String hostPrivateIp;

- public KVMHAMonitor(HAStoragePool pool, String host) {
- if (pool != null) {
- storagePool.put(pool.getPoolUUID(), pool);
- }
+ public KVMHAMonitor(String host) {
hostPrivateIp = host;

rebootHostAndAlertManagementOnHeartbeatTimeout = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.REBOOT_HOST_AND_ALERT_MANAGEMENT_ON_HEARTBEAT_TIMEOUT);
}

public void addStoragePool(HAStoragePool pool) {
- synchronized (storagePool) {
- storagePool.put(pool.getPoolUUID(), pool);
+ synchronized (haStoragePools) {
+ haStoragePools.put(pool.getPoolUUID(), pool);
}
}

public void removeStoragePool(String uuid) {
- synchronized (storagePool) {
- HAStoragePool pool = storagePool.get(uuid);
+ synchronized (haStoragePools) {
+ HAStoragePool pool = haStoragePools.get(uuid);
if (pool != null) {
Script.runSimpleBashScript("umount " + pool.getMountDestPath());
- storagePool.remove(uuid);
+ haStoragePools.remove(uuid);
}
}
}

public List<HAStoragePool> getStoragePools() {
- synchronized (storagePool) {
- return new ArrayList<>(storagePool.values());
+ synchronized (haStoragePools) {
+ return new ArrayList<>(haStoragePools.values());
}
}

public HAStoragePool getStoragePool(String uuid) {
- synchronized (storagePool) {
- return storagePool.get(uuid);
+ synchronized (haStoragePools) {
+ return haStoragePools.get(uuid);
}
}

protected void runHeartBeat() {
- synchronized (storagePool) {
+ synchronized (haStoragePools) {
Set<String> removedPools = new HashSet<>();
- for (String uuid : storagePool.keySet()) {
- HAStoragePool primaryStoragePool = storagePool.get(uuid);
+ for (String uuid : haStoragePools.keySet()) {
+ HAStoragePool primaryStoragePool = haStoragePools.get(uuid);
if (HighAvailabilityManager.LIBVIRT_STORAGE_POOL_TYPES_WITH_HA_SUPPORT.contains(primaryStoragePool.getPool().getType())) {
checkForNotExistingLibvirtStoragePools(removedPools, uuid);
if (removedPools.contains(uuid)) {
@@ -91,7 +87,7 @@ public class KVMHAMonitor extends KVMHABase implements Runnable {
result = executePoolHeartBeatCommand(uuid, primaryStoragePool, result);

if (result != null && rebootHostAndAlertManagementOnHeartbeatTimeout) {
- logger.warn(String.format("Write heartbeat for pool [%s] failed: %s; stopping cloudstack-agent.", uuid, result));
+ logger.warn("Write heartbeat for pool [{}] failed: {}; stopping cloudstack-agent.", uuid, result);
primaryStoragePool.getPool().createHeartBeatCommand(primaryStoragePool, null, false);;
}
}
@@ -104,20 +100,18 @@ public class KVMHAMonitor extends KVMHABase implements Runnable {
}

private String executePoolHeartBeatCommand(String uuid, HAStoragePool primaryStoragePool, String result) {
- for (int i = 1; i <= _heartBeatUpdateMaxTries; i++) {
+ for (int attempt = 1; attempt <= _heartBeatUpdateMaxTries; attempt++) {
result = primaryStoragePool.getPool().createHeartBeatCommand(primaryStoragePool, hostPrivateIp, true);

- if (result != null) {
- logger.warn(String.format("Write heartbeat for pool [%s] failed: %s; try: %s of %s.", uuid, result, i, _heartBeatUpdateMaxTries));
- try {
- Thread.sleep(_heartBeatUpdateRetrySleep);
- } catch (InterruptedException e) {
- logger.debug("[IGNORED] Interrupted between heartbeat retries.", e);
- }
- } else {
+ if (result == null) {
break;
}

+ logger.warn("Write heartbeat for pool [{}] failed: {}; try: {} of {}.", uuid, result, attempt, _heartBeatUpdateMaxTries);
+ try {
+ Thread.sleep(_heartBeatUpdateRetrySleepInMs);
+ } catch (InterruptedException e) {
+ logger.debug("[IGNORED] Interrupted between heartbeat retries.", e);
+ }
}
return result;
}
@@ -128,21 +122,21 @@ public class KVMHAMonitor extends KVMHABase implements Runnable {
StoragePool storage = conn.storagePoolLookupByUUIDString(uuid);
if (storage == null || storage.getInfo().state != StoragePoolState.VIR_STORAGE_POOL_RUNNING) {
if (storage == null) {
- logger.debug(String.format("Libvirt storage pool [%s] not found, removing from HA list.", uuid));
+ logger.debug("Libvirt storage pool [{}] not found, removing from HA list.", uuid);
} else {
- logger.debug(String.format("Libvirt storage pool [%s] found, but not running, removing from HA list.", uuid));
+ logger.debug("Libvirt storage pool [{}] found, but not running, removing from HA list.", uuid);
}

removedPools.add(uuid);
}

- logger.debug(String.format("Found NFS storage pool [%s] in libvirt, continuing.", uuid));
+ logger.debug("Found NFS storage pool [{}] in libvirt, continuing.", uuid);

} catch (LibvirtException e) {
- logger.debug(String.format("Failed to lookup libvirt storage pool [%s].", uuid), e);
+ logger.debug("Failed to lookup libvirt storage pool [{}].", uuid, e);

if (e.toString().contains("pool not found")) {
- logger.debug(String.format("Removing pool [%s] from HA monitor since it was deleted.", uuid));
+ logger.debug("Removing pool [{}] from HA monitor since it was deleted.", uuid);
removedPools.add(uuid);
}
}
@@ -155,11 +149,10 @@ public class KVMHAMonitor extends KVMHABase implements Runnable {
runHeartBeat();

try {
- Thread.sleep(_heartBeatUpdateFreq);
+ Thread.sleep(_heartBeatUpdateFreqInMs);
} catch (InterruptedException e) {
logger.debug("[IGNORED] Interrupted between heartbeats.", e);
}
}
}

}
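The reworked retry loop in executePoolHeartBeatCommand follows a common pattern: attempt the heartbeat write, break as soon as it succeeds (a null result), otherwise log and sleep before the next try. A compact stand-alone rendering of that pattern; the writeHeartBeat call below is a placeholder, not the real pool command:

public class HeartBeatRetrySketch {
    static final int MAX_TRIES = 5;
    static final long RETRY_SLEEP_MS = 100;

    // Placeholder for primaryStoragePool.getPool().createHeartBeatCommand(...):
    // returns null on success, or an error message on failure.
    static String writeHeartBeat(int attempt) {
        return attempt < 3 ? "transient write failure" : null;
    }

    static String runHeartBeatWithRetries() {
        String result = null;
        for (int attempt = 1; attempt <= MAX_TRIES; attempt++) {
            result = writeHeartBeat(attempt);
            if (result == null) {
                break; // heartbeat written, stop retrying
            }
            System.out.printf("Write heartbeat failed: %s; try: %d of %d.%n", result, attempt, MAX_TRIES);
            try {
                Thread.sleep(RETRY_SLEEP_MS);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt(); // preserve interrupt status in this sketch
            }
        }
        return result; // null means the heartbeat eventually succeeded
    }

    public static void main(String[] args) {
        System.out.println(runHeartBeatWithRetries()); // null after the third attempt succeeds
    }
}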
@@ -39,12 +39,12 @@ public class KVMHAVMActivityChecker extends KVMHABase implements Callable<Boolea
}

@Override
- public Boolean checkingHeartBeat() {
- return this.storagePool.getPool().vmActivityCheck(storagePool, host, activityScriptTimeout, volumeUuidList, vmActivityCheckPath, suspectTimeInSeconds);
+ public Boolean hasHeartBeat() {
+ return this.storagePool.getPool().hasVmActivity(storagePool, host, activityScriptTimeout, volumeUuidList, vmActivityCheckPath, suspectTimeInSeconds);
}

@Override
public Boolean call() throws Exception {
- return checkingHeartBeat();
+ return hasHeartBeat();
}
}
@@ -1388,9 +1388,9 @@ public class LibvirtComputingResource extends ServerResourceBase implements Serv

final String[] info = NetUtils.getNetworkParams(privateNic);

- kvmhaMonitor = new KVMHAMonitor(null, info[0]);
- final Thread ha = new Thread(kvmhaMonitor);
- ha.start();
+ kvmhaMonitor = new KVMHAMonitor(info[0]);
+ final Thread haMonitorThread = new Thread(kvmhaMonitor);
+ haMonitorThread.start();

storagePoolManager = new KVMStoragePoolManager(storageLayer, kvmhaMonitor);
@@ -26,6 +26,7 @@ import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import com.cloud.agent.api.Answer;
+ import com.cloud.agent.api.CheckOnHostAnswer;
import com.cloud.agent.api.CheckOnHostCommand;
import com.cloud.agent.api.to.HostTO;
import com.cloud.hypervisor.kvm.resource.KVMHABase.HAStoragePool;
@@ -45,20 +46,21 @@ public final class LibvirtCheckOnHostCommandWrapper extends CommandWrapper<Check

final List<HAStoragePool> pools = monitor.getStoragePools();
final HostTO host = command.getHost();
- final KVMHAChecker ha = new KVMHAChecker(pools, host, command.isCheckFailedOnOneStorage());
+ final KVMHAChecker ha = new KVMHAChecker(pools, host, command.shouldReportIfHeartBeatFailedForOneStoragePool());

final Future<Boolean> future = executors.submit(ha);
try {
- final Boolean result = future.get();
- if (result) {
- return new Answer(command, false, "Heart is beating...");
+ final Boolean hasHeartBeat = future.get();
+ if (hasHeartBeat) {
+ return new CheckOnHostAnswer(command, true, "Heart is beating");
} else {
- return new Answer(command);
+ return new CheckOnHostAnswer(command, "Heart is not beating");
}
} catch (final InterruptedException e) {
- return new Answer(command, false, "CheckOnHostCommand: can't get status of host: InterruptedException");
+ return new CheckOnHostAnswer(command, "CheckOnHostCommand: can't get status of host: InterruptedException");
} catch (final ExecutionException e) {
- return new Answer(command, false, "CheckOnHostCommand: can't get status of host: ExecutionException");
+ return new CheckOnHostAnswer(command, "CheckOnHostCommand: can't get status of host: ExecutionException");
}
}
}
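With the wrapper now returning a CheckOnHostAnswer in every branch, the caller can distinguish "the check ran and the host is alive/dead" from "the check itself could not be completed". The sketch below models that interpretation with a small stand-in answer type; the real mapping lives in KVMHostActivityChecker.checkHostStatusWithSameHost further down.

public class CheckOnHostAnswerSketch {
    enum Status { Up, Down, Disconnected }

    // Stand-in for CheckOnHostAnswer: result=false means the check itself failed.
    static class AnswerStub {
        final boolean result;
        final boolean alive;
        AnswerStub(boolean result, boolean alive) { this.result = result; this.alive = alive; }
    }

    static Status interpret(AnswerStub answer) {
        if (answer == null) {
            return Status.Disconnected; // no answer at all from the agent
        }
        if (!answer.result) {
            return Status.Down;         // the check ran into trouble; treat as down
        }
        return answer.alive ? Status.Up : Status.Down;
    }

    public static void main(String[] args) {
        System.out.println(interpret(new AnswerStub(true, true)));  // Up
        System.out.println(interpret(new AnswerStub(true, false))); // Down
        System.out.println(interpret(null));                        // Disconnected
    }
}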
@@ -49,8 +49,8 @@ public final class LibvirtCheckVMActivityOnStoragePoolCommandWrapper extends Com
KVMStoragePool primaryPool = storagePoolMgr.getStoragePool(pool.getType(), pool.getUuid());

if (primaryPool.isPoolSupportHA()) {
- final HAStoragePool nfspool = monitor.getStoragePool(pool.getUuid());
- final KVMHAVMActivityChecker ha = new KVMHAVMActivityChecker(nfspool, command.getHost(), command.getVolumeList(), libvirtComputingResource.getVmActivityCheckPath(), command.getSuspectTimeInSeconds());
+ final HAStoragePool haPool = monitor.getStoragePool(pool.getUuid());
+ final KVMHAVMActivityChecker ha = new KVMHAVMActivityChecker(haPool, command.getHost(), command.getVolumeList(), libvirtComputingResource.getVmActivityCheckPath(), command.getSuspectTimeInSeconds());
final Future<Boolean> future = executors.submit(ha);
try {
final Boolean result = future.get();
@@ -208,12 +208,12 @@ public class IscsiAdmStoragePool implements KVMStoragePool {
}

@Override
- public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) {
+ public Boolean hasHeartBeat(HAStoragePool pool, HostTO host) {
return null;
}

@Override
- public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration) {
+ public Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration) {
return null;
}
@@ -33,35 +33,33 @@ import com.cloud.storage.Storage.StoragePoolType;

public interface KVMStoragePool {

- public static final long HeartBeatUpdateTimeout = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.HEARTBEAT_UPDATE_TIMEOUT);
- public static final long HeartBeatUpdateFreq = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_FREQUENCY);
- public static final long HeartBeatUpdateMaxTries = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_MAX_TRIES);
- public static final long HeartBeatUpdateRetrySleep = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_RETRY_SLEEP);
- public static final long HeartBeatCheckerTimeout = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_CHECKER_TIMEOUT);
+ long HeartBeatUpdateTimeoutInMs = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.HEARTBEAT_UPDATE_TIMEOUT);
+ long HeartBeatUpdateFreqInMs = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_FREQUENCY);
+ long HeartBeatCheckerTimeoutInMs = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_CHECKER_TIMEOUT);

- public default KVMPhysicalDisk createPhysicalDisk(String volumeUuid, PhysicalDiskFormat format, Storage.ProvisioningType provisioningType, long size, Long usableSize, byte[] passphrase) {
+ default KVMPhysicalDisk createPhysicalDisk(String volumeUuid, PhysicalDiskFormat format, Storage.ProvisioningType provisioningType, long size, Long usableSize, byte[] passphrase) {
return createPhysicalDisk(volumeUuid, format, provisioningType, size, passphrase);
}

- public KVMPhysicalDisk createPhysicalDisk(String volumeUuid, PhysicalDiskFormat format, Storage.ProvisioningType provisioningType, long size, byte[] passphrase);
+ KVMPhysicalDisk createPhysicalDisk(String volumeUuid, PhysicalDiskFormat format, Storage.ProvisioningType provisioningType, long size, byte[] passphrase);

- public KVMPhysicalDisk createPhysicalDisk(String volumeUuid, Storage.ProvisioningType provisioningType, long size, byte[] passphrase);
+ KVMPhysicalDisk createPhysicalDisk(String volumeUuid, Storage.ProvisioningType provisioningType, long size, byte[] passphrase);

- public boolean connectPhysicalDisk(String volumeUuid, Map<String, String> details);
+ boolean connectPhysicalDisk(String volumeUuid, Map<String, String> details);

- public KVMPhysicalDisk getPhysicalDisk(String volumeUuid);
+ KVMPhysicalDisk getPhysicalDisk(String volumeUuid);

- public boolean disconnectPhysicalDisk(String volumeUuid);
+ boolean disconnectPhysicalDisk(String volumeUuid);

- public boolean deletePhysicalDisk(String volumeUuid, Storage.ImageFormat format);
+ boolean deletePhysicalDisk(String volumeUuid, Storage.ImageFormat format);

- public List<KVMPhysicalDisk> listPhysicalDisks();
+ List<KVMPhysicalDisk> listPhysicalDisks();

- public String getUuid();
+ String getUuid();

- public long getCapacity();
+ long getCapacity();

- public long getUsed();
+ long getUsed();

default Long getCapacityIops() {
return null;
@@ -71,51 +69,51 @@ public interface KVMStoragePool {
return null;
}

- public long getAvailable();
+ long getAvailable();

- public boolean refresh();
+ boolean refresh();

- public boolean isExternalSnapshot();
+ boolean isExternalSnapshot();

- public String getLocalPath();
+ String getLocalPath();

- public String getSourceHost();
+ String getSourceHost();

- public String getSourceDir();
+ String getSourceDir();

- public int getSourcePort();
+ int getSourcePort();

- public String getAuthUserName();
+ String getAuthUserName();

- public String getAuthSecret();
+ String getAuthSecret();

- public StoragePoolType getType();
+ StoragePoolType getType();

- public boolean delete();
+ boolean delete();

PhysicalDiskFormat getDefaultFormat();

- public boolean createFolder(String path);
+ boolean createFolder(String path);

- public boolean supportsConfigDriveIso();
+ boolean supportsConfigDriveIso();

- public Map<String, String> getDetails();
+ Map<String, String> getDetails();

default String getLocalPathFor(String relativePath) {
    return String.format("%s%s%s", getLocalPath(), File.separator, relativePath);
}

- public boolean isPoolSupportHA();
+ boolean isPoolSupportHA();

- public String getHearthBeatPath();
+ String getHearthBeatPath();

- public String createHeartBeatCommand(HAStoragePool primaryStoragePool, String hostPrivateIp, boolean hostValidation);
+ String createHeartBeatCommand(HAStoragePool primaryStoragePool, String hostPrivateIp, boolean hostValidation);

- public String getStorageNodeId();
+ String getStorageNodeId();

- public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host);
+ Boolean hasHeartBeat(HAStoragePool pool, HostTO host);

- public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration);
+ Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration);

default LibvirtVMDef.DiskDef.BlockIOSize getSupportedLogicalBlockSize() {
return null;
@@ -345,16 +345,14 @@ public class LibvirtStoragePool implements KVMStoragePool {

public String createHeartBeatCommand(HAStoragePool primaryStoragePool, String hostPrivateIp, boolean hostValidation) {
- Script cmd = new Script(primaryStoragePool.getPool().getHearthBeatPath(), HeartBeatUpdateTimeout, logger);
+ Script cmd = new Script(primaryStoragePool.getPool().getHearthBeatPath(), HeartBeatUpdateTimeoutInMs, logger);
cmd.add("-i", primaryStoragePool.getPoolIp());
cmd.add("-p", primaryStoragePool.getPoolMountSourcePath());
cmd.add("-m", primaryStoragePool.getMountDestPath());

if (hostValidation) {
cmd.add("-h", hostPrivateIp);
- }
- if (!hostValidation) {
+ } else {
cmd.add("-c");
}

@@ -372,53 +370,53 @@ public class LibvirtStoragePool implements KVMStoragePool {
}

@Override
- public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) {
- boolean validResult = false;
+ public Boolean hasHeartBeat(HAStoragePool pool, HostTO host) {
String hostIp = host.getPrivateNetwork().getIp();
- Script cmd = new Script(getHearthBeatPath(), HeartBeatCheckerTimeout, logger);
+ Script cmd = new Script(getHearthBeatPath(), HeartBeatCheckerTimeoutInMs, logger);
cmd.add("-i", pool.getPoolIp());
cmd.add("-p", pool.getPoolMountSourcePath());
cmd.add("-m", pool.getMountDestPath());
cmd.add("-h", hostIp);
cmd.add("-r");
- cmd.add("-t", String.valueOf(HeartBeatUpdateFreq / 1000));
+ cmd.add("-t", String.valueOf(HeartBeatUpdateFreqInMs / 1000));
OutputInterpreter.OneLineParser parser = new OutputInterpreter.OneLineParser();
String result = cmd.execute(parser);
String parsedLine = parser.getLine();

- logger.debug(String.format("Checking heart beat with KVMHAChecker [{command=\"%s\", result: \"%s\", log: \"%s\", pool: \"%s\"}].", cmd.toString(), result, parsedLine,
- pool.getPoolIp()));
+ logger.debug("Checking heart beat for host IP {} with KVMHAChecker [{command=\"{}\", result: \"{}\", log: \"{}\", pool: \"{}\"}].", hostIp, cmd.toString(), result, parsedLine, pool.getPoolIp());

if (result == null && parsedLine.contains("DEAD")) {
- logger.warn(String.format("Checking heart beat with KVMHAChecker command [%s] returned [%s]. [%s]. It may cause a shutdown of host IP [%s].", cmd.toString(),
- result, parsedLine, hostIp));
+ logger.warn("Checking heart beat for host IP {} with KVMHAChecker command [{}] returned [{}]. It may cause a shutdown of the host.", hostIp, cmd.toString(), parsedLine);
+ return false;
} else {
- validResult = true;
+ logger.debug("Checking heart beat for host IP {} with KVMHAChecker command [{}] succeeded.", hostIp, cmd.toString());
+ return true;
}
- return validResult;
}

@Override
- public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration) {
+ public Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration) {
+ String hostIp = host.getPrivateNetwork().getIp();
Script cmd = new Script(vmActivityCheckPath, activityScriptTimeout.getStandardSeconds(), logger);
cmd.add("-i", pool.getPoolIp());
cmd.add("-p", pool.getPoolMountSourcePath());
cmd.add("-m", pool.getMountDestPath());
- cmd.add("-h", host.getPrivateNetwork().getIp());
+ cmd.add("-h", hostIp);
cmd.add("-u", volumeUUIDListString);
- cmd.add("-t", String.valueOf(String.valueOf(System.currentTimeMillis() / 1000)));
+ cmd.add("-t", String.valueOf(System.currentTimeMillis() / 1000));
cmd.add("-d", String.valueOf(duration));
OutputInterpreter.OneLineParser parser = new OutputInterpreter.OneLineParser();

String result = cmd.execute(parser);
String parsedLine = parser.getLine();

- logger.debug(String.format("Checking heart beat with KVMHAVMActivityChecker [{command=\"%s\", result: \"%s\", log: \"%s\", pool: \"%s\"}].", cmd.toString(), result, parsedLine, pool.getPoolIp()));
+ logger.debug("Checking VM activity for host IP {} with KVMHAVMActivityChecker [{command=\"{}\", result: \"{}\", log: \"{}\", pool: \"{}\"}].", hostIp, cmd.toString(), result, parsedLine, pool.getPoolIp());

if (result == null && parsedLine.contains("DEAD")) {
- logger.warn(String.format("Checking heart beat with KVMHAVMActivityChecker command [%s] returned [%s]. It is [%s]. It may cause a shutdown of host IP [%s].", cmd.toString(), result, parsedLine, host.getPrivateNetwork().getIp()));
+ logger.warn("Checking VM activity for host IP {} with KVMHAVMActivityChecker command [{}] returned [{}]. It may cause a shutdown of the host.", hostIp, cmd.toString(), parsedLine);
return false;
} else {
+ logger.debug("Checking VM activity for host IP {} with KVMHAVMActivityChecker command [{}] succeeded.", hostIp, cmd.toString());
return true;
}
}
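For reference, the heartbeat check above is just a script invocation whose arguments are assembled from the pool and host details. The sketch below only assembles an argument list the same way; the script path and sample values are hypothetical, and the meaning of the individual flags belongs to the heartbeat script itself, not to this sketch.

import java.util.ArrayList;
import java.util.List;

public class HeartBeatCommandSketch {
    static final long HEART_BEAT_UPDATE_FREQ_IN_MS = 60000L;

    static List<String> buildCheckCommand(String scriptPath, String poolIp, String poolSourcePath,
                                          String mountDestPath, String hostIp) {
        List<String> cmd = new ArrayList<>();
        cmd.add(scriptPath);
        cmd.add("-i"); cmd.add(poolIp);          // pool address
        cmd.add("-p"); cmd.add(poolSourcePath);  // pool mount source path
        cmd.add("-m"); cmd.add(mountDestPath);   // local mount destination
        cmd.add("-h"); cmd.add(hostIp);          // private IP of the host being checked
        cmd.add("-r");                           // check mode, as used by hasHeartBeat above
        cmd.add("-t"); cmd.add(String.valueOf(HEART_BEAT_UPDATE_FREQ_IN_MS / 1000));
        return cmd;
    }

    public static void main(String[] args) {
        System.out.println(String.join(" ", buildCheckCommand("kvmheartbeat.sh",
                "10.1.1.5", "10.1.1.5:/export/primary", "/mnt/primary", "10.1.1.21")));
    }
}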
@@ -225,13 +225,13 @@ public class MultipathSCSIPool implements KVMStoragePool {
}

@Override
- public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) {
+ public Boolean hasHeartBeat(HAStoragePool pool, HostTO host) {
return null;
}

@Override
- public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration activityScriptTimeout,
- String volumeUUIDListString, String vmActivityCheckPath, long duration) {
+ public Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration activityScriptTimeout,
+ String volumeUUIDListString, String vmActivityCheckPath, long duration) {
return null;
}
@@ -236,12 +236,12 @@ public class ScaleIOStoragePool implements KVMStoragePool {
}

@Override
- public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) {
+ public Boolean hasHeartBeat(HAStoragePool pool, HostTO host) {
return null;
}

@Override
- public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration) {
+ public Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration) {
return null;
}
}
@@ -19,38 +19,37 @@ package org.apache.cloudstack.kvm.ha;

import org.apache.cloudstack.framework.config.ConfigKey;

- public class KVMHAConfig {
+ public interface KVMHAConfig {

- public static final ConfigKey<Long> KvmHAHealthCheckTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.health.check.timeout", "10",
+ ConfigKey<Long> KvmHAHealthCheckTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.health.check.timeout", "10",
"The maximum length of time, in seconds, expected for an health check to complete.", true, ConfigKey.Scope.Cluster);

- public static final ConfigKey<Long> KvmHAActivityCheckTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.activity.check.timeout", "60",
+ ConfigKey<Long> KvmHAActivityCheckTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.activity.check.timeout", "60",
"The maximum length of time, in seconds, expected for an activity check to complete.", true, ConfigKey.Scope.Cluster);

- public static final ConfigKey<Long> KvmHAActivityCheckInterval = new ConfigKey<>("Advanced", Long.class, "kvm.ha.activity.check.interval", "60",
+ ConfigKey<Long> KvmHAActivityCheckInterval = new ConfigKey<>("Advanced", Long.class, "kvm.ha.activity.check.interval", "60",
"The interval, in seconds, between activity checks.", true, ConfigKey.Scope.Cluster);

- public static final ConfigKey<Long> KvmHAActivityCheckMaxAttempts = new ConfigKey<>("Advanced", Long.class, "kvm.ha.activity.check.max.attempts", "10",
+ ConfigKey<Long> KvmHAActivityCheckMaxAttempts = new ConfigKey<>("Advanced", Long.class, "kvm.ha.activity.check.max.attempts", "10",
"The maximum number of activity check attempts to perform before deciding to recover or degrade a resource.", true, ConfigKey.Scope.Cluster);

- public static final ConfigKey<Double> KvmHAActivityCheckFailureThreshold = new ConfigKey<>("Advanced", Double.class, "kvm.ha.activity.check.failure.ratio", "0.7",
+ ConfigKey<Double> KvmHAActivityCheckFailureThreshold = new ConfigKey<>("Advanced", Double.class, "kvm.ha.activity.check.failure.ratio", "0.7",
"The activity check failure threshold ratio. This is used with the activity check maximum attempts for deciding to recover or degrade a resource. For most environments, please keep this value above 0.5.",
true, ConfigKey.Scope.Cluster);

- public static final ConfigKey<Long> KvmHADegradedMaxPeriod = new ConfigKey<>("Advanced", Long.class, "kvm.ha.degraded.max.period", "300",
+ ConfigKey<Long> KvmHADegradedMaxPeriod = new ConfigKey<>("Advanced", Long.class, "kvm.ha.degraded.max.period", "300",
"The maximum length of time, in seconds, a resource can be in degraded state where only health checks are performed.", true, ConfigKey.Scope.Cluster);

- public static final ConfigKey<Long> KvmHARecoverTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.recover.timeout", "60",
+ ConfigKey<Long> KvmHARecoverTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.recover.timeout", "60",
"The maximum length of time, in seconds, expected for a recovery operation to complete.", true, ConfigKey.Scope.Cluster);

- public static final ConfigKey<Long> KvmHARecoverWaitPeriod = new ConfigKey<>("Advanced", Long.class, "kvm.ha.recover.wait.period", "600",
+ ConfigKey<Long> KvmHARecoverWaitPeriod = new ConfigKey<>("Advanced", Long.class, "kvm.ha.recover.wait.period", "600",
"The maximum length of time, in seconds, to wait for a resource to recover.", true, ConfigKey.Scope.Cluster);

- public static final ConfigKey<Long> KvmHARecoverAttemptThreshold = new ConfigKey<>("Advanced", Long.class, "kvm.ha.recover.failure.threshold", "1",
+ ConfigKey<Long> KvmHARecoverAttemptThreshold = new ConfigKey<>("Advanced", Long.class, "kvm.ha.recover.failure.threshold", "1",
"The maximum recovery attempts to be made for a resource, after which the resource is fenced. The recovery counter resets when a health check passes for a resource.",
true, ConfigKey.Scope.Cluster);

- public static final ConfigKey<Long> KvmHAFenceTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.fence.timeout", "60",
+ ConfigKey<Long> KvmHAFenceTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.fence.timeout", "60",
"The maximum length of time, in seconds, expected for a fence operation to complete.", true, ConfigKey.Scope.Cluster);

}
@@ -68,17 +68,18 @@ public final class KVMHAProvider extends HAAbstractHostProvider implements HAPro

@Override
public boolean recover(Host r) throws HARecoveryException {
+ logger.debug("Recover the host {}", r);
try {
- if (outOfBandManagementService.isOutOfBandManagementEnabled(r)){
+ if (outOfBandManagementService.isOutOfBandManagementEnabled(r)) {
final OutOfBandManagementResponse resp = outOfBandManagementService.executePowerOperation(r, PowerOperation.RESET, null);
return resp.getSuccess();
} else {
logger.warn("OOBM recover operation failed for the host {}", r);
return false;
}
- } catch (Exception e){
+ } catch (Exception e) {
logger.warn("OOBM service is not configured or enabled for this host {} error is {}", r, e.getMessage());
- throw new HARecoveryException(String.format(" OOBM service is not configured or enabled for this host %s", r), e);
+ throw new HARecoveryException(String.format("OOBM service is not configured or enabled for this host %s", r), e);
}
}
@ -19,6 +19,7 @@ package org.apache.cloudstack.kvm.ha;
|
|||
|
||||
import com.cloud.agent.AgentManager;
|
||||
import com.cloud.agent.api.Answer;
|
||||
import com.cloud.agent.api.CheckOnHostAnswer;
|
||||
import com.cloud.agent.api.CheckOnHostCommand;
|
||||
import com.cloud.agent.api.CheckVMActivityOnStoragePoolCommand;
|
||||
import com.cloud.dc.dao.ClusterDao;
|
||||
|
|
@ -61,7 +62,7 @@ public class KVMHostActivityChecker extends AdapterBase implements ActivityCheck
|
|||
@Inject
|
||||
private AgentManager agentMgr;
|
||||
@Inject
|
||||
private PrimaryDataStoreDao storagePool;
|
||||
private PrimaryDataStoreDao storagePoolDao;
|
||||
@Inject
|
||||
private StorageManager storageManager;
|
||||
@Inject
|
||||
|
|
@ -70,11 +71,11 @@ public class KVMHostActivityChecker extends AdapterBase implements ActivityCheck
|
|||
@Override
|
||||
public boolean isActive(Host r, DateTime suspectTime) throws HACheckerException {
|
||||
try {
|
||||
return isVMActivityOnHost(r, suspectTime);
|
||||
return hasVMActivityOnHost(r, suspectTime);
|
||||
} catch (HACheckerException e) {
|
||||
//Re-throwing the exception to avoid poluting the 'HACheckerException' already thrown
|
||||
//Re-throwing the exception to avoid polluting the 'HACheckerException' already thrown
|
||||
throw e;
|
||||
} catch (Exception e){
|
||||
} catch (Exception e) {
|
||||
String message = String.format("Operation timed out, probably the %s is not reachable.", r.toString());
|
||||
logger.warn(message, e);
|
||||
throw new HACheckerException(message, e);
|
||||
|
|
@ -83,82 +84,115 @@ public class KVMHostActivityChecker extends AdapterBase implements ActivityCheck
|
|||
|
||||
@Override
|
||||
public boolean isHealthy(Host r) {
|
||||
return isAgentActive(r);
|
||||
return isHostAgentUp(r);
|
||||
}
|
||||
|
||||
private boolean isAgentActive(Host agent) {
|
||||
if (agent.getHypervisorType() != Hypervisor.HypervisorType.KVM && agent.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
|
||||
throw new IllegalStateException(String.format("Calling KVM investigator for non KVM Host of type [%s].", agent.getHypervisorType()));
|
||||
private boolean isHostAgentUp(Host host) {
|
||||
if (host.getHypervisorType() != Hypervisor.HypervisorType.KVM && host.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
|
||||
throw new IllegalStateException(String.format("Calling KVM investigator for non KVM Host of type [%s].", host.getHypervisorType()));
|
||||
}
|
||||
Status hostStatus = Status.Unknown;
|
||||
Status neighbourStatus = Status.Unknown;
|
||||
final CheckOnHostCommand cmd = new CheckOnHostCommand(agent, HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.value());
|
||||
try {
|
||||
logger.debug(String.format("Checking %s status...", agent.toString()));
|
||||
Answer answer = agentMgr.easySend(agent.getId(), cmd);
|
||||
if (answer != null) {
|
||||
hostStatus = answer.getResult() ? Status.Down : Status.Up;
|
||||
logger.debug(String.format("%s has the status [%s].", agent.toString(), hostStatus));
|
||||
|
||||
if ( hostStatus == Status.Up ){
|
||||
return true;
|
||||
Status hostStatus = getHostAgentStatus(host);
|
||||
|
||||
logger.debug("{} has the status [{}].", host.toString(), hostStatus);
|
||||
return hostStatus == Status.Up;
|
||||
}
|
||||
|
||||
public Status getHostAgentStatus(Host host) {
|
||||
if (host.getHypervisorType() != Hypervisor.HypervisorType.KVM && host.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
|
||||
return null;
|
||||
}
|
||||
|
||||
Status hostStatusFromItself = checkHostStatusWithSameHost(host);
|
||||
if (hostStatusFromItself == Status.Up) {
|
||||
return Status.Up;
|
||||
}
|
||||
|
||||
Status hostStatusFromNeighbour = checkHostStatusWithNeighbourHosts(host);
|
||||
Status hostStatus = hostStatusFromItself;
|
||||
if (hostStatusFromNeighbour == Status.Up && (hostStatusFromItself == Status.Disconnected || hostStatusFromItself == Status.Down)) {
|
||||
hostStatus = Status.Disconnected;
|
||||
}
|
||||
if (hostStatusFromNeighbour == Status.Down && (hostStatusFromItself == Status.Disconnected || hostStatusFromItself == Status.Down)) {
|
||||
hostStatus = Status.Down;
|
||||
}
|
||||
|
||||
logger.debug("HA: HOST is ineligible legacy state {} for host {}", hostStatus, host);
|
||||
return hostStatus;
|
||||
}
|
||||
|
||||
private Status checkHostStatusWithSameHost(Host host) {
|
||||
Status hostStatus;
|
||||
boolean reportFailureIfOneStorageIsDown = HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.value();
|
||||
final CheckOnHostCommand cmd = new CheckOnHostCommand(host, reportFailureIfOneStorageIsDown);
|
||||
try {
|
||||
logger.debug("Checking {} status...", host.toString());
|
||||
Answer answer = agentMgr.easySend(host.getId(), cmd);
|
||||
if (answer != null) {
|
||||
if (answer.getResult()) {
|
||||
hostStatus = ((CheckOnHostAnswer)answer).isAlive() ? Status.Up : Status.Down;
|
||||
} else {
|
||||
logger.debug("{} is not active according to itself, details: {}.", host.toString(), answer.getDetails());
|
||||
hostStatus = Status.Down;
|
||||
}
|
||||
}
|
||||
else {
|
||||
logger.debug(String.format("Setting %s to \"Disconnected\" status.", agent.toString()));
|
||||
logger.debug("{} has the status [{}].", host.toString(), hostStatus);
|
||||
} else {
|
||||
logger.debug("Setting {} to \"Disconnected\" status.", host.toString());
|
||||
hostStatus = Status.Disconnected;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
logger.warn(String.format("Failed to send command CheckOnHostCommand to %s.", agent.toString()), e);
|
||||
logger.warn("Failed to send command CheckOnHostCommand to {}.", host.toString(), e);
|
||||
hostStatus = Status.Disconnected;
|
||||
}
|
||||
|
||||
List<HostVO> neighbors = resourceManager.listHostsInClusterByStatus(agent.getClusterId(), Status.Up);
|
||||
return hostStatus;
|
||||
}

private Status checkHostStatusWithNeighbourHosts(Host host) {
Status hostStatusFromNeighbour = Status.Unknown;
boolean reportFailureIfOneStorageIsDown = HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.value();
final CheckOnHostCommand cmd = new CheckOnHostCommand(host, reportFailureIfOneStorageIsDown);
List<HostVO> neighbors = resourceManager.listHostsInClusterByStatus(host.getClusterId(), Status.Up);
for (HostVO neighbor : neighbors) {
if (neighbor.getId() == agent.getId() || (neighbor.getHypervisorType() != Hypervisor.HypervisorType.KVM && neighbor.getHypervisorType() != Hypervisor.HypervisorType.LXC)) {
if (neighbor.getId() == host.getId()
|| (neighbor.getHypervisorType() != Hypervisor.HypervisorType.KVM && neighbor.getHypervisorType() != Hypervisor.HypervisorType.LXC)) {
continue;
}

try {
logger.debug(String.format("Investigating %s via neighbouring %s.", agent.toString(), neighbor.toString()));

logger.debug("Investigating {} via neighboring {}.", host.toString(), neighbor.toString());
Answer answer = agentMgr.easySend(neighbor.getId(), cmd);
if (answer != null) {
neighbourStatus = answer.getResult() ? Status.Down : Status.Up;

logger.debug(String.format("Neighbouring %s returned status [%s] for the investigated %s.", neighbor.toString(), neighbourStatus, agent.toString()));

if (neighbourStatus == Status.Up) {
break;
if (answer.getResult()) {
hostStatusFromNeighbour = ((CheckOnHostAnswer)answer).isAlive() ? Status.Up : Status.Down;
logger.debug("Neighboring {} returned status [{}] for the investigated {}.", neighbor.toString(), hostStatusFromNeighbour, host.toString());
if (hostStatusFromNeighbour == Status.Up) {
return hostStatusFromNeighbour;
}
} else {
logger.debug("{} is not active according to neighbor {}, details: {}.", host.toString(), neighbor.toString(), answer.getDetails());
}
} else {
logger.debug(String.format("Neighbouring %s is Disconnected.", neighbor.toString()));
logger.debug("Neighboring {} is Disconnected.", neighbor.toString());
}
} catch (Exception e) {
logger.warn(String.format("Failed to send command CheckOnHostCommand to %s.", neighbor.toString()), e);
logger.warn("Failed to send command CheckOnHostCommand to neighbor {}.", neighbor.toString(), e);
}
}
if (neighbourStatus == Status.Up && (hostStatus == Status.Disconnected || hostStatus == Status.Down)) {
hostStatus = Status.Disconnected;
}
if (neighbourStatus == Status.Down && (hostStatus == Status.Disconnected || hostStatus == Status.Down)) {
hostStatus = Status.Down;
}

logger.debug(String.format("%s has the status [%s].", agent.toString(), hostStatus));

return hostStatus == Status.Up;
return hostStatusFromNeighbour;
}

private boolean isVMActivityOnHost(Host agent, DateTime suspectTime) throws HACheckerException {
if (agent.getHypervisorType() != Hypervisor.HypervisorType.KVM && agent.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
throw new IllegalStateException(String.format("Calling KVM investigator for non KVM Host of type [%s].", agent.getHypervisorType()));
private boolean hasVMActivityOnHost(Host host, DateTime suspectTime) throws HACheckerException {
if (host.getHypervisorType() != Hypervisor.HypervisorType.KVM && host.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
throw new IllegalStateException(String.format("Calling KVM investigator for non KVM Host of type [%s].", host.getHypervisorType()));
}
boolean activityStatus = true;
HashMap<StoragePool, List<Volume>> poolVolMap = getVolumeUuidOnHost(agent);
for (StoragePool pool : poolVolMap.keySet()) {
activityStatus = verifyActivityOfStorageOnHost(poolVolMap, pool, agent, suspectTime, activityStatus);
HashMap<StoragePool, List<Volume>> poolVolumeMap = getStoragePoolAndVolumeInfoOnHost(host);
for (StoragePool pool : poolVolumeMap.keySet()) {
activityStatus = verifyActivityOfStorageOnHost(poolVolumeMap, pool, host, suspectTime, activityStatus);
if (!activityStatus) {
logger.warn("It seems that the storage pool [{}] does not have activity on {}.", pool, agent);
logger.warn("It seems that the storage pool [{}] does not have activity on {}.", pool, host);
break;
}
}

@@ -166,66 +200,64 @@ public class KVMHostActivityChecker extends AdapterBase implements ActivityCheck
return activityStatus;
}

protected boolean verifyActivityOfStorageOnHost(HashMap<StoragePool, List<Volume>> poolVolMap, StoragePool pool, Host agent, DateTime suspectTime, boolean activityStatus) throws HACheckerException, IllegalStateException {
protected boolean verifyActivityOfStorageOnHost(HashMap<StoragePool, List<Volume>> poolVolMap, StoragePool pool, Host host, DateTime suspectTime, boolean activityStatus) throws HACheckerException, IllegalStateException {
List<Volume> volume_list = poolVolMap.get(pool);
final CheckVMActivityOnStoragePoolCommand cmd = new CheckVMActivityOnStoragePoolCommand(agent, pool, volume_list, suspectTime);
final CheckVMActivityOnStoragePoolCommand cmd = new CheckVMActivityOnStoragePoolCommand(host, pool, volume_list, suspectTime);

logger.debug("Checking VM activity for {} on storage pool [{}].", agent.toString(), pool);
logger.debug("Checking VM activity for {} on storage pool [{}].", host.toString(), pool);
try {
Answer answer = storageManager.sendToPool(pool, getNeighbors(agent), cmd);

Answer answer = storageManager.sendToPool(pool, getNeighbors(host), cmd);
if (answer != null) {
activityStatus = !answer.getResult();
logger.debug("{} {} activity on storage pool [{}]", agent.toString(), activityStatus ? "has" : "does not have", pool);
logger.debug("{} {} activity on storage pool [{}]", host.toString(), activityStatus ? "has" : "does not have", pool);
} else {
String message = String.format("Did not get a valid response for VM activity check for %s on storage pool [%s].", agent.toString(), pool);
String message = String.format("Did not get a valid response for VM activity check for %s on storage pool [%s].", host.toString(), pool);
logger.debug(message);
throw new IllegalStateException(message);
}
} catch (StorageUnavailableException e){
String message = String.format("Storage [%s] is unavailable to do the check, probably the %s is not reachable.", pool, agent);
} catch (StorageUnavailableException e) {
String message = String.format("Storage [%s] is unavailable to do the check, probably the %s is not reachable.", pool, host);
logger.warn(message, e);
throw new HACheckerException(message, e);
}
return activityStatus;
}

private HashMap<StoragePool, List<Volume>> getVolumeUuidOnHost(Host agent) {
List<VMInstanceVO> vm_list = vmInstanceDao.listByHostId(agent.getId());
List<VolumeVO> volume_list = new ArrayList<VolumeVO>();
for (VirtualMachine vm : vm_list) {
private HashMap<StoragePool, List<Volume>> getStoragePoolAndVolumeInfoOnHost(Host host) {
List<VMInstanceVO> vmListOnHost = vmInstanceDao.listByHostId(host.getId());
List<VolumeVO> volumeListOnHost = new ArrayList<>();
for (VirtualMachine vm : vmListOnHost) {
logger.debug("Retrieving volumes of VM [{}]...", vm);
List<VolumeVO> vm_volume_list = volumeDao.findByInstance(vm.getId());
volume_list.addAll(vm_volume_list);
List<VolumeVO> volumeListOfVM = volumeDao.findByInstance(vm.getId());
volumeListOnHost.addAll(volumeListOfVM);
}

HashMap<StoragePool, List<Volume>> poolVolMap = new HashMap<StoragePool, List<Volume>>();
for (Volume vol : volume_list) {
StoragePool sp = storagePool.findById(vol.getPoolId());
logger.debug("Retrieving storage pool [{}] of volume [{}]...", sp, vol);
if (!poolVolMap.containsKey(sp)) {
List<Volume> list = new ArrayList<Volume>();
list.add(vol);
HashMap<StoragePool, List<Volume>> poolVolumeMap = new HashMap<>();
for (Volume volume : volumeListOnHost) {
StoragePool pool = storagePoolDao.findById(volume.getPoolId());
logger.debug("Retrieving storage pool [{}] of volume [{}]...", pool, volume);
if (!poolVolumeMap.containsKey(pool)) {
List<Volume> volList = new ArrayList<>();
volList.add(volume);

poolVolMap.put(sp, list);
poolVolumeMap.put(pool, volList);
} else {
poolVolMap.get(sp).add(vol);
poolVolumeMap.get(pool).add(volume);
}
}
return poolVolMap;
return poolVolumeMap;
}

public long[] getNeighbors(Host agent) {
List<Long> neighbors = new ArrayList<Long>();
List<HostVO> cluster_hosts = resourceManager.listHostsInClusterByStatus(agent.getClusterId(), Status.Up);
logger.debug("Retrieving all \"Up\" hosts from cluster [{}]...", clusterDao.findById(agent.getClusterId()));
for (HostVO host : cluster_hosts) {
if (host.getId() == agent.getId() || (host.getHypervisorType() != Hypervisor.HypervisorType.KVM && host.getHypervisorType() != Hypervisor.HypervisorType.LXC)) {
public long[] getNeighbors(Host host) {
List<Long> neighbors = new ArrayList<>();
List<HostVO> clusterHosts = resourceManager.listHostsInClusterByStatus(host.getClusterId(), Status.Up);
logger.debug("Retrieving all \"Up\" hosts from cluster [{}]...", clusterDao.findById(host.getClusterId()));
for (HostVO clusterHost : clusterHosts) {
if (clusterHost.getId() == host.getId() || (clusterHost.getHypervisorType() != Hypervisor.HypervisorType.KVM && clusterHost.getHypervisorType() != Hypervisor.HypervisorType.LXC)) {
continue;
}
neighbors.add(host.getId());
neighbors.add(clusterHost.getId());
}
return ArrayUtils.toPrimitive(neighbors.toArray(new Long[neighbors.size()]));
}

}
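
The activity checker above first asks the suspect host about itself and only then consults its cluster neighbours, combining both answers per the rules in getHostAgentStatus. The following standalone sketch (not part of this patch; the Status enum is a hypothetical stand-in for com.cloud.host.Status) distills that combination logic:

enum Status { Up, Down, Disconnected, Unknown }

final class HostStatusCombiner {
    // Mirrors getHostAgentStatus above: a self-reported Up wins outright; otherwise a
    // neighbour that still sees the host alive yields Disconnected (conflicting evidence),
    // while a neighbour that also reports failure confirms Down.
    static Status combine(Status fromItself, Status fromNeighbour) {
        if (fromItself == Status.Up) {
            return Status.Up;
        }
        Status result = fromItself;
        boolean selfLooksBad = fromItself == Status.Disconnected || fromItself == Status.Down;
        if (fromNeighbour == Status.Up && selfLooksBad) {
            result = Status.Disconnected;
        }
        if (fromNeighbour == Status.Down && selfLooksBad) {
            result = Status.Down;
        }
        return result;
    }

    public static void main(String[] args) {
        System.out.println(combine(Status.Down, Status.Up));   // Disconnected
        System.out.println(combine(Status.Down, Status.Down)); // Down
    }
}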
|
||||
|
|
|
|||
|
|
@ -3133,7 +3133,7 @@ public class LibvirtComputingResourceTest {
|
|||
assertNotNull(wrapper);
|
||||
|
||||
final Answer answer = wrapper.execute(command, libvirtComputingResourceMock);
|
||||
assertTrue(answer.getResult());
|
||||
assertFalse(answer.getResult());
|
||||
|
||||
verify(libvirtComputingResourceMock, times(1)).getMonitor();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -54,13 +54,13 @@ public class SimulatorInvestigator extends AdapterBase implements Investigator {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Status isAgentAlive(Host agent) {
|
||||
public Status getHostAgentStatus(Host agent) {
|
||||
if (agent.getHypervisorType() != HypervisorType.Simulator) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (haManager.isHAEligible(agent)) {
|
||||
return haManager.getHostStatus(agent);
|
||||
return haManager.getHostStatusFromHAConfig(agent);
|
||||
}
|
||||
|
||||
CheckOnHostCommand cmd = new CheckOnHostCommand(agent);
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ public class VmwareInvestigator extends AdapterBase implements Investigator {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Status isAgentAlive(Host agent) {
|
||||
public Status getHostAgentStatus(Host agent) {
|
||||
if (agent.getHypervisorType() == HypervisorType.VMware)
|
||||
return Status.Disconnected;
|
||||
|
||||
|
|
|
|||
|
|
@ -585,8 +585,8 @@ public class LinstorStorageAdaptor implements StorageAdaptor {
|
|||
Path propFile = diskPath.getParent().resolve("template.properties");
|
||||
if (Files.exists(propFile)) {
|
||||
java.util.Properties templateProps = new java.util.Properties();
|
||||
try {
|
||||
templateProps.load(new FileInputStream(propFile.toFile()));
|
||||
try (FileInputStream in = new FileInputStream(propFile.toFile())) {
|
||||
templateProps.load(in);
|
||||
String desc = templateProps.getProperty("description");
|
||||
if (desc != null && desc.startsWith("SystemVM Template")) {
|
||||
return true;
|
||||
|
|
|
|||
|
|
@ -228,11 +228,11 @@ public class LinstorStoragePool implements KVMStoragePool {
|
|||
public String createHeartBeatCommand(HAStoragePool pool, String hostPrivateIp,
|
||||
boolean hostValidation) {
|
||||
LOGGER.trace(String.format("Linstor.createHeartBeatCommand: %s, %s, %b", pool.getPoolIp(), hostPrivateIp, hostValidation));
|
||||
boolean isStorageNodeUp = checkingHeartBeat(pool, null);
|
||||
boolean isStorageNodeUp = hasHeartBeat(pool, null);
|
||||
if (!isStorageNodeUp && !hostValidation) {
|
||||
//restart the host
|
||||
LOGGER.debug(String.format("The host [%s] will be restarted because the health check failed for the storage pool [%s]", hostPrivateIp, pool.getPool().getType()));
|
||||
Script cmd = new Script(pool.getPool().getHearthBeatPath(), Duration.millis(HeartBeatUpdateTimeout), LOGGER);
|
||||
Script cmd = new Script(pool.getPool().getHearthBeatPath(), Duration.millis(HeartBeatUpdateTimeoutInMs), LOGGER);
|
||||
cmd.add("-c");
|
||||
cmd.execute();
|
||||
return "Down";
|
||||
|
|
@ -258,7 +258,7 @@ public class LinstorStoragePool implements KVMStoragePool {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) {
|
||||
public Boolean hasHeartBeat(HAStoragePool pool, HostTO host) {
|
||||
String hostName;
|
||||
if (host == null) {
|
||||
hostName = localNodeName;
|
||||
|
|
@ -274,7 +274,7 @@ public class LinstorStoragePool implements KVMStoragePool {
|
|||
}
|
||||
|
||||
private String executeDrbdSetupStatus(OutputInterpreter.AllLinesParser parser) {
|
||||
Script sc = new Script("drbdsetup", Duration.millis(HeartBeatUpdateTimeout), LOGGER);
|
||||
Script sc = new Script("drbdsetup", Duration.millis(HeartBeatUpdateTimeoutInMs), LOGGER);
|
||||
sc.add("status");
|
||||
sc.add("--json");
|
||||
return sc.execute(parser);
|
||||
|
|
@ -329,7 +329,7 @@ public class LinstorStoragePool implements KVMStoragePool {
|
|||
}
|
||||
|
||||
private String executeDrbdEventsNow(OutputInterpreter.AllLinesParser parser) {
|
||||
Script sc = new Script("drbdsetup", Duration.millis(HeartBeatUpdateTimeout), LOGGER);
|
||||
Script sc = new Script("drbdsetup", Duration.millis(HeartBeatUpdateTimeoutInMs), LOGGER);
|
||||
sc.add("events2");
|
||||
sc.add("--now");
|
||||
return sc.execute(parser);
|
||||
|
|
@ -369,8 +369,8 @@ public class LinstorStoragePool implements KVMStoragePool {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration) {
|
||||
public Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration) {
|
||||
LOGGER.trace(String.format("Linstor.vmActivityCheck: %s, %s", pool.getPoolIp(), host.getPrivateNetwork().getIp()));
|
||||
return checkingHeartBeat(pool, host);
|
||||
return hasHeartBeat(pool, host);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -198,11 +198,11 @@ public class StorPoolStoragePool implements KVMStoragePool {
|
|||
|
||||
@Override
|
||||
public String createHeartBeatCommand(HAStoragePool primaryStoragePool, String hostPrivateIp, boolean hostValidation) {
|
||||
boolean isStorageNodeUp = checkingHeartBeat(primaryStoragePool, null);
|
||||
boolean isStorageNodeUp = hasHeartBeat(primaryStoragePool, null);
|
||||
if (!isStorageNodeUp && !hostValidation) {
|
||||
//restart the host
|
||||
logger.debug(String.format("The host [%s] will be restarted because the health check failed for the storage pool [%s]", hostPrivateIp, primaryStoragePool.getPool().getType()));
|
||||
Script cmd = new Script(primaryStoragePool.getPool().getHearthBeatPath(), HeartBeatUpdateTimeout, logger);
|
||||
Script cmd = new Script(primaryStoragePool.getPool().getHearthBeatPath(), HeartBeatUpdateTimeoutInMs, logger);
|
||||
cmd.add("-c");
|
||||
cmd.execute();
|
||||
return "Down";
|
||||
|
|
@ -240,7 +240,7 @@ public class StorPoolStoragePool implements KVMStoragePool {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) {
|
||||
public Boolean hasHeartBeat(HAStoragePool pool, HostTO host) {
|
||||
boolean isNodeWorking = false;
|
||||
OutputInterpreter.AllLinesParser parser = new OutputInterpreter.AllLinesParser();
|
||||
|
||||
|
|
@ -300,8 +300,8 @@ public class StorPoolStoragePool implements KVMStoragePool {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUuidListString, String vmActivityCheckPath, long duration) {
|
||||
return checkingHeartBeat(pool, host);
|
||||
public Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUuidListString, String vmActivityCheckPath, long duration) {
|
||||
return hasHeartBeat(pool, host);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -75,7 +75,7 @@ fi
|
|||
#delete VMs on this mountpoint
|
||||
deleteVMs() {
|
||||
local mountPoint=$1
|
||||
vmPids=$(ps aux| grep qemu | grep "$mountPoint" | awk '{print $2}' 2> /dev/null)
|
||||
vmPids=$(ps aux | grep qemu | grep "$mountPoint" | awk '{print $2}' 2> /dev/null)
|
||||
if [ $? -gt 0 ]
|
||||
then
|
||||
return
|
||||
|
|
@ -93,7 +93,7 @@ deleteVMs() {
|
|||
}
|
||||
|
||||
#checking is there the same nfs server mounted under $MountPoint?
|
||||
mounts=$(cat /proc/mounts |grep nfs|grep $MountPoint)
|
||||
mounts=$(cat /proc/mounts | grep nfs | grep $MountPoint)
|
||||
if [ $? -gt 0 ]
|
||||
then
|
||||
# remount it
|
||||
|
|
|
|||
|
|
@ -38,7 +38,7 @@ public class CheckOnAgentInvestigator extends AdapterBase implements Investigato
|
|||
}
|
||||
|
||||
@Override
|
||||
public Status isAgentAlive(Host agent) {
|
||||
public Status getHostAgentStatus(Host agent) {
|
||||
return null;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -42,6 +42,9 @@ import org.apache.cloudstack.engine.subsystem.api.storage.PrimaryDataStoreDriver
|
|||
import org.apache.cloudstack.framework.config.ConfigKey;
|
||||
import org.apache.cloudstack.framework.config.Configurable;
|
||||
import org.apache.cloudstack.framework.config.dao.ConfigurationDao;
|
||||
import org.apache.cloudstack.ha.HAConfig;
|
||||
import org.apache.cloudstack.ha.HAResource;
|
||||
import org.apache.cloudstack.ha.dao.HAConfigDao;
|
||||
import org.apache.cloudstack.managed.context.ManagedContext;
|
||||
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
|
||||
import org.apache.cloudstack.management.ManagementServerHost;
|
||||
|
|
@ -223,6 +226,8 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
@Inject
|
||||
ConfigurationDao _configDao;
|
||||
@Inject
|
||||
HAConfigDao _haConfigDao;
|
||||
@Inject
|
||||
VolumeOrchestrationService volumeMgr;
|
||||
|
||||
String _instance;
|
||||
|
|
@ -237,25 +242,53 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
long _timeBetweenCleanups;
|
||||
String _haTag = null;
|
||||
|
||||
protected HighAvailabilityManagerImpl() {
|
||||
}
|
||||
|
||||
private boolean vmHasPendingHAJob(final List<HaWorkVO> pendingHaWorks, final VMInstanceVO vm) {
Optional<HaWorkVO> item = pendingHaWorks.stream()
.filter(h -> h.getInstanceId() == vm.getId())
.reduce((first, second) -> second);
if (item.isPresent() && (item.get().getTimesTried() < _maxRetries ||
!item.get().canScheduleNew(_timeBetweenFailures))) {
logger.debug(String.format("Skipping HA on %s as there is already a running HA job for it", vm));
logger.debug("Skipping HA on {} as there is already a running HA job for it", vm);
return true;
}
return false;
}

protected HighAvailabilityManagerImpl() {
private boolean isHostHAInspectionInProgress(long hostId) {
final HAConfig haConfig = _haConfigDao.findHAResource(hostId, HAResource.ResourceType.Host);
if (haConfig == null || !haConfig.isEnabled()) {
return false;
}

HAConfig.HAState state = haConfig.getState();
logger.debug("Checking Host HA inspection is in progress or not for the host {} from HAConfig, HA state is {}", hostId, state);
if (state == HAConfig.HAState.Suspect || state == HAConfig.HAState.Checking) {
return true;
}

if (state == HAConfig.HAState.Recovered || state == HAConfig.HAState.Available) {
// If the host HA state is Recovered, it indicates that the host has restarted successfully.
// If the host HA state is Available, it means the host has restarted successfully and the recovery waiting period has completed.
// In both states, the agent can connect as soon as the host is ready (and can move to Suspect -> Checking HA state if the agent connection fails again before Fencing).
final HostVO host = _hostDao.findById(hostId);
if (host != null && host.getStatus() != Status.Up) {
logger.debug("{} is in {} status and HA state is {}, considering Host HA inspection is still in progress" +
" until we are sure the host is ready after a recovery wait period and agent is connected/Up", host, host.getStatus(), state);
return true;
}
}

return false;
}
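
The helper above treats Suspect and Checking as inspections in flight, and keeps treating Recovered and Available hosts the same way until the agent is back Up. A compact, hypothetical sketch of that decision (the enums below stand in for HAConfig.HAState and com.cloud.host.Status and are not the real classes):

public class HaInspectionGateExample {
    enum HAState { Available, Suspect, Checking, Recovered, Degraded, Recovering, Fencing, Fenced }
    enum HostStatus { Up, Down, Disconnected, Unknown }

    // Mirrors isHostHAInspectionInProgress above, with the DAO lookups abstracted away.
    static boolean inspectionInProgress(boolean haEnabled, HAState state, HostStatus agentStatus) {
        if (!haEnabled || state == null) {
            return false;
        }
        if (state == HAState.Suspect || state == HAState.Checking) {
            return true;
        }
        // Recovered/Available hosts still count as "in progress" until the agent is connected/Up.
        return (state == HAState.Recovered || state == HAState.Available) && agentStatus != HostStatus.Up;
    }

    public static void main(String[] args) {
        System.out.println(inspectionInProgress(true, HAState.Checking, HostStatus.Down));  // true
        System.out.println(inspectionInProgress(true, HAState.Recovered, HostStatus.Down)); // true
        System.out.println(inspectionInProgress(true, HAState.Recovered, HostStatus.Up));   // false
    }
}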
|
||||
|
||||
@Override
|
||||
public Status investigate(final long hostId) {
|
||||
final HostVO host = _hostDao.findById(hostId);
|
||||
if (host == null) {
|
||||
logger.warn("Host with id {} is removed or doesn't exists.", hostId);
|
||||
return Status.Alert;
|
||||
}
|
||||
|
||||
|
|
@ -270,7 +303,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
|
||||
Status hostState = null;
|
||||
for (Investigator investigator : investigators) {
|
||||
hostState = investigator.isAgentAlive(host);
|
||||
hostState = investigator.getHostAgentStatus(host);
|
||||
if (hostState != null) {
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug("{} was able to determine host {} is in {}", investigator.getName(), host, hostState.toString());
|
||||
|
|
@ -278,7 +311,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
return hostState;
|
||||
}
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug(investigator.getName() + " unable to determine the state of the host. Moving on.");
|
||||
logger.debug("{} unable to determine the state of the host. Moving on.", investigator.getName());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -570,9 +603,9 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
}
|
||||
|
||||
protected Long restart(final HaWorkVO work) {
|
||||
logger.debug("RESTART with HAWORK");
|
||||
logger.debug("RESTART with HA WORK");
|
||||
List<HaWorkVO> items = _haDao.listFutureHaWorkForVm(work.getInstanceId(), work.getId());
|
||||
if (items.size() > 0) {
|
||||
if (!items.isEmpty()) {
|
||||
StringBuilder str = new StringBuilder("Cancelling this work item because newer ones have been scheduled. Work Ids = [");
|
||||
for (HaWorkVO item : items) {
|
||||
str.append(item.getId()).append(", ");
|
||||
|
|
@ -583,7 +616,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
}
|
||||
|
||||
items = _haDao.listRunningHaWorkForVm(work.getInstanceId());
|
||||
if (items.size() > 0) {
|
||||
if (!items.isEmpty()) {
|
||||
StringBuilder str = new StringBuilder("Waiting because there's HA work being executed on an item currently. Work Ids =[");
|
||||
for (HaWorkVO item : items) {
|
||||
str.append(item.getId()).append(", ");
|
||||
|
|
@ -597,21 +630,21 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
|
||||
VirtualMachine vm = _itMgr.findById(work.getInstanceId());
|
||||
if (vm == null) {
|
||||
logger.info("Unable to find vm: " + vmId);
|
||||
logger.info("Unable to find vm: {}", vmId);
|
||||
return null;
|
||||
}
|
||||
if (checkAndCancelWorkIfNeeded(work)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
logger.info("HA on " + vm);
|
||||
logger.info("HA on {}", vm);
|
||||
if (vm.getState() != work.getPreviousState() || vm.getUpdated() != work.getUpdateTime()) {
|
||||
logger.info("VM " + vm + " has been changed. Current State = " + vm.getState() + " Previous State = " + work.getPreviousState() + " last updated = " +
|
||||
vm.getUpdated() + " previous updated = " + work.getUpdateTime());
|
||||
logger.info("VM {} has been changed. Current State = {} Previous State = {} last updated = {} previous updated = {}",
|
||||
vm, vm.getState(), work.getPreviousState(), vm.getUpdated(), work.getUpdateTime());
|
||||
return null;
|
||||
}
|
||||
if (vm.getHostId() != null && !vm.getHostId().equals(work.getHostId())) {
|
||||
logger.info("VM " + vm + " has been changed. Current host id = " + vm.getHostId() + " Previous host id = " + work.getHostId());
|
||||
logger.info("VM {} has been changed. Current host id = {} Previous host id = {}", vm, vm.getHostId(), work.getHostId());
|
||||
return null;
|
||||
}
|
||||
|
||||
|
|
@ -628,10 +661,13 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
boolean isHostRemoved = false;
|
||||
if (host == null) {
|
||||
host = _hostDao.findByIdIncludingRemoved(work.getHostId());
|
||||
if (host != null) {
|
||||
logger.debug("VM {} is now no longer on host {} as the host is removed", vm, host);
|
||||
isHostRemoved = true;
|
||||
if (host == null) {
|
||||
logger.debug("VM {} is now no longer on host {}, the host doesn't exist", vm, work.getHostId());
|
||||
return null;
|
||||
}
|
||||
|
||||
logger.debug("VM {} is now no longer on host {} as the host is removed", vm, host);
|
||||
isHostRemoved = true;
|
||||
}
|
||||
|
||||
DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId());
|
||||
|
|
@ -652,40 +688,39 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
try
|
||||
{
|
||||
alive = investigator.isVmAlive(vm, host);
|
||||
logger.info(investigator.getName() + " found " + vm + " to be alive? " + alive);
|
||||
logger.info("{} found {} to be alive? {}", investigator.getName(), vm, alive);
|
||||
break;
|
||||
} catch (UnknownVM e) {
|
||||
logger.info(investigator.getName() + " could not find " + vm);
|
||||
logger.info("{} could not find {}", investigator.getName(), vm);
|
||||
}
|
||||
}
|
||||
|
||||
boolean fenced = false;
|
||||
if (alive == null) {
|
||||
logger.debug("Fencing off VM that we don't know the state of");
|
||||
logger.debug("Fencing off VM {} that we don't know the state of", vm);
|
||||
for (FenceBuilder fb : fenceBuilders) {
|
||||
Boolean result = fb.fenceOff(vm, host);
|
||||
logger.info("Fencer " + fb.getName() + " returned " + result);
|
||||
logger.info("Fencer {} returned {}", fb.getName(), result);
|
||||
if (result != null && result) {
|
||||
fenced = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
} else if (!alive) {
|
||||
fenced = true;
|
||||
} else {
|
||||
logger.debug("VM {} is found to be alive by {}", vm, investigator.getName());
|
||||
logger.debug("VM {} is found to be alive by {} on host {}", vm, investigator.getName(), host);
|
||||
if (host.getStatus() == Status.Up) {
|
||||
logger.info(vm + " is alive and host is up. No need to restart it.");
|
||||
logger.info("{} is alive and host {} is up. No need to restart it.", vm, host);
|
||||
return null;
|
||||
} else {
|
||||
logger.debug("Rescheduling because the host is not up but the vm is alive");
|
||||
logger.debug("Rescheduling because the host {} is not up but the vm {} is alive", host, vm);
|
||||
return (System.currentTimeMillis() >> 10) + _investigateRetryInterval;
|
||||
}
|
||||
}
|
||||
|
||||
if (!fenced) {
|
||||
logger.debug("We were unable to fence off the VM " + vm);
|
||||
logger.debug("We were unable to fence off the VM {}", vm);
|
||||
_alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() +
|
||||
" which was running on host " + hostDesc, "Insufficient capacity to restart VM, name: " + vm.getHostName() + ", id: " + vmId +
|
||||
" which was running on host " + hostDesc);
|
||||
|
|
@ -728,15 +763,15 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
|
||||
if (!ForceHA.value() && !vm.isHaEnabled()) {
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug("VM is not HA enabled so we're done.");
|
||||
logger.debug("VM {} is not HA enabled so we're done.", vm);
|
||||
}
|
||||
return null; // VM doesn't require HA
|
||||
}
|
||||
|
||||
if ((host == null || host.getRemoved() != null || host.getState() != Status.Up)
|
||||
if ((host.getRemoved() != null || host.getState() != Status.Up)
|
||||
&& !volumeMgr.canVmRestartOnAnotherServer(vm.getId())) {
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug("VM can not restart on another server.");
|
||||
logger.debug("VM {} can not restart on another server.", vm);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
|
@ -777,13 +812,13 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
if (started != null && started.getState() == VirtualMachine.State.Running) {
|
||||
String message = String.format("HA starting VM: %s (%s)", started.getHostName(), started.getInstanceName());
|
||||
HostVO hostVmHasStarted = _hostDao.findById(started.getHostId());
|
||||
logger.info(String.format("HA is now restarting %s on %s", started, hostVmHasStarted));
|
||||
logger.info("HA is now restarting {} on {}", started, hostVmHasStarted);
|
||||
_alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodIdToDeployIn(), message, message);
|
||||
return null;
|
||||
}
|
||||
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug("Rescheduling VM " + vm.toString() + " to try again in " + _restartRetryInterval);
|
||||
logger.debug("Rescheduling VM {} to try again in {}", vm.toString(), _restartRetryInterval);
|
||||
}
|
||||
} catch (final InsufficientCapacityException e) {
|
||||
logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage());
|
||||
|
|
@ -815,6 +850,9 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
if (!CancellableWorkReasonTypes.contains(work.getReasonType())) {
|
||||
return false;
|
||||
}
|
||||
if (isHostHAInspectionInProgress(work.getHostId())) {
|
||||
return false;
|
||||
}
|
||||
Status hostStatus = investigate(work.getHostId());
|
||||
if (!Status.Up.equals(hostStatus)) {
|
||||
return false;
|
||||
|
|
@ -825,13 +863,14 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
}
|
||||
|
||||
public Long migrate(final HaWorkVO work) {
|
||||
logger.debug("MIGRATE with HA WORK");
|
||||
long vmId = work.getInstanceId();
|
||||
long srcHostId = work.getHostId();
|
||||
HostVO srcHost = _hostDao.findById(srcHostId);
|
||||
|
||||
VMInstanceVO vm = _instanceDao.findById(vmId);
|
||||
if (vm == null) {
|
||||
logger.info("Unable to find vm: " + vmId + ", skipping migrate.");
|
||||
logger.info("Unable to find vm: {}, skipping migrate.", vmId);
|
||||
return null;
|
||||
}
|
||||
if (checkAndCancelWorkIfNeeded(work)) {
|
||||
|
|
@ -840,11 +879,11 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
logger.info("Migration attempt: for {} from {}. Starting attempt: {}/{} times.", vm, srcHost, 1 + work.getTimesTried(), _maxRetries);
|
||||
|
||||
if (VirtualMachine.State.Stopped.equals(vm.getState())) {
|
||||
logger.info(String.format("vm %s is Stopped, skipping migrate.", vm));
|
||||
logger.info("vm {} is Stopped, skipping migrate.", vm);
|
||||
return null;
|
||||
}
|
||||
if (VirtualMachine.State.Running.equals(vm.getState()) && srcHostId != vm.getHostId()) {
|
||||
logger.info(String.format("VM %s is running on a different host %s, skipping migration", vm, vm.getHostId()));
|
||||
logger.info("VM {} is running on a different host {}, skipping migration", vm, vm.getHostId());
|
||||
return null;
|
||||
}
|
||||
|
||||
|
|
@ -879,7 +918,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Destroy, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated(), reasonType);
|
||||
_haDao.persist(work);
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug("Scheduled " + work.toString());
|
||||
logger.debug("{}}", work.toString());
|
||||
}
|
||||
wakeupWorkers();
|
||||
return true;
|
||||
|
|
@ -897,7 +936,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
}
|
||||
|
||||
private void destroyVM(VirtualMachine vm, boolean expunge) throws OperationTimedoutException, AgentUnavailableException {
|
||||
logger.info("Destroying " + vm.toString());
|
||||
logger.info("Destroying {}", vm.toString());
|
||||
if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) {
|
||||
consoleProxyManager.destroyProxy(vm.getId());
|
||||
} else if (VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())) {
|
||||
|
|
@ -908,9 +947,10 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
}
|
||||
|
||||
protected Long destroyVM(final HaWorkVO work) {
|
||||
logger.debug("DESTROY with HA WORK");
|
||||
final VirtualMachine vm = _itMgr.findById(work.getInstanceId());
|
||||
if (vm == null) {
|
||||
logger.info("No longer can find VM " + work.getInstanceId() + ". Throwing away " + work);
|
||||
logger.info("No longer can find VM {}. Throwing away {}", work.getInstanceId(), work);
|
||||
return null;
|
||||
}
|
||||
if (checkAndCancelWorkIfNeeded(work)) {
|
||||
|
|
@ -944,20 +984,21 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
}
|
||||
|
||||
protected Long stopVM(final HaWorkVO work) throws ConcurrentOperationException {
|
||||
logger.debug("STOP with HA WORK");
|
||||
VirtualMachine vm = _itMgr.findById(work.getInstanceId());
|
||||
if (vm == null) {
|
||||
logger.info("No longer can find VM " + work.getInstanceId() + ". Throwing away " + work);
|
||||
logger.info("No longer can find VM {}. Throwing away {}", work.getInstanceId(), work);
|
||||
work.setStep(Step.Done);
|
||||
return null;
|
||||
}
|
||||
if (checkAndCancelWorkIfNeeded(work)) {
|
||||
return null;
|
||||
}
|
||||
logger.info("Stopping " + vm);
|
||||
logger.info("Stopping {}", vm);
|
||||
try {
|
||||
if (work.getWorkType() == WorkType.Stop) {
|
||||
_itMgr.advanceStop(vm.getUuid(), false);
|
||||
logger.info("Successfully stopped " + vm);
|
||||
logger.info("Successfully stopped {}", vm);
|
||||
return null;
|
||||
} else if (work.getWorkType() == WorkType.CheckStop) {
|
||||
if ((vm.getState() != work.getPreviousState()) || vm.getUpdated() != work.getUpdateTime() || vm.getHostId() == null ||
|
||||
|
|
@ -969,7 +1010,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
}
|
||||
|
||||
_itMgr.advanceStop(vm.getUuid(), false);
|
||||
logger.info("Stop for " + vm + " was successful");
|
||||
logger.info("Stop for {} was successful", vm);
|
||||
return null;
|
||||
} else if (work.getWorkType() == WorkType.ForceStop) {
|
||||
if ((vm.getState() != work.getPreviousState()) || vm.getUpdated() != work.getUpdateTime() || vm.getHostId() == null ||
|
||||
|
|
@ -981,13 +1022,13 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
}
|
||||
|
||||
_itMgr.advanceStop(vm.getUuid(), true);
|
||||
logger.info("Stop for " + vm + " was successful");
|
||||
logger.info("Stop for {} was successful", vm);
|
||||
return null;
|
||||
} else {
|
||||
assert false : "Who decided there's other steps but didn't modify the guy who does the work?";
|
||||
}
|
||||
} catch (final ResourceUnavailableException e) {
|
||||
logger.debug("Agnet is not available" + e.getMessage());
|
||||
logger.debug("Agent is not available" + e.getMessage());
|
||||
} catch (OperationTimedoutException e) {
|
||||
logger.debug("operation timed out: " + e.getMessage());
|
||||
}
|
||||
|
|
@ -1043,7 +1084,8 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
try {
|
||||
if (vm != null && !VmHaEnabled.valueIn(vm.getDataCenterId())) {
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug(String.format("VM high availability manager is disabled, rescheduling the HA work %s, for the VM %s (id) to retry later in case VM high availability manager is enabled on retry attempt", work, vm.getName(), vm.getId()));
|
||||
logger.debug("VM high availability manager is disabled, rescheduling the HA work {} for the VM {} ({}) " +
|
||||
"to retry later in case VM high availability manager is enabled on retry attempt", work, vm.getName(), vm.getId());
|
||||
}
|
||||
long nextTime = getRescheduleTime(wt);
|
||||
rescheduleWork(work, nextTime);
|
||||
|
|
@ -1065,13 +1107,13 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
}
|
||||
|
||||
if (nextTime == null) {
|
||||
logger.info("Completed work " + work + ". Took " + (work.getTimesTried() + 1) + "/" + _maxRetries + " attempts.");
|
||||
logger.info("Completed work {}. Took {}/{} attempts.", work, work.getTimesTried() + 1, _maxRetries);
|
||||
work.setStep(Step.Done);
|
||||
} else {
|
||||
rescheduleWork(work, nextTime.longValue());
|
||||
}
|
||||
} catch (Exception e) {
|
||||
logger.warn("Encountered unhandled exception during HA process, reschedule work", e);
|
||||
logger.warn("Encountered unhandled exception during HA process, reschedule work {}", work, e);
|
||||
|
||||
long nextTime = getRescheduleTime(wt);
|
||||
rescheduleWork(work, nextTime);
|
||||
|
|
@ -1085,11 +1127,11 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
} finally {
|
||||
if (!Step.Done.equals(work.getStep())) {
|
||||
if (work.getTimesTried() >= _maxRetries) {
|
||||
logger.warn("Giving up, retried max " + work.getTimesTried() + "/" + _maxRetries + " times for work: " + work);
|
||||
logger.warn("Giving up, retried max {}/{} times for work: {}", work.getTimesTried(), _maxRetries, work);
|
||||
work.setStep(Step.Done);
|
||||
} else {
|
||||
logger.warn("Rescheduling work " + work + " to try again at " + new Date(work.getTimeToTry() << 10) +
|
||||
". Finished attempt " + work.getTimesTried() + "/" + _maxRetries + " times.");
|
||||
logger.warn("Rescheduling work {} to try again at {}. Finished attempt {}/{} times.",
|
||||
work, new Date(work.getTimeToTry() << 10), work.getTimesTried(), _maxRetries);
|
||||
}
|
||||
}
|
||||
_haDao.update(work.getId(), work);
|
||||
|
|
|
|||
|
|
@ -74,7 +74,7 @@ public class KVMFencer extends AdapterBase implements FenceBuilder {
|
|||
@Override
|
||||
public Boolean fenceOff(VirtualMachine vm, Host host) {
|
||||
if (host.getHypervisorType() != HypervisorType.KVM && host.getHypervisorType() != HypervisorType.LXC) {
|
||||
logger.warn("Don't know how to fence non kvm hosts " + host.getHypervisorType());
|
||||
logger.warn("Don't know how to fence non kvm hosts {}", host.getHypervisorType());
|
||||
return null;
|
||||
}
|
||||
|
||||
|
|
@ -97,11 +97,8 @@ public class KVMFencer extends AdapterBase implements FenceBuilder {
|
|||
FenceAnswer answer;
|
||||
try {
|
||||
answer = (FenceAnswer)_agentMgr.send(h.getId(), fence);
|
||||
} catch (AgentUnavailableException e) {
|
||||
logger.info("Moving on to the next host because " + h.toString() + " is unavailable", e);
|
||||
continue;
|
||||
} catch (OperationTimedoutException e) {
|
||||
logger.info("Moving on to the next host because " + h.toString() + " is unavailable", e);
|
||||
} catch (AgentUnavailableException | OperationTimedoutException e) {
|
||||
logger.info("Moving on to the next host because {} is unavailable", h.toString(), e);
|
||||
continue;
|
||||
}
|
||||
if (answer != null && answer.getResult()) {
|
||||
|
|
@ -115,7 +112,7 @@ public class KVMFencer extends AdapterBase implements FenceBuilder {
|
|||
"Fencing off host " + host.getId() + " did not succeed after asking " + i + " hosts. " +
|
||||
"Check Agent logs for more information.");
|
||||
|
||||
logger.error("Unable to fence off " + vm.toString() + " on " + host.toString());
|
||||
logger.error("Unable to fence off {} on {}", vm.toString(), host.toString());
|
||||
|
||||
return false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -104,7 +104,7 @@ public class ManagementIPSystemVMInvestigator extends AbstractInvestigatorImpl {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Status isAgentAlive(Host agent) {
|
||||
public Status getHostAgentStatus(Host agent) {
|
||||
return null;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -103,7 +103,7 @@ public class UserVmDomRInvestigator extends AbstractInvestigatorImpl {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Status isAgentAlive(Host agent) {
|
||||
public Status getHostAgentStatus(Host agent) {
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug("checking if agent ({}) is alive", agent);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -46,7 +46,7 @@ public class XenServerInvestigator extends AdapterBase implements Investigator {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Status isAgentAlive(Host agent) {
|
||||
public Status getHostAgentStatus(Host agent) {
|
||||
if (agent.getHypervisorType() != HypervisorType.XenServer) {
|
||||
return null;
|
||||
}
|
||||
|
|
@ -74,7 +74,7 @@ public class XenServerInvestigator extends AdapterBase implements Investigator {
|
|||
|
||||
@Override
|
||||
public boolean isVmAlive(VirtualMachine vm, Host host) throws UnknownVM {
|
||||
Status status = isAgentAlive(host);
|
||||
Status status = getHostAgentStatus(host);
|
||||
if (status == null) {
|
||||
throw new UnknownVM();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3036,7 +3036,7 @@ public class ManagementServerImpl extends MutualExclusiveIdsManagerBase implemen
|
|||
final String hypervisor = cmd.getHypervisor();
|
||||
final String hypervisorVersion = cmd.getHypervisorVersion();
|
||||
|
||||
//throw exception if hypervisor name is not passed, but version is
|
||||
//throw exception if hypervisor name is not passed, but a version is
|
||||
if (hypervisorVersion != null && (hypervisor == null || hypervisor.isEmpty())) {
|
||||
throw new InvalidParameterValueException("Hypervisor version parameter cannot be used without specifying a hypervisor : XenServer, KVM or VMware");
|
||||
}
|
||||
|
|
@ -3054,7 +3054,7 @@ public class ManagementServerImpl extends MutualExclusiveIdsManagerBase implemen
|
|||
final SearchCriteria<GuestOSHypervisorVO> sc = sb.create();
|
||||
|
||||
if (id != null) {
|
||||
sc.setParameters("id", SearchCriteria.Op.EQ, id);
|
||||
sc.setParameters("id", id);
|
||||
}
|
||||
|
||||
if (osTypeId != null) {
|
||||
|
|
|
|||
|
|
@ -765,6 +765,13 @@ public class VMSnapshotManagerImpl extends MutualExclusiveIdsManagerBase impleme
|
|||
"In order to revert to a Snapshot without memory you need to first stop the Instance.");
|
||||
}
|
||||
|
||||
if (userVm.getState() == VirtualMachine.State.Running && vmSnapshotVo.getType() == VMSnapshot.Type.Disk) {
|
||||
throw new InvalidParameterValueException(
|
||||
"Reverting to the Instance Snapshot is not allowed for running Instances as this would result in an Instance state change. " +
|
||||
"For running Instances only Snapshots with memory can be reverted. " +
|
||||
"In order to revert to a Snapshot without memory you need to first stop the Instance.");
|
||||
}
|
||||
|
||||
if (userVm.getState() == VirtualMachine.State.Stopped && vmSnapshotVo.getType() == VMSnapshot.Type.DiskAndMemory) {
|
||||
throw new InvalidParameterValueException(
|
||||
"Reverting to the Instance Snapshot is not allowed for stopped Instances when the Snapshot contains memory as this would result in an Instance state change. " +
|
||||
|
|
|
|||
|
|
@ -67,11 +67,16 @@ public interface HAManager extends HAConfigManager {
|
|||
"The number of pending fence operations per management server. This setting determines the size of the size of the FENCE queue.", true);
|
||||
|
||||
boolean transitionHAState(final HAConfig.Event event, final HAConfig haConfig);
|
||||
|
||||
HAProvider getHAProvider(final String name);
|
||||
|
||||
HAResourceCounter getHACounter(final Long resourceId, final HAResource.ResourceType resourceType);
|
||||
|
||||
void purgeHACounter(final Long resourceId, final HAResource.ResourceType resourceType);
|
||||
|
||||
boolean isHAEligible(final HAResource resource);
|
||||
|
||||
Boolean isVMAliveOnHost(final Host host) throws Investigator.UnknownVM;
|
||||
Status getHostStatus(final Host host);
|
||||
|
||||
Status getHostStatusFromHAConfig(final Host host);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -139,9 +139,7 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
|
|||
|
||||
public synchronized void purgeHACounter(final Long resourceId, final HAResource.ResourceType resourceType) {
|
||||
final String key = resourceCounterKey(resourceId, resourceType);
|
||||
if (haCounterMap.containsKey(key)) {
|
||||
haCounterMap.remove(key);
|
||||
}
|
||||
haCounterMap.remove(key);
|
||||
}
|
||||
|
||||
public boolean transitionHAState(final HAConfig.Event event, final HAConfig haConfig) {
|
||||
|
|
@ -248,6 +246,7 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
|
|||
}
|
||||
|
||||
private boolean isHAEnabledForCluster(final HAResource resource) {
|
||||
// HA is enabled by default when cluster details don't exist
|
||||
if (resource == null || resource.getClusterId() == null) {
|
||||
return true;
|
||||
}
|
||||
|
|
@ -259,14 +258,10 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
|
|||
if (resource == null || resource.getId() < 1L) {
|
||||
return false;
|
||||
}
|
||||
HAResource.ResourceType resourceType = null;
|
||||
if (resource instanceof Host) {
|
||||
resourceType = HAResource.ResourceType.Host;
|
||||
}
|
||||
if (resourceType == null) {
|
||||
if (!(resource instanceof Host)) {
|
||||
return false;
|
||||
}
|
||||
final HAConfig haConfig = haConfigDao.findHAResource(resource.getId(), resourceType);
|
||||
final HAConfig haConfig = haConfigDao.findHAResource(resource.getId(), HAResource.ResourceType.Host);
|
||||
return haConfig != null && haConfig.isEnabled()
|
||||
&& haConfig.getState() != HAConfig.HAState.Disabled
|
||||
&& haConfig.getState() != HAConfig.HAState.Ineligible;
|
||||
|
|
@ -317,19 +312,23 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
|
|||
throw new Investigator.UnknownVM();
|
||||
}
|
||||
|
||||
public Status getHostStatus(final Host host) {
public Status getHostStatusFromHAConfig(final Host host) {
final HAConfig haConfig = haConfigDao.findHAResource(host.getId(), HAResource.ResourceType.Host);
if (haConfig != null) {
if (haConfig.getState() == HAConfig.HAState.Fenced) {
logger.debug("HA: Agent [{}] is available/suspect/checking Up.", host);
return Status.Down;
} else if (haConfig.getState() == HAConfig.HAState.Degraded || haConfig.getState() == HAConfig.HAState.Recovering || haConfig.getState() == HAConfig.HAState.Fencing) {
logger.debug("HA: Agent [{}] is disconnected. State: {}, {}.", host, haConfig.getState(), haConfig.getState().getDescription());
return Status.Disconnected;
}
return Status.Up;
if (haConfig == null) {
logger.warn("HA: Agent [{}] config is not available.", host);
return Status.Unknown;
}
return Status.Unknown;
if (haConfig.getState() == HAConfig.HAState.Fenced) {
logger.debug("HA: Agent [{}] is fenced.", host);
return Status.Down;
}
if (haConfig.getState() == HAConfig.HAState.Degraded || haConfig.getState() == HAConfig.HAState.Recovering || haConfig.getState() == HAConfig.HAState.Fencing) {
logger.debug("HA: Agent [{}] is disconnected. State: {}, {}.", host, haConfig.getState(), haConfig.getState().getDescription());
return Status.Disconnected;
}

logger.debug("HA: Agent [{}] is considered Up (HA state can be Available/Suspect/Checking/Recovered). State: {}, {}.", host, haConfig.getState(), haConfig.getState().getDescription());
return Status.Up;
}
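
To keep the mapping above easy to scan, here is an illustrative standalone example of the HA state to host status translation it implements; the nested enums are hypothetical stand-ins for HAConfig.HAState and com.cloud.host.Status, and the printed expectations are examples rather than tests shipped with this patch.

import java.util.Map;

public class HaStateToHostStatusExample {
    enum HAState { Available, Suspect, Checking, Recovered, Degraded, Recovering, Fencing, Fenced }
    enum Status { Up, Down, Disconnected, Unknown }

    // Mirrors getHostStatusFromHAConfig above; a null state models a missing HA config row.
    static Status map(HAState state) {
        if (state == null) {
            return Status.Unknown;
        }
        switch (state) {
            case Fenced:
                return Status.Down;
            case Degraded:
            case Recovering:
            case Fencing:
                return Status.Disconnected;
            default:
                return Status.Up; // Available, Suspect, Checking, Recovered
        }
    }

    public static void main(String[] args) {
        Map<HAState, Status> expected = Map.of(
                HAState.Fenced, Status.Down,
                HAState.Recovering, Status.Disconnected,
                HAState.Available, Status.Up);
        expected.forEach((state, status) -> System.out.println(state + " -> " + map(state) + " (expected " + status + ")"));
        System.out.println("missing config -> " + map(null));
    }
}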
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
|
|
@ -511,9 +510,14 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
|
|||
|
||||
// Attempt recovery
|
||||
if (newState == HAConfig.HAState.Recovering) {
|
||||
if (counter.getRecoveryCounter() >= (Long) (haProvider.getConfigValue(HAProviderConfig.MaxRecoveryAttempts, resource))) {
|
||||
long recoveryCounter = counter.getRecoveryCounter();
|
||||
Long maxRecoveryAttempts = (Long) (haProvider.getConfigValue(HAProviderConfig.MaxRecoveryAttempts, resource));
|
||||
if (recoveryCounter >= maxRecoveryAttempts) {
|
||||
logger.debug("Recovery attempts have reached the configured limit: {} for the resource [{}].", maxRecoveryAttempts, resource);
|
||||
return false;
|
||||
}
|
||||
|
||||
logger.debug("Recovery attempt #{} for the resource [{}]. Max recovery attempts configured is {}.", recoveryCounter + 1, resource, maxRecoveryAttempts);
|
||||
final RecoveryTask task = ComponentContext.inject(new RecoveryTask(resource, haProvider, haConfig,
|
||||
HAProviderConfig.RecoveryTimeout, recoveryExecutor));
|
||||
final Future<Boolean> recoveryFuture = recoveryExecutor.submit(task);
|
||||
|
|
@ -536,20 +540,20 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
|
|||
return false;
|
||||
}
|
||||
|
||||
logger.debug(String.format("HA state pre-transition:: new state=[%s], old state=[%s], for resource id=[%s], status=[%s], ha config state=[%s]." , newState, oldState, haConfig.getResourceId(), status, haConfig.getState()));
|
||||
logger.debug("HA state pre-transition:: new state=[{}], old state=[{}], for resource id=[{}], status=[{}], ha config state=[{}].", newState, oldState, haConfig.getResourceId(), status, haConfig.getState());
|
||||
|
||||
if (status && haConfig.getState() != newState) {
|
||||
logger.warn(String.format("HA state pre-transition:: HA state is not equal to transition state, HA state=[%s], new state=[%s].", haConfig.getState(), newState));
|
||||
logger.warn("HA state pre-transition:: HA state is not equal to transition state, HA state=[{}], new state=[{}].", haConfig.getState(), newState);
|
||||
}
|
||||
return processHAStateChange(haConfig, newState, status);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean postStateTransitionEvent(final StateMachine2.Transition<HAConfig.HAState, HAConfig.Event> transition, final HAConfig haConfig, final boolean status, final Object opaque) {
|
||||
logger.debug(String.format("HA state post-transition:: new state=[%s], old state=[%s], for resource id=[%s], status=[%s], ha config state=[%s].", transition.getToState(), transition.getCurrentState(), haConfig.getResourceId(), status, haConfig.getState()));
|
||||
logger.debug("HA state post-transition:: new state=[{}], old state=[{}], for resource id=[{}], status=[{}], ha config state=[{}].", transition.getToState(), transition.getCurrentState(), haConfig.getResourceId(), status, haConfig.getState());
|
||||
|
||||
if (status && haConfig.getState() != transition.getToState()) {
|
||||
logger.warn(String.format("HA state post-transition:: HA state is not equal to transition state, HA state=[%s], new state=[%s].", haConfig.getState(), transition.getToState()));
|
||||
logger.warn("HA state post-transition:: HA state is not equal to transition state, HA state=[{}], new state=[{}].", haConfig.getState(), transition.getToState());
|
||||
}
|
||||
return processHAStateChange(haConfig, transition.getToState(), status);
|
||||
}
|
||||
|
|
@ -645,7 +649,7 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
|
|||
try {
|
||||
logger.debug("HA health check task is running...");
|
||||
|
||||
final List<HAConfig> haConfigList = new ArrayList<HAConfig>(haConfigDao.listAll());
|
||||
final List<HAConfig> haConfigList = new ArrayList<>(haConfigDao.listAll());
|
||||
for (final HAConfig haConfig : haConfigList) {
|
||||
currentHaConfig = haConfig;
|
||||
|
||||
|
|
@ -676,8 +680,8 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
|
|||
HAProviderConfig.HealthCheckTimeout, healthCheckExecutor));
|
||||
healthCheckExecutor.submit(task);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
final HAResourceCounter counter = getHACounter(haConfig.getResourceId(), haConfig.getResourceType());
|
||||
|
|
@ -695,16 +699,22 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
|
|||
}
|
||||
|
||||
if (haConfig.getState() == HAConfig.HAState.Recovering) {
|
||||
if (counter.getRecoveryCounter() >= (Long) (haProvider.getConfigValue(HAProviderConfig.MaxRecoveryAttempts, resource))) {
|
||||
long recoveryCounter = counter.getRecoveryCounter();
|
||||
Long maxRecoveryAttempts = (Long) (haProvider.getConfigValue(HAProviderConfig.MaxRecoveryAttempts, resource));
|
||||
if (recoveryCounter >= maxRecoveryAttempts) {
|
||||
logger.debug("Recovery attempts have reached the max limit: {} for the resource [{}].", maxRecoveryAttempts, resource);
|
||||
transitionHAState(HAConfig.Event.RecoveryOperationThresholdExceeded, haConfig);
|
||||
} else {
|
||||
logger.debug("Retry recovery for the resource [{}]. Max recovery attempts configured is {}.", resource, maxRecoveryAttempts);
|
||||
transitionHAState(HAConfig.Event.RetryRecovery, haConfig);
|
||||
}
|
||||
}
|
||||
|
||||
if (haConfig.getState() == HAConfig.HAState.Recovered) {
|
||||
counter.markRecoveryStarted();
|
||||
if (counter.canExitRecovery((Long)(haProvider.getConfigValue(HAProviderConfig.RecoveryWaitTimeout, resource)))) {
|
||||
Long recoveryWaitTimeout = (Long)(haProvider.getConfigValue(HAProviderConfig.RecoveryWaitTimeout, resource));
|
||||
logger.debug("Recovery started for the resource [{}], wait period configured to become Available is {} secs", resource, recoveryWaitTimeout);
|
||||
if (counter.canExitRecovery(recoveryWaitTimeout)) {
|
||||
if (transitionHAState(HAConfig.Event.RecoveryWaitPeriodTimeout, haConfig)) {
|
||||
counter.markRecoveryCompleted();
|
||||
}
|
||||
|
|
@ -717,7 +727,7 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
|
|||
}
|
||||
} catch (Throwable t) {
|
||||
if (currentHaConfig != null) {
|
||||
logger.error(String.format("Error trying to perform health checks in HA manager [%s].", currentHaConfig.getHaProvider()), t);
|
||||
logger.error("Error trying to perform health checks in HA manager [{}].", currentHaConfig.getHaProvider(), t);
|
||||
} else {
|
||||
logger.error("Error trying to perform health checks in HA manager.", t);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -36,6 +36,10 @@ public final class HAResourceCounter {
|
|||
return activityCheckCounter.get();
|
||||
}
|
||||
|
||||
public long getActivityCheckFailureCounter() {
|
||||
return activityCheckFailureCounter.get();
|
||||
}
|
||||
|
||||
public long getRecoveryCounter() {
|
||||
return recoveryOperationCounter.get();
|
||||
}
|
||||
|
|
@ -66,7 +70,7 @@ public final class HAResourceCounter {
|
|||
firstHealthCheckFailureTimestamp = null;
|
||||
}
|
||||
|
||||
public boolean hasActivityThresholdExceeded(final double failureRatio) {
|
||||
public boolean hasActivityFailureThresholdExceeded(final double failureRatio) {
|
||||
return activityCheckFailureCounter.get() > (activityCheckCounter.get() * failureRatio);
|
||||
}
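
For a concrete sense of the comparison above: with 10 recorded activity checks and a failure ratio of 0.5 (an illustrative value, not necessarily the configured default), the threshold is only exceeded once more than 5 of those checks have failed. A minimal sketch:

public class ActivityThresholdExample {
    // Mirrors hasActivityFailureThresholdExceeded above.
    static boolean thresholdExceeded(long failures, long checks, double failureRatio) {
        return failures > checks * failureRatio;
    }

    public static void main(String[] args) {
        System.out.println(thresholdExceeded(5, 10, 0.5));  // false: 5 is not greater than 5.0
        System.out.println(thresholdExceeded(6, 10, 0.5));  // true:  6 > 5.0
    }
}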
|
||||
|
||||
|
|
|
|||
|
|
@ -62,6 +62,8 @@ public class ActivityCheckTask extends BaseHATask {
|
|||
return;
|
||||
}
|
||||
|
||||
long activityCounter = counter.getActivityCheckCounter();
|
||||
logger.debug("Activity check #{}, result: {} for the resource {}. Max activity checks configured is {}", activityCounter + 1, result, getResource(), maxActivityChecks);
|
||||
counter.incrActivityCounter(!result);
|
||||
|
||||
if (counter.getActivityCheckCounter() < maxActivityChecks) {
|
||||
|
|

@@ -69,7 +71,9 @@ public class ActivityCheckTask extends BaseHATask {

return;
}

if (counter.hasActivityThresholdExceeded(activityCheckFailureRatio)) {
long activityCheckFailureCount = counter.getActivityCheckFailureCounter();
logger.debug("{} activity checks failed out of {} checks performed for the resource {}. Failure threshold configured is {}", activityCheckFailureCount, maxActivityChecks, getResource(), activityCheckFailureRatio);
if (counter.hasActivityFailureThresholdExceeded(activityCheckFailureRatio)) {
haManager.transitionHAState(HAConfig.Event.ActivityCheckFailureOverThresholdRatio, haConfig);
} else {
if (haManager.transitionHAState(HAConfig.Event.ActivityCheckFailureUnderThresholdRatio, haConfig)) {

@@ -97,7 +97,7 @@ public abstract class BaseHATask implements Callable<Boolean> {

result = future.get(timeout, TimeUnit.SECONDS);
}
} catch (InterruptedException | ExecutionException e) {
logger.warn("Exception occurred while running " + getTaskType() + " on a resource: " + e.getMessage(), e.getCause());
logger.warn("Exception occurred while running {} on a resource: {}", getTaskType(), e.getMessage(), e.getCause());
throwable = e.getCause();
} catch (TimeoutException e) {
logger.trace("{} operation timed out for resource: {}", getTaskType(), resource);
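The hunk above follows a common bounded-wait pattern: block on the task's future for a limited time, treat execution errors as failures, and treat a timeout as an inconclusive result. A minimal standalone sketch of that pattern, with all names and the five-second timeout assumed for illustration:

import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

// Hypothetical sketch, not CloudStack code.
public class BoundedWaitSketch {
    public static void main(String[] args) {
        ExecutorService executor = Executors.newSingleThreadExecutor();
        Future<Boolean> future = executor.submit(() -> true); // stand-in for the HA check
        boolean result = false;
        try {
            result = future.get(5, TimeUnit.SECONDS);
        } catch (InterruptedException | ExecutionException e) {
            System.err.println("Check failed: " + e.getMessage()); // reported as a warning
        } catch (TimeoutException e) {
            System.err.println("Check timed out; result stays undetermined"); // reported at trace level above
        } finally {
            executor.shutdownNow();
        }
        System.out.println("result = " + result);
    }
}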

@@ -35,6 +35,7 @@ import org.apache.cloudstack.engine.orchestration.service.VolumeOrchestrationSer

import org.apache.cloudstack.engine.subsystem.api.storage.DataStoreProviderManager;
import org.apache.cloudstack.framework.config.ConfigKey;
import org.apache.cloudstack.framework.config.dao.ConfigurationDao;
import org.apache.cloudstack.ha.dao.HAConfigDao;
import org.apache.cloudstack.managed.context.ManagedContext;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

@@ -118,6 +119,8 @@ public class HighAvailabilityManagerImplTest {

@Mock
ConfigurationDao _configDao;
@Mock
HAConfigDao _haConfigDao;
@Mock
VolumeOrchestrationService volumeMgr;
@Mock
ConsoleProxyManager consoleProxyManager;

@@ -362,7 +365,7 @@ public class HighAvailabilityManagerImplTest {

investigators.add(investigator);
highAvailabilityManager.setInvestigators(investigators);
// Mock isAgentAlive to return host status as Down
Mockito.when(investigator.isAgentAlive(hostVO)).thenReturn(Status.Down);
Mockito.when(investigator.getHostAgentStatus(hostVO)).thenReturn(Status.Down);

ConfigKey<Boolean> haEnabled = Mockito.mock(ConfigKey.class);
highAvailabilityManager.VmHaEnabled = haEnabled;

@@ -169,10 +169,10 @@ import com.cloud.vm.ImportVMTaskVO;

import com.cloud.vm.NicProfile;
import com.cloud.vm.UserVmManager;
import com.cloud.vm.UserVmVO;
import com.cloud.vm.VmDetailConstants;
import com.cloud.vm.VMInstanceDetailVO;
import com.cloud.vm.VMInstanceVO;
import com.cloud.vm.VirtualMachine;
import com.cloud.vm.VmDetailConstants;
import com.cloud.vm.dao.NicDao;
import com.cloud.vm.dao.UserVmDao;
import com.cloud.vm.dao.VMInstanceDao;

@@ -231,21 +231,19 @@ public class RedfishClient {

}

protected HttpResponse retryHttpRequest(String url, HttpRequestBase httpReq, HttpClient client) {
logger.warn(String.format("Failed to execute HTTP %s request [URL: %s]. Executing the request again.", httpReq.getMethod(), url));
logger.warn("Failed to execute HTTP {} request [URL: {}]. Executing the request again.", httpReq.getMethod(), url);
HttpResponse response = null;
for (int attempt = 1; attempt < redfishRequestMaxRetries + 1; attempt++) {
try {
TimeUnit.SECONDS.sleep(WAIT_FOR_REQUEST_RETRY);
logger.debug(String.format("HTTP %s request retry attempt %d/%d [URL: %s].", httpReq.getMethod(), attempt, redfishRequestMaxRetries, url));
logger.debug("HTTP {} request retry attempt {}/{} [URL: {}].", httpReq.getMethod(), attempt, redfishRequestMaxRetries, url);
response = client.execute(httpReq);
break;
} catch (IOException | InterruptedException e) {
if (attempt == redfishRequestMaxRetries) {
throw new RedfishException(String.format("Failed to execute HTTP %s request retry attempt %d/%d [URL: %s] due to exception %s", httpReq.getMethod(), attempt, redfishRequestMaxRetries,url, e));
} else {
logger.warn(
String.format("Failed to execute HTTP %s request retry attempt %d/%d [URL: %s] due to exception %s", httpReq.getMethod(), attempt, redfishRequestMaxRetries,
url, e));
logger.warn("Failed to execute HTTP {} request retry attempt {}/{} [URL: {}] due to exception {}", httpReq.getMethod(), attempt, redfishRequestMaxRetries, url, e);
}
}
}
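The retryHttpRequest hunk above uses a fixed-delay retry loop that only propagates a failure after the final attempt. A minimal standalone sketch of that retry pattern (the helper name, delay, and retry count are assumed for illustration):

import java.util.concurrent.TimeUnit;
import java.util.function.Supplier;

// Hypothetical sketch, not CloudStack code.
public class RetrySketch {
    static <T> T retryWithFixedDelay(Supplier<T> request, int maxRetries, long delaySeconds) {
        for (int attempt = 1; attempt <= maxRetries; attempt++) {
            try {
                TimeUnit.SECONDS.sleep(delaySeconds); // wait before each retry attempt
                return request.get();
            } catch (Exception e) {
                if (attempt == maxRetries) {
                    // last attempt failed: give up and propagate
                    throw new RuntimeException("Request failed after " + attempt + " attempts", e);
                }
                System.err.println("Attempt " + attempt + "/" + maxRetries + " failed: " + e.getMessage());
            }
        }
        return null; // not reached when maxRetries >= 1
    }

    public static void main(String[] args) {
        System.out.println(retryWithFixedDelay(() -> "ok", 3, 1));
    }
}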

@@ -312,7 +310,7 @@ public class RedfishClient {

throw new RedfishException(String.format("Failed to execute System power command for host by performing '%s' request on URL '%s' and host address '%s'. The expected HTTP status code is '%s' but it got '%s'.",
HttpPost.METHOD_NAME, url, hostAddress, EXPECTED_HTTP_STATUS, statusCode));
}
logger.debug(String.format("Sending ComputerSystem.Reset Command '%s' to host '%s' with request '%s %s'", resetCommand, hostAddress, HttpPost.METHOD_NAME, url));
logger.debug("Sending ComputerSystem.Reset Command '{}' to host '{}' with request '{} {}'", resetCommand, hostAddress, HttpPost.METHOD_NAME, url);
}

/**

@@ -330,7 +328,7 @@ public class RedfishClient {

String systemId = processGetSystemIdResponse(response);

logger.debug(String.format("Retrieved System ID '%s' with request '%s: %s'", systemId, HttpGet.METHOD_NAME, url));
logger.debug("Retrieved System ID '{}' with request '{}: {}'", systemId, HttpGet.METHOD_NAME, url);

return systemId;
}

@@ -384,7 +382,7 @@ public class RedfishClient {

}

RedfishPowerState powerState = processGetSystemRequestResponse(response);
logger.debug(String.format("Retrieved System power state '%s' with request '%s: %s'", powerState, HttpGet.METHOD_NAME, url));
logger.debug("Retrieved System power state '{}' with request '{}: {}'", powerState, HttpGet.METHOD_NAME, url);
return powerState;
}