Merge branch '4.22'

This commit is contained in:
Suresh Kumar Anaparti 2026-05-08 20:57:36 +05:30
commit a4a52c9665
No known key found for this signature in database
GPG Key ID: D7CEAE3A9E71D0AA
47 changed files with 517 additions and 462 deletions

View File

@ -26,17 +26,19 @@ public interface Investigator extends Adapter {
* Returns if the vm is still alive.
*
* @param vm to work on.
* @return true if vm is alive, otherwise false
*/
public boolean isVmAlive(VirtualMachine vm, Host host) throws UnknownVM;
boolean isVmAlive(VirtualMachine vm, Host host) throws UnknownVM;
public Status isAgentAlive(Host agent);
/**
* Returns the agent status of the host.
*
* @param host
* @return status of the host agent
*/
Status getHostAgentStatus(Host host);
class UnknownVM extends Exception {
/**
*
*/
private static final long serialVersionUID = 1L;
};
}

View File

@ -87,6 +87,7 @@ public final class ConfigureHAForHostCmd extends BaseAsyncCmd {
final HostHAResponse response = new HostHAResponse();
response.setId(resourceUuid);
response.setProvider(getHaProvider().toLowerCase());
response.setStatus(result);
response.setResponseName(getCommandName());
setResponseObject(response);
}

View File

@ -38,6 +38,8 @@ public class CheckOnHostAnswer extends Answer {
public CheckOnHostAnswer(CheckOnHostCommand cmd, String details) {
super(cmd, false, details);
determined = false;
alive = false;
}
public boolean isDetermined() {
@ -47,5 +49,4 @@ public class CheckOnHostAnswer extends Answer {
public boolean isAlive() {
return alive;
}
}

View File

@ -24,7 +24,7 @@ import com.cloud.host.Host;
public class CheckOnHostCommand extends Command {
HostTO host;
boolean reportCheckFailureIfOneStorageIsDown;
boolean reportIfHeartBeatFailedForOneStoragePool;
protected CheckOnHostCommand() {
}
@ -34,17 +34,17 @@ public class CheckOnHostCommand extends Command {
setWait(20);
}
public CheckOnHostCommand(Host host, boolean reportCheckFailureIfOneStorageIsDown) {
public CheckOnHostCommand(Host host, boolean reportIfHeartBeatFailedForOneStoragePool) {
this(host);
this.reportCheckFailureIfOneStorageIsDown = reportCheckFailureIfOneStorageIsDown;
this.reportIfHeartBeatFailedForOneStoragePool = reportIfHeartBeatFailedForOneStoragePool;
}
public HostTO getHost() {
return host;
}
public boolean isCheckFailedOnOneStorage() {
return reportCheckFailureIfOneStorageIsDown;
public boolean shouldReportIfHeartBeatFailedForOneStoragePool() {
return reportIfHeartBeatFailedForOneStoragePool;
}
@Override

View File

@ -75,10 +75,10 @@ public interface HighAvailabilityManager extends Manager {
+ " which are registered for the HA event that were successful and are now ready to be purged.",
true, Cluster);
public static final ConfigKey<Boolean> KvmHAFenceHostIfHeartbeatFailsOnStorage = new ConfigKey<>("Advanced", Boolean.class, "kvm.ha.fence.on.storage.heartbeat.failure", "false",
ConfigKey<Boolean> KvmHAFenceHostIfHeartbeatFailsOnStorage = new ConfigKey<>("Advanced", Boolean.class, "kvm.ha.fence.on.storage.heartbeat.failure", "false",
"Proceed fencing the host even the heartbeat failed for only one storage pool", false, ConfigKey.Scope.Zone);
public enum WorkType {
enum WorkType {
Migration, // Migrating VMs off of a host.
Stop, // Stops a VM for storage pool migration purposes. This should be obsolete now.
CheckStop, // Checks if a VM has been stopped.

View File

@ -395,9 +395,9 @@ public class VolumeServiceImpl implements VolumeService {
}
// Find out if the volume is at state of download_in_progress on secondary storage
VolumeDataStoreVO volumeStore = _volumeStoreDao.findByVolume(volume.getId());
if (volumeStore != null) {
if (volumeStore.getDownloadState() == VMTemplateStorageResourceAssoc.Status.DOWNLOAD_IN_PROGRESS) {
VolumeDataStoreVO volumeOnImageStore = _volumeStoreDao.findByVolume(volume.getId());
if (volumeOnImageStore != null) {
if (volumeOnImageStore.getDownloadState() == VMTemplateStorageResourceAssoc.Status.DOWNLOAD_IN_PROGRESS) {
String msg = String.format("Volume: %s is currently being uploaded; can't delete it.", volume);
logger.debug(msg);
result.setSuccess(false);
@ -416,10 +416,10 @@ public class VolumeServiceImpl implements VolumeService {
if (!volumeExistsOnPrimary(vol)) {
// not created on primary store
if (volumeStore == null) {
if (volumeOnImageStore == null) {
// also not created on secondary store
if (logger.isDebugEnabled()) {
logger.debug("Marking volume that was never created as destroyed: " + vol);
logger.debug("Marking volume that was never created as destroyed: {}", vol);
}
VMTemplateVO template = templateDao.findById(vol.getTemplateId());
if (template != null && !template.isDeployAsIs()) {
@ -435,11 +435,21 @@ public class VolumeServiceImpl implements VolumeService {
if (volume.getDataStore().getRole() == DataStoreRole.Image) {
// no need to change state in volumes table
volume.processEventOnly(Event.DestroyRequested);
if (volumeOnImageStore == null) {
logger.debug("Volume {} doesn't exist on image store, no need to delete", vol);
future.complete(result);
return future;
}
} else if (volume.getDataStore().getRole() == DataStoreRole.Primary) {
if (vol.getState() == Volume.State.Expunging) {
logger.info("Volume {} is already in Expunging, retrying", volume);
}
volume.processEvent(Event.ExpungeRequested);
if (!volumeExistsOnPrimary(vol)) {
logger.debug("Volume {} doesn't exist on primary storage, no need to delete", vol);
future.complete(result);
return future;
}
}
DeleteVolumeContext<VolumeApiResult> context = new DeleteVolumeContext<>(null, vo, future);
@ -460,13 +470,11 @@ public class VolumeServiceImpl implements VolumeService {
private boolean volumeExistsOnPrimary(VolumeVO vol) {
Long poolId = vol.getPoolId();
if (poolId == null) {
return false;
}
PrimaryDataStore primaryStore = dataStoreMgr.getPrimaryDataStore(poolId);
if (primaryStore == null) {
return false;
}
@ -476,8 +484,7 @@ public class VolumeServiceImpl implements VolumeService {
}
String volumePath = vol.getPath();
if (volumePath == null || volumePath.trim().isEmpty()) {
if (StringUtils.isBlank(volumePath)) {
return false;
}

View File

@ -1663,14 +1663,14 @@ public class ExtensionsManagerImpl extends ManagerBase implements ExtensionsMana
public List<String> getExtensionReservedResourceDetails(long extensionId) {
ExtensionDetailsVO detailsVO = extensionDetailsDao.findDetail(extensionId,
ApiConstants.RESERVED_RESOURCE_DETAILS);
if (detailsVO == null || !StringUtils.isNotBlank(detailsVO.getValue())) {
return Collections.emptyList();
}
List<String> reservedDetails = new ArrayList<>();
String[] parts = detailsVO.getValue().split(",");
for (String part : parts) {
if (StringUtils.isNotBlank(part)) {
reservedDetails.add(part.trim());
if (detailsVO != null && StringUtils.isNotBlank(detailsVO.getValue())) {
String[] parts = detailsVO.getValue().split(",");
for (String part : parts) {
String trimmedPart = part.trim();
if (StringUtils.isNotBlank(trimmedPart)) {
reservedDetails.add(trimmedPart);
}
}
}
addInbuiltExtensionReservedResourceDetails(extensionId, reservedDetails);

View File

@ -41,15 +41,15 @@ public class HypervInvestigator extends AdapterBase implements Investigator {
@Override
public boolean isVmAlive(com.cloud.vm.VirtualMachine vm, Host host) throws UnknownVM {
Status status = isAgentAlive(host);
Status status = getHostAgentStatus(host);
if (status == null) {
throw new UnknownVM();
}
return status == Status.Up ? true : null;
return status == Status.Up;
}
@Override
public Status isAgentAlive(Host agent) {
public Status getHostAgentStatus(Host agent) {
if (agent.getHypervisorType() != Hypervisor.HypervisorType.Hyperv) {
return null;
}

View File

@ -19,10 +19,7 @@
package com.cloud.ha;
import com.cloud.agent.AgentManager;
import com.cloud.agent.api.Answer;
import com.cloud.agent.api.CheckOnHostCommand;
import com.cloud.host.Host;
import com.cloud.host.HostVO;
import com.cloud.host.Status;
import com.cloud.host.dao.HostDao;
import com.cloud.hypervisor.Hypervisor;
@ -34,11 +31,12 @@ import org.apache.cloudstack.engine.subsystem.api.storage.DataStoreProvider;
import org.apache.cloudstack.engine.subsystem.api.storage.DataStoreProviderManager;
import org.apache.cloudstack.engine.subsystem.api.storage.PrimaryDataStoreDriver;
import org.apache.cloudstack.ha.HAManager;
import org.apache.cloudstack.kvm.ha.KVMHostActivityChecker;
import org.apache.cloudstack.storage.datastore.db.PrimaryDataStoreDao;
import org.apache.cloudstack.storage.datastore.db.StoragePoolVO;
import javax.inject.Inject;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
public class KVMInvestigator extends AdapterBase implements Investigator {
@ -54,13 +52,15 @@ public class KVMInvestigator extends AdapterBase implements Investigator {
private HAManager haManager;
@Inject
private DataStoreProviderManager dataStoreProviderMgr;
@Inject
private KVMHostActivityChecker hostActivityChecker;
@Override
public boolean isVmAlive(com.cloud.vm.VirtualMachine vm, Host host) throws UnknownVM {
if (haManager.isHAEligible(host)) {
return haManager.isVMAliveOnHost(host);
}
Status status = isAgentAlive(host);
Status status = getHostAgentStatus(host);
logger.debug("HA: HOST is ineligible legacy state {} for host {}", status, host);
if (status == null) {
throw new UnknownVM();
@ -73,86 +73,41 @@ public class KVMInvestigator extends AdapterBase implements Investigator {
}
@Override
public Status isAgentAlive(Host agent) {
if (agent.getHypervisorType() != Hypervisor.HypervisorType.KVM && agent.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
public Status getHostAgentStatus(Host host) {
if (host.getHypervisorType() != Hypervisor.HypervisorType.KVM && host.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
return null;
}
if (haManager.isHAEligible(agent)) {
return haManager.getHostStatus(agent);
if (haManager.isHAEligible(host)) {
return haManager.getHostStatusFromHAConfig(host);
}
List<StoragePoolVO> clusterPools = _storagePoolDao.findPoolsInClusters(Arrays.asList(agent.getClusterId()), null);
boolean storageSupportHA = storageSupportHa(clusterPools);
if (!storageSupportHA) {
List<StoragePoolVO> zonePools = _storagePoolDao.findZoneWideStoragePoolsByHypervisor(agent.getDataCenterId(), agent.getHypervisorType());
storageSupportHA = storageSupportHa(zonePools);
List<StoragePoolVO> clusterPools = _storagePoolDao.findPoolsInClusters(Collections.singletonList(host.getClusterId()), null);
boolean storageSupportsHA = storageSupportsHA(clusterPools);
if (!storageSupportsHA) {
List<StoragePoolVO> zonePools = _storagePoolDao.findZoneWideStoragePoolsByHypervisor(host.getDataCenterId(), host.getHypervisorType());
storageSupportsHA = storageSupportsHA(zonePools);
}
if (!storageSupportHA) {
logger.warn("Agent investigation was requested on host {}, but host does not support investigation because it has no NFS storage. Skipping investigation.", agent);
if (!storageSupportsHA) {
logger.warn("Agent investigation was requested on host {}, but host does not support investigation" +
" because it has no HA supported storage. Skipping investigation.", host);
return null;
}
Status hostStatus = null;
Status neighbourStatus = null;
boolean reportFailureIfOneStorageIsDown = HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.value();
CheckOnHostCommand cmd = new CheckOnHostCommand(agent, reportFailureIfOneStorageIsDown);
try {
Answer answer = _agentMgr.easySend(agent.getId(), cmd);
if (answer != null) {
hostStatus = answer.getResult() ? Status.Down : Status.Up;
}
} catch (Exception e) {
logger.debug("Failed to send command to host: {}", agent);
}
if (hostStatus == null) {
hostStatus = Status.Disconnected;
}
List<HostVO> neighbors = _resourceMgr.listHostsInClusterByStatus(agent.getClusterId(), Status.Up);
for (HostVO neighbor : neighbors) {
if (neighbor.getId() == agent.getId()
|| (neighbor.getHypervisorType() != Hypervisor.HypervisorType.KVM && neighbor.getHypervisorType() != Hypervisor.HypervisorType.LXC)) {
continue;
}
logger.debug("Investigating host:{} via neighbouring host:{}", agent, neighbor);
try {
Answer answer = _agentMgr.easySend(neighbor.getId(), cmd);
if (answer != null) {
neighbourStatus = answer.getResult() ? Status.Down : Status.Up;
logger.debug("Neighbouring host:{} returned status:{} for the investigated host:{}", neighbor, neighbourStatus, agent);
if (neighbourStatus == Status.Up) {
break;
}
}
} catch (Exception e) {
logger.debug("Failed to send command to host: {}", neighbor);
}
}
if (neighbourStatus == Status.Up && (hostStatus == Status.Disconnected || hostStatus == Status.Down)) {
hostStatus = Status.Disconnected;
}
if (neighbourStatus == Status.Down && (hostStatus == Status.Disconnected || hostStatus == Status.Down)) {
hostStatus = Status.Down;
}
logger.debug("HA: HOST is ineligible legacy state {} for host {}", hostStatus, agent);
return hostStatus;
return hostActivityChecker.getHostAgentStatus(host);
}
private boolean storageSupportHa(List<StoragePoolVO> pools) {
boolean storageSupportHA = false;
private boolean storageSupportsHA(List<StoragePoolVO> pools) {
for (StoragePoolVO pool : pools) {
DataStoreProvider storeProvider = dataStoreProviderMgr.getDataStoreProvider(pool.getStorageProviderName());
DataStoreDriver storeDriver = storeProvider.getDataStoreDriver();
if (storeDriver instanceof PrimaryDataStoreDriver) {
PrimaryDataStoreDriver primaryStoreDriver = (PrimaryDataStoreDriver)storeDriver;
if (primaryStoreDriver.isStorageSupportHA(pool.getPoolType())) {
storageSupportHA = true;
break;
return true;
}
}
}
return storageSupportHA;
return false;
}
}

View File

@ -35,10 +35,9 @@ import com.cloud.agent.properties.AgentPropertiesFileHandler;
public class KVMHABase {
protected Logger logger = LogManager.getLogger(getClass());
private long _timeout = 60000; /* 1 minutes */
protected long _heartBeatUpdateTimeout = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.HEARTBEAT_UPDATE_TIMEOUT);
protected long _heartBeatUpdateFreq = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_FREQUENCY);
protected long _heartBeatUpdateFreqInMs = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_FREQUENCY);
protected long _heartBeatUpdateMaxTries = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_MAX_TRIES);
protected long _heartBeatUpdateRetrySleep = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_RETRY_SLEEP);
protected long _heartBeatUpdateRetrySleepInMs = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_RETRY_SLEEP);
public static enum PoolType {
PrimaryStorage, SecondaryStorage
@ -138,7 +137,7 @@ public class KVMHABase {
/* Can't find the mount point? */
/* we need to mount it under poolName */
if (poolName != null) {
Script mount = new Script("/bin/bash", 60000);
Script mount = new Script("/bin/bash", _timeout);
mount.add("-c");
mount.add("mount " + mountSource + " " + destPath);
String result = mount.execute();
@ -154,7 +153,6 @@ public class KVMHABase {
}
protected String getMountPoint(HAStoragePool storagePool) {
StoragePool pool = null;
String poolName = null;
try {
@ -171,7 +169,6 @@ public class KVMHABase {
}
poolName = pool.getName();
}
} catch (LibvirtException e) {
logger.debug("Ignoring libvirt error.", e);
} finally {
@ -234,7 +231,7 @@ public class KVMHABase {
return result;
}
public Boolean checkingHeartBeat() {
public Boolean hasHeartBeat() {
// TODO Auto-generated method stub
return null;
}

View File

@ -26,44 +26,43 @@ import com.cloud.agent.api.to.HostTO;
public class KVMHAChecker extends KVMHABase implements Callable<Boolean> {
private List<HAStoragePool> storagePools;
private HostTO host;
private boolean reportFailureIfOneStorageIsDown;
private boolean reportIfHeartBeatFailedForOneStoragePool;
public KVMHAChecker(List<HAStoragePool> pools, HostTO host, boolean reportFailureIfOneStorageIsDown) {
public KVMHAChecker(List<HAStoragePool> pools, HostTO host, boolean reportIfHeartBeatFailedForOneStoragePool) {
this.storagePools = pools;
this.host = host;
this.reportFailureIfOneStorageIsDown = reportFailureIfOneStorageIsDown;
this.reportIfHeartBeatFailedForOneStoragePool = reportIfHeartBeatFailedForOneStoragePool;
}
/*
* True means heartbeaing is on going, or we can't get it's status. False
* means heartbeating is stopped definitely
* True means heart beating is on going, or we can't get it's status.
* False means heart beating is stopped definitely.
*/
@Override
public Boolean checkingHeartBeat() {
boolean validResult = false;
String hostAndPools = String.format("host IP [%s] in pools [%s]", host.getPrivateNetwork().getIp(), storagePools.stream().map(pool -> pool.getPoolUUID()).collect(Collectors.joining(", ")));
logger.debug(String.format("Checking heart beat with KVMHAChecker for %s", hostAndPools));
public Boolean hasHeartBeat() {
String hostAndPools = String.format("host IP [%s] in pools [%s]", host.getPrivateNetwork().getIp(),
storagePools.stream().map(pool -> pool.getPoolUUID()).collect(Collectors.joining(", ")));
logger.debug("Checking heart beat with KVMHAChecker for {}", hostAndPools);
boolean heartBeatCheckResult = false;
for (HAStoragePool pool : storagePools) {
validResult = pool.getPool().checkingHeartBeat(pool, host);
if (reportFailureIfOneStorageIsDown && !validResult) {
heartBeatCheckResult = pool.getPool().hasHeartBeat(pool, host);
if (reportIfHeartBeatFailedForOneStoragePool && !heartBeatCheckResult) {
break;
}
}
if (!validResult) {
logger.warn(String.format("All checks with KVMHAChecker for %s considered it as dead. It may cause a shutdown of the host.", hostAndPools));
if (!heartBeatCheckResult) {
logger.warn("All checks with KVMHAChecker for {} considered it as dead. It may cause a shutdown of the host.", hostAndPools);
}
return validResult;
return heartBeatCheckResult;
}
@Override
public Boolean call() throws Exception {
// logger.addAppender(new org.apache.log4j.ConsoleAppender(new
// org.apache.log4j.PatternLayout(), "System.out"));
return checkingHeartBeat();
return hasHeartBeat();
}
}

View File

@ -34,53 +34,49 @@ import java.util.concurrent.ConcurrentHashMap;
public class KVMHAMonitor extends KVMHABase implements Runnable {
private final Map<String, HAStoragePool> storagePool = new ConcurrentHashMap<>();
private final Map<String, HAStoragePool> haStoragePools = new ConcurrentHashMap<>();
private final boolean rebootHostAndAlertManagementOnHeartbeatTimeout;
private final String hostPrivateIp;
public KVMHAMonitor(HAStoragePool pool, String host) {
if (pool != null) {
storagePool.put(pool.getPoolUUID(), pool);
}
public KVMHAMonitor(String host) {
hostPrivateIp = host;
rebootHostAndAlertManagementOnHeartbeatTimeout = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.REBOOT_HOST_AND_ALERT_MANAGEMENT_ON_HEARTBEAT_TIMEOUT);
}
public void addStoragePool(HAStoragePool pool) {
synchronized (storagePool) {
storagePool.put(pool.getPoolUUID(), pool);
synchronized (haStoragePools) {
haStoragePools.put(pool.getPoolUUID(), pool);
}
}
public void removeStoragePool(String uuid) {
synchronized (storagePool) {
HAStoragePool pool = storagePool.get(uuid);
synchronized (haStoragePools) {
HAStoragePool pool = haStoragePools.get(uuid);
if (pool != null) {
Script.runSimpleBashScript("umount " + pool.getMountDestPath());
storagePool.remove(uuid);
haStoragePools.remove(uuid);
}
}
}
public List<HAStoragePool> getStoragePools() {
synchronized (storagePool) {
return new ArrayList<>(storagePool.values());
synchronized (haStoragePools) {
return new ArrayList<>(haStoragePools.values());
}
}
public HAStoragePool getStoragePool(String uuid) {
synchronized (storagePool) {
return storagePool.get(uuid);
synchronized (haStoragePools) {
return haStoragePools.get(uuid);
}
}
protected void runHeartBeat() {
synchronized (storagePool) {
synchronized (haStoragePools) {
Set<String> removedPools = new HashSet<>();
for (String uuid : storagePool.keySet()) {
HAStoragePool primaryStoragePool = storagePool.get(uuid);
for (String uuid : haStoragePools.keySet()) {
HAStoragePool primaryStoragePool = haStoragePools.get(uuid);
if (HighAvailabilityManager.LIBVIRT_STORAGE_POOL_TYPES_WITH_HA_SUPPORT.contains(primaryStoragePool.getPool().getType())) {
checkForNotExistingLibvirtStoragePools(removedPools, uuid);
if (removedPools.contains(uuid)) {
@ -91,7 +87,7 @@ public class KVMHAMonitor extends KVMHABase implements Runnable {
result = executePoolHeartBeatCommand(uuid, primaryStoragePool, result);
if (result != null && rebootHostAndAlertManagementOnHeartbeatTimeout) {
logger.warn(String.format("Write heartbeat for pool [%s] failed: %s; stopping cloudstack-agent.", uuid, result));
logger.warn("Write heartbeat for pool [{}] failed: {}; stopping cloudstack-agent.", uuid, result);
primaryStoragePool.getPool().createHeartBeatCommand(primaryStoragePool, null, false);;
}
}
@ -104,20 +100,18 @@ public class KVMHAMonitor extends KVMHABase implements Runnable {
}
private String executePoolHeartBeatCommand(String uuid, HAStoragePool primaryStoragePool, String result) {
for (int i = 1; i <= _heartBeatUpdateMaxTries; i++) {
for (int attempt = 1; attempt <= _heartBeatUpdateMaxTries; attempt++) {
result = primaryStoragePool.getPool().createHeartBeatCommand(primaryStoragePool, hostPrivateIp, true);
if (result != null) {
logger.warn(String.format("Write heartbeat for pool [%s] failed: %s; try: %s of %s.", uuid, result, i, _heartBeatUpdateMaxTries));
try {
Thread.sleep(_heartBeatUpdateRetrySleep);
} catch (InterruptedException e) {
logger.debug("[IGNORED] Interrupted between heartbeat retries.", e);
}
} else {
if (result == null) {
break;
}
logger.warn("Write heartbeat for pool [{}] failed: {}; try: {} of {}.", uuid, result, attempt, _heartBeatUpdateMaxTries);
try {
Thread.sleep(_heartBeatUpdateRetrySleepInMs);
} catch (InterruptedException e) {
logger.debug("[IGNORED] Interrupted between heartbeat retries.", e);
}
}
return result;
}
@ -128,21 +122,21 @@ public class KVMHAMonitor extends KVMHABase implements Runnable {
StoragePool storage = conn.storagePoolLookupByUUIDString(uuid);
if (storage == null || storage.getInfo().state != StoragePoolState.VIR_STORAGE_POOL_RUNNING) {
if (storage == null) {
logger.debug(String.format("Libvirt storage pool [%s] not found, removing from HA list.", uuid));
logger.debug("Libvirt storage pool [{}] not found, removing from HA list.", uuid);
} else {
logger.debug(String.format("Libvirt storage pool [%s] found, but not running, removing from HA list.", uuid));
logger.debug("Libvirt storage pool [{}] found, but not running, removing from HA list.", uuid);
}
removedPools.add(uuid);
}
logger.debug(String.format("Found NFS storage pool [%s] in libvirt, continuing.", uuid));
logger.debug("Found NFS storage pool [{}] in libvirt, continuing.", uuid);
} catch (LibvirtException e) {
logger.debug(String.format("Failed to lookup libvirt storage pool [%s].", uuid), e);
logger.debug("Failed to lookup libvirt storage pool [{}].", uuid, e);
if (e.toString().contains("pool not found")) {
logger.debug(String.format("Removing pool [%s] from HA monitor since it was deleted.", uuid));
logger.debug("Removing pool [{}] from HA monitor since it was deleted.", uuid);
removedPools.add(uuid);
}
}
@ -155,11 +149,10 @@ public class KVMHAMonitor extends KVMHABase implements Runnable {
runHeartBeat();
try {
Thread.sleep(_heartBeatUpdateFreq);
Thread.sleep(_heartBeatUpdateFreqInMs);
} catch (InterruptedException e) {
logger.debug("[IGNORED] Interrupted between heartbeats.", e);
}
}
}
}

View File

@ -39,12 +39,12 @@ public class KVMHAVMActivityChecker extends KVMHABase implements Callable<Boolea
}
@Override
public Boolean checkingHeartBeat() {
return this.storagePool.getPool().vmActivityCheck(storagePool, host, activityScriptTimeout, volumeUuidList, vmActivityCheckPath, suspectTimeInSeconds);
public Boolean hasHeartBeat() {
return this.storagePool.getPool().hasVmActivity(storagePool, host, activityScriptTimeout, volumeUuidList, vmActivityCheckPath, suspectTimeInSeconds);
}
@Override
public Boolean call() throws Exception {
return checkingHeartBeat();
return hasHeartBeat();
}
}

View File

@ -1388,9 +1388,9 @@ public class LibvirtComputingResource extends ServerResourceBase implements Serv
final String[] info = NetUtils.getNetworkParams(privateNic);
kvmhaMonitor = new KVMHAMonitor(null, info[0]);
final Thread ha = new Thread(kvmhaMonitor);
ha.start();
kvmhaMonitor = new KVMHAMonitor(info[0]);
final Thread haMonitorThread = new Thread(kvmhaMonitor);
haMonitorThread.start();
storagePoolManager = new KVMStoragePoolManager(storageLayer, kvmhaMonitor);

View File

@ -26,6 +26,7 @@ import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import com.cloud.agent.api.Answer;
import com.cloud.agent.api.CheckOnHostAnswer;
import com.cloud.agent.api.CheckOnHostCommand;
import com.cloud.agent.api.to.HostTO;
import com.cloud.hypervisor.kvm.resource.KVMHABase.HAStoragePool;
@ -45,20 +46,21 @@ public final class LibvirtCheckOnHostCommandWrapper extends CommandWrapper<Check
final List<HAStoragePool> pools = monitor.getStoragePools();
final HostTO host = command.getHost();
final KVMHAChecker ha = new KVMHAChecker(pools, host, command.isCheckFailedOnOneStorage());
final KVMHAChecker ha = new KVMHAChecker(pools, host, command.shouldReportIfHeartBeatFailedForOneStoragePool());
final Future<Boolean> future = executors.submit(ha);
try {
final Boolean result = future.get();
if (result) {
return new Answer(command, false, "Heart is beating...");
final Boolean hasHeartBeat = future.get();
if (hasHeartBeat) {
return new CheckOnHostAnswer(command, true, "Heart is beating");
} else {
return new Answer(command);
return new CheckOnHostAnswer(command, "Heart is not beating");
}
} catch (final InterruptedException e) {
return new Answer(command, false, "CheckOnHostCommand: can't get status of host: InterruptedException");
return new CheckOnHostAnswer(command, "CheckOnHostCommand: can't get status of host: InterruptedException");
} catch (final ExecutionException e) {
return new Answer(command, false, "CheckOnHostCommand: can't get status of host: ExecutionException");
return new CheckOnHostAnswer(command, "CheckOnHostCommand: can't get status of host: ExecutionException");
}
}
}

View File

@ -49,8 +49,8 @@ public final class LibvirtCheckVMActivityOnStoragePoolCommandWrapper extends Com
KVMStoragePool primaryPool = storagePoolMgr.getStoragePool(pool.getType(), pool.getUuid());
if (primaryPool.isPoolSupportHA()) {
final HAStoragePool nfspool = monitor.getStoragePool(pool.getUuid());
final KVMHAVMActivityChecker ha = new KVMHAVMActivityChecker(nfspool, command.getHost(), command.getVolumeList(), libvirtComputingResource.getVmActivityCheckPath(), command.getSuspectTimeInSeconds());
final HAStoragePool haPool = monitor.getStoragePool(pool.getUuid());
final KVMHAVMActivityChecker ha = new KVMHAVMActivityChecker(haPool, command.getHost(), command.getVolumeList(), libvirtComputingResource.getVmActivityCheckPath(), command.getSuspectTimeInSeconds());
final Future<Boolean> future = executors.submit(ha);
try {
final Boolean result = future.get();

View File

@ -208,12 +208,12 @@ public class IscsiAdmStoragePool implements KVMStoragePool {
}
@Override
public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) {
public Boolean hasHeartBeat(HAStoragePool pool, HostTO host) {
return null;
}
@Override
public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration) {
public Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration) {
return null;
}

View File

@ -33,35 +33,33 @@ import com.cloud.storage.Storage.StoragePoolType;
public interface KVMStoragePool {
public static final long HeartBeatUpdateTimeout = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.HEARTBEAT_UPDATE_TIMEOUT);
public static final long HeartBeatUpdateFreq = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_FREQUENCY);
public static final long HeartBeatUpdateMaxTries = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_MAX_TRIES);
public static final long HeartBeatUpdateRetrySleep = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_RETRY_SLEEP);
public static final long HeartBeatCheckerTimeout = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_CHECKER_TIMEOUT);
long HeartBeatUpdateTimeoutInMs = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.HEARTBEAT_UPDATE_TIMEOUT);
long HeartBeatUpdateFreqInMs = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_FREQUENCY);
long HeartBeatCheckerTimeoutInMs = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_CHECKER_TIMEOUT);
public default KVMPhysicalDisk createPhysicalDisk(String volumeUuid, PhysicalDiskFormat format, Storage.ProvisioningType provisioningType, long size, Long usableSize, byte[] passphrase) {
default KVMPhysicalDisk createPhysicalDisk(String volumeUuid, PhysicalDiskFormat format, Storage.ProvisioningType provisioningType, long size, Long usableSize, byte[] passphrase) {
return createPhysicalDisk(volumeUuid, format, provisioningType, size, passphrase);
}
public KVMPhysicalDisk createPhysicalDisk(String volumeUuid, PhysicalDiskFormat format, Storage.ProvisioningType provisioningType, long size, byte[] passphrase);
KVMPhysicalDisk createPhysicalDisk(String volumeUuid, PhysicalDiskFormat format, Storage.ProvisioningType provisioningType, long size, byte[] passphrase);
public KVMPhysicalDisk createPhysicalDisk(String volumeUuid, Storage.ProvisioningType provisioningType, long size, byte[] passphrase);
KVMPhysicalDisk createPhysicalDisk(String volumeUuid, Storage.ProvisioningType provisioningType, long size, byte[] passphrase);
public boolean connectPhysicalDisk(String volumeUuid, Map<String, String> details);
boolean connectPhysicalDisk(String volumeUuid, Map<String, String> details);
public KVMPhysicalDisk getPhysicalDisk(String volumeUuid);
KVMPhysicalDisk getPhysicalDisk(String volumeUuid);
public boolean disconnectPhysicalDisk(String volumeUuid);
boolean disconnectPhysicalDisk(String volumeUuid);
public boolean deletePhysicalDisk(String volumeUuid, Storage.ImageFormat format);
boolean deletePhysicalDisk(String volumeUuid, Storage.ImageFormat format);
public List<KVMPhysicalDisk> listPhysicalDisks();
List<KVMPhysicalDisk> listPhysicalDisks();
public String getUuid();
String getUuid();
public long getCapacity();
long getCapacity();
public long getUsed();
long getUsed();
default Long getCapacityIops() {
return null;
@ -71,51 +69,51 @@ public interface KVMStoragePool {
return null;
}
public long getAvailable();
long getAvailable();
public boolean refresh();
boolean refresh();
public boolean isExternalSnapshot();
boolean isExternalSnapshot();
public String getLocalPath();
String getLocalPath();
public String getSourceHost();
String getSourceHost();
public String getSourceDir();
String getSourceDir();
public int getSourcePort();
int getSourcePort();
public String getAuthUserName();
String getAuthUserName();
public String getAuthSecret();
String getAuthSecret();
public StoragePoolType getType();
StoragePoolType getType();
public boolean delete();
boolean delete();
PhysicalDiskFormat getDefaultFormat();
public boolean createFolder(String path);
boolean createFolder(String path);
public boolean supportsConfigDriveIso();
boolean supportsConfigDriveIso();
public Map<String, String> getDetails();
Map<String, String> getDetails();
default String getLocalPathFor(String relativePath) {
return String.format("%s%s%s", getLocalPath(), File.separator, relativePath);
}
public boolean isPoolSupportHA();
boolean isPoolSupportHA();
public String getHearthBeatPath();
String getHearthBeatPath();
public String createHeartBeatCommand(HAStoragePool primaryStoragePool, String hostPrivateIp, boolean hostValidation);
String createHeartBeatCommand(HAStoragePool primaryStoragePool, String hostPrivateIp, boolean hostValidation);
public String getStorageNodeId();
String getStorageNodeId();
public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host);
Boolean hasHeartBeat(HAStoragePool pool, HostTO host);
public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration);
Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration);
default LibvirtVMDef.DiskDef.BlockIOSize getSupportedLogicalBlockSize() {
return null;

View File

@ -345,16 +345,14 @@ public class LibvirtStoragePool implements KVMStoragePool {
public String createHeartBeatCommand(HAStoragePool primaryStoragePool, String hostPrivateIp, boolean hostValidation) {
Script cmd = new Script(primaryStoragePool.getPool().getHearthBeatPath(), HeartBeatUpdateTimeout, logger);
Script cmd = new Script(primaryStoragePool.getPool().getHearthBeatPath(), HeartBeatUpdateTimeoutInMs, logger);
cmd.add("-i", primaryStoragePool.getPoolIp());
cmd.add("-p", primaryStoragePool.getPoolMountSourcePath());
cmd.add("-m", primaryStoragePool.getMountDestPath());
if (hostValidation) {
cmd.add("-h", hostPrivateIp);
}
if (!hostValidation) {
} else {
cmd.add("-c");
}
@ -372,53 +370,53 @@ public class LibvirtStoragePool implements KVMStoragePool {
}
@Override
public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) {
boolean validResult = false;
public Boolean hasHeartBeat(HAStoragePool pool, HostTO host) {
String hostIp = host.getPrivateNetwork().getIp();
Script cmd = new Script(getHearthBeatPath(), HeartBeatCheckerTimeout, logger);
Script cmd = new Script(getHearthBeatPath(), HeartBeatCheckerTimeoutInMs, logger);
cmd.add("-i", pool.getPoolIp());
cmd.add("-p", pool.getPoolMountSourcePath());
cmd.add("-m", pool.getMountDestPath());
cmd.add("-h", hostIp);
cmd.add("-r");
cmd.add("-t", String.valueOf(HeartBeatUpdateFreq / 1000));
cmd.add("-t", String.valueOf(HeartBeatUpdateFreqInMs / 1000));
OutputInterpreter.OneLineParser parser = new OutputInterpreter.OneLineParser();
String result = cmd.execute(parser);
String parsedLine = parser.getLine();
logger.debug(String.format("Checking heart beat with KVMHAChecker [{command=\"%s\", result: \"%s\", log: \"%s\", pool: \"%s\"}].", cmd.toString(), result, parsedLine,
pool.getPoolIp()));
logger.debug("Checking heart beat for host IP {} with KVMHAChecker [{command=\"{}\", result: \"{}\", log: \"{}\", pool: \"{}\"}].", hostIp, cmd.toString(), result, parsedLine, pool.getPoolIp());
if (result == null && parsedLine.contains("DEAD")) {
logger.warn(String.format("Checking heart beat with KVMHAChecker command [%s] returned [%s]. [%s]. It may cause a shutdown of host IP [%s].", cmd.toString(),
result, parsedLine, hostIp));
logger.warn("Checking heart beat for host IP {} with KVMHAChecker command [{}] returned [{}]. It may cause a shutdown of the host.", hostIp, cmd.toString(), parsedLine);
return false;
} else {
validResult = true;
logger.debug("Checking heart beat for host IP {} with KVMHAChecker command [{}] succeeded.", hostIp, cmd.toString());
return true;
}
return validResult;
}
@Override
public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration) {
public Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration) {
String hostIp = host.getPrivateNetwork().getIp();
Script cmd = new Script(vmActivityCheckPath, activityScriptTimeout.getStandardSeconds(), logger);
cmd.add("-i", pool.getPoolIp());
cmd.add("-p", pool.getPoolMountSourcePath());
cmd.add("-m", pool.getMountDestPath());
cmd.add("-h", host.getPrivateNetwork().getIp());
cmd.add("-h", hostIp);
cmd.add("-u", volumeUUIDListString);
cmd.add("-t", String.valueOf(String.valueOf(System.currentTimeMillis() / 1000)));
cmd.add("-t", String.valueOf(System.currentTimeMillis() / 1000));
cmd.add("-d", String.valueOf(duration));
OutputInterpreter.OneLineParser parser = new OutputInterpreter.OneLineParser();
String result = cmd.execute(parser);
String parsedLine = parser.getLine();
logger.debug(String.format("Checking heart beat with KVMHAVMActivityChecker [{command=\"%s\", result: \"%s\", log: \"%s\", pool: \"%s\"}].", cmd.toString(), result, parsedLine, pool.getPoolIp()));
logger.debug("Checking VM activity for host IP {} with KVMHAVMActivityChecker [{command=\"{}\", result: \"{}\", log: \"{}\", pool: \"{}\"}].", hostIp, cmd.toString(), result, parsedLine, pool.getPoolIp());
if (result == null && parsedLine.contains("DEAD")) {
logger.warn(String.format("Checking heart beat with KVMHAVMActivityChecker command [%s] returned [%s]. It is [%s]. It may cause a shutdown of host IP [%s].", cmd.toString(), result, parsedLine, host.getPrivateNetwork().getIp()));
logger.warn("Checking VM activity for host IP {} with KVMHAVMActivityChecker command [{}] returned [{}]. It may cause a shutdown of the host.", hostIp, cmd.toString(), parsedLine);
return false;
} else {
logger.debug("Checking VM activity for host IP {} with KVMHAVMActivityChecker command [{}] succeeded.", hostIp, cmd.toString());
return true;
}
}

View File

@ -225,13 +225,13 @@ public class MultipathSCSIPool implements KVMStoragePool {
}
@Override
public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) {
public Boolean hasHeartBeat(HAStoragePool pool, HostTO host) {
return null;
}
@Override
public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration activityScriptTimeout,
String volumeUUIDListString, String vmActivityCheckPath, long duration) {
public Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration activityScriptTimeout,
String volumeUUIDListString, String vmActivityCheckPath, long duration) {
return null;
}

View File

@ -236,12 +236,12 @@ public class ScaleIOStoragePool implements KVMStoragePool {
}
@Override
public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) {
public Boolean hasHeartBeat(HAStoragePool pool, HostTO host) {
return null;
}
@Override
public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration) {
public Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration) {
return null;
}
}

View File

@ -19,38 +19,37 @@ package org.apache.cloudstack.kvm.ha;
import org.apache.cloudstack.framework.config.ConfigKey;
public class KVMHAConfig {
public interface KVMHAConfig {
public static final ConfigKey<Long> KvmHAHealthCheckTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.health.check.timeout", "10",
ConfigKey<Long> KvmHAHealthCheckTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.health.check.timeout", "10",
"The maximum length of time, in seconds, expected for an health check to complete.", true, ConfigKey.Scope.Cluster);
public static final ConfigKey<Long> KvmHAActivityCheckTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.activity.check.timeout", "60",
ConfigKey<Long> KvmHAActivityCheckTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.activity.check.timeout", "60",
"The maximum length of time, in seconds, expected for an activity check to complete.", true, ConfigKey.Scope.Cluster);
public static final ConfigKey<Long> KvmHAActivityCheckInterval = new ConfigKey<>("Advanced", Long.class, "kvm.ha.activity.check.interval", "60",
ConfigKey<Long> KvmHAActivityCheckInterval = new ConfigKey<>("Advanced", Long.class, "kvm.ha.activity.check.interval", "60",
"The interval, in seconds, between activity checks.", true, ConfigKey.Scope.Cluster);
public static final ConfigKey<Long> KvmHAActivityCheckMaxAttempts = new ConfigKey<>("Advanced", Long.class, "kvm.ha.activity.check.max.attempts", "10",
ConfigKey<Long> KvmHAActivityCheckMaxAttempts = new ConfigKey<>("Advanced", Long.class, "kvm.ha.activity.check.max.attempts", "10",
"The maximum number of activity check attempts to perform before deciding to recover or degrade a resource.", true, ConfigKey.Scope.Cluster);
public static final ConfigKey<Double> KvmHAActivityCheckFailureThreshold = new ConfigKey<>("Advanced", Double.class, "kvm.ha.activity.check.failure.ratio", "0.7",
ConfigKey<Double> KvmHAActivityCheckFailureThreshold = new ConfigKey<>("Advanced", Double.class, "kvm.ha.activity.check.failure.ratio", "0.7",
"The activity check failure threshold ratio. This is used with the activity check maximum attempts for deciding to recover or degrade a resource. For most environments, please keep this value above 0.5.",
true, ConfigKey.Scope.Cluster);
public static final ConfigKey<Long> KvmHADegradedMaxPeriod = new ConfigKey<>("Advanced", Long.class, "kvm.ha.degraded.max.period", "300",
ConfigKey<Long> KvmHADegradedMaxPeriod = new ConfigKey<>("Advanced", Long.class, "kvm.ha.degraded.max.period", "300",
"The maximum length of time, in seconds, a resource can be in degraded state where only health checks are performed.", true, ConfigKey.Scope.Cluster);
public static final ConfigKey<Long> KvmHARecoverTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.recover.timeout", "60",
ConfigKey<Long> KvmHARecoverTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.recover.timeout", "60",
"The maximum length of time, in seconds, expected for a recovery operation to complete.", true, ConfigKey.Scope.Cluster);
public static final ConfigKey<Long> KvmHARecoverWaitPeriod = new ConfigKey<>("Advanced", Long.class, "kvm.ha.recover.wait.period", "600",
ConfigKey<Long> KvmHARecoverWaitPeriod = new ConfigKey<>("Advanced", Long.class, "kvm.ha.recover.wait.period", "600",
"The maximum length of time, in seconds, to wait for a resource to recover.", true, ConfigKey.Scope.Cluster);
public static final ConfigKey<Long> KvmHARecoverAttemptThreshold = new ConfigKey<>("Advanced", Long.class, "kvm.ha.recover.failure.threshold", "1",
ConfigKey<Long> KvmHARecoverAttemptThreshold = new ConfigKey<>("Advanced", Long.class, "kvm.ha.recover.failure.threshold", "1",
"The maximum recovery attempts to be made for a resource, after which the resource is fenced. The recovery counter resets when a health check passes for a resource.",
true, ConfigKey.Scope.Cluster);
public static final ConfigKey<Long> KvmHAFenceTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.fence.timeout", "60",
ConfigKey<Long> KvmHAFenceTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.fence.timeout", "60",
"The maximum length of time, in seconds, expected for a fence operation to complete.", true, ConfigKey.Scope.Cluster);
}

View File

@ -68,17 +68,18 @@ public final class KVMHAProvider extends HAAbstractHostProvider implements HAPro
@Override
public boolean recover(Host r) throws HARecoveryException {
logger.debug("Recover the host {}", r);
try {
if (outOfBandManagementService.isOutOfBandManagementEnabled(r)){
if (outOfBandManagementService.isOutOfBandManagementEnabled(r)) {
final OutOfBandManagementResponse resp = outOfBandManagementService.executePowerOperation(r, PowerOperation.RESET, null);
return resp.getSuccess();
} else {
logger.warn("OOBM recover operation failed for the host {}", r);
return false;
}
} catch (Exception e){
} catch (Exception e) {
logger.warn("OOBM service is not configured or enabled for this host {} error is {}", r, e.getMessage());
throw new HARecoveryException(String.format(" OOBM service is not configured or enabled for this host %s", r), e);
throw new HARecoveryException(String.format("OOBM service is not configured or enabled for this host %s", r), e);
}
}

View File

@ -19,6 +19,7 @@ package org.apache.cloudstack.kvm.ha;
import com.cloud.agent.AgentManager;
import com.cloud.agent.api.Answer;
import com.cloud.agent.api.CheckOnHostAnswer;
import com.cloud.agent.api.CheckOnHostCommand;
import com.cloud.agent.api.CheckVMActivityOnStoragePoolCommand;
import com.cloud.dc.dao.ClusterDao;
@ -61,7 +62,7 @@ public class KVMHostActivityChecker extends AdapterBase implements ActivityCheck
@Inject
private AgentManager agentMgr;
@Inject
private PrimaryDataStoreDao storagePool;
private PrimaryDataStoreDao storagePoolDao;
@Inject
private StorageManager storageManager;
@Inject
@ -70,11 +71,11 @@ public class KVMHostActivityChecker extends AdapterBase implements ActivityCheck
@Override
public boolean isActive(Host r, DateTime suspectTime) throws HACheckerException {
try {
return isVMActivityOnHost(r, suspectTime);
return hasVMActivityOnHost(r, suspectTime);
} catch (HACheckerException e) {
//Re-throwing the exception to avoid poluting the 'HACheckerException' already thrown
//Re-throwing the exception to avoid polluting the 'HACheckerException' already thrown
throw e;
} catch (Exception e){
} catch (Exception e) {
String message = String.format("Operation timed out, probably the %s is not reachable.", r.toString());
logger.warn(message, e);
throw new HACheckerException(message, e);
@ -83,82 +84,115 @@ public class KVMHostActivityChecker extends AdapterBase implements ActivityCheck
@Override
public boolean isHealthy(Host r) {
return isAgentActive(r);
return isHostAgentUp(r);
}
private boolean isAgentActive(Host agent) {
if (agent.getHypervisorType() != Hypervisor.HypervisorType.KVM && agent.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
throw new IllegalStateException(String.format("Calling KVM investigator for non KVM Host of type [%s].", agent.getHypervisorType()));
private boolean isHostAgentUp(Host host) {
if (host.getHypervisorType() != Hypervisor.HypervisorType.KVM && host.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
throw new IllegalStateException(String.format("Calling KVM investigator for non KVM Host of type [%s].", host.getHypervisorType()));
}
Status hostStatus = Status.Unknown;
Status neighbourStatus = Status.Unknown;
final CheckOnHostCommand cmd = new CheckOnHostCommand(agent, HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.value());
try {
logger.debug(String.format("Checking %s status...", agent.toString()));
Answer answer = agentMgr.easySend(agent.getId(), cmd);
if (answer != null) {
hostStatus = answer.getResult() ? Status.Down : Status.Up;
logger.debug(String.format("%s has the status [%s].", agent.toString(), hostStatus));
if ( hostStatus == Status.Up ){
return true;
Status hostStatus = getHostAgentStatus(host);
logger.debug("{} has the status [{}].", host.toString(), hostStatus);
return hostStatus == Status.Up;
}
public Status getHostAgentStatus(Host host) {
if (host.getHypervisorType() != Hypervisor.HypervisorType.KVM && host.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
return null;
}
Status hostStatusFromItself = checkHostStatusWithSameHost(host);
if (hostStatusFromItself == Status.Up) {
return Status.Up;
}
Status hostStatusFromNeighbour = checkHostStatusWithNeighbourHosts(host);
Status hostStatus = hostStatusFromItself;
if (hostStatusFromNeighbour == Status.Up && (hostStatusFromItself == Status.Disconnected || hostStatusFromItself == Status.Down)) {
hostStatus = Status.Disconnected;
}
if (hostStatusFromNeighbour == Status.Down && (hostStatusFromItself == Status.Disconnected || hostStatusFromItself == Status.Down)) {
hostStatus = Status.Down;
}
logger.debug("HA: HOST is ineligible legacy state {} for host {}", hostStatus, host);
return hostStatus;
}
private Status checkHostStatusWithSameHost(Host host) {
Status hostStatus;
boolean reportFailureIfOneStorageIsDown = HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.value();
final CheckOnHostCommand cmd = new CheckOnHostCommand(host, reportFailureIfOneStorageIsDown);
try {
logger.debug("Checking {} status...", host.toString());
Answer answer = agentMgr.easySend(host.getId(), cmd);
if (answer != null) {
if (answer.getResult()) {
hostStatus = ((CheckOnHostAnswer)answer).isAlive() ? Status.Up : Status.Down;
} else {
logger.debug("{} is not active according to itself, details: {}.", host.toString(), answer.getDetails());
hostStatus = Status.Down;
}
}
else {
logger.debug(String.format("Setting %s to \"Disconnected\" status.", agent.toString()));
logger.debug("{} has the status [{}].", host.toString(), hostStatus);
} else {
logger.debug("Setting {} to \"Disconnected\" status.", host.toString());
hostStatus = Status.Disconnected;
}
} catch (Exception e) {
logger.warn(String.format("Failed to send command CheckOnHostCommand to %s.", agent.toString()), e);
logger.warn("Failed to send command CheckOnHostCommand to {}.", host.toString(), e);
hostStatus = Status.Disconnected;
}
List<HostVO> neighbors = resourceManager.listHostsInClusterByStatus(agent.getClusterId(), Status.Up);
return hostStatus;
}
private Status checkHostStatusWithNeighbourHosts(Host host) {
Status hostStatusFromNeighbour = Status.Unknown;
boolean reportFailureIfOneStorageIsDown = HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.value();
final CheckOnHostCommand cmd = new CheckOnHostCommand(host, reportFailureIfOneStorageIsDown);
List<HostVO> neighbors = resourceManager.listHostsInClusterByStatus(host.getClusterId(), Status.Up);
for (HostVO neighbor : neighbors) {
if (neighbor.getId() == agent.getId() || (neighbor.getHypervisorType() != Hypervisor.HypervisorType.KVM && neighbor.getHypervisorType() != Hypervisor.HypervisorType.LXC)) {
if (neighbor.getId() == host.getId()
|| (neighbor.getHypervisorType() != Hypervisor.HypervisorType.KVM && neighbor.getHypervisorType() != Hypervisor.HypervisorType.LXC)) {
continue;
}
try {
logger.debug(String.format("Investigating %s via neighbouring %s.", agent.toString(), neighbor.toString()));
logger.debug("Investigating {} via neighboring {}.", host.toString(), neighbor.toString());
Answer answer = agentMgr.easySend(neighbor.getId(), cmd);
if (answer != null) {
neighbourStatus = answer.getResult() ? Status.Down : Status.Up;
logger.debug(String.format("Neighbouring %s returned status [%s] for the investigated %s.", neighbor.toString(), neighbourStatus, agent.toString()));
if (neighbourStatus == Status.Up) {
break;
if (answer.getResult()) {
hostStatusFromNeighbour = ((CheckOnHostAnswer)answer).isAlive() ? Status.Up : Status.Down;
logger.debug("Neighboring {} returned status [{}] for the investigated {}.", neighbor.toString(), hostStatusFromNeighbour, host.toString());
if (hostStatusFromNeighbour == Status.Up) {
return hostStatusFromNeighbour;
}
} else {
logger.debug("{} is not active according to neighbor {}, details: {}.", host.toString(), neighbor.toString(), answer.getDetails());
}
} else {
logger.debug(String.format("Neighbouring %s is Disconnected.", neighbor.toString()));
logger.debug("Neighboring {} is Disconnected.", neighbor.toString());
}
} catch (Exception e) {
logger.warn(String.format("Failed to send command CheckOnHostCommand to %s.", neighbor.toString()), e);
logger.warn("Failed to send command CheckOnHostCommand to neighbor {}.", neighbor.toString(), e);
}
}
if (neighbourStatus == Status.Up && (hostStatus == Status.Disconnected || hostStatus == Status.Down)) {
hostStatus = Status.Disconnected;
}
if (neighbourStatus == Status.Down && (hostStatus == Status.Disconnected || hostStatus == Status.Down)) {
hostStatus = Status.Down;
}
logger.debug(String.format("%s has the status [%s].", agent.toString(), hostStatus));
return hostStatus == Status.Up;
return hostStatusFromNeighbour;
}
private boolean isVMActivityOnHost(Host agent, DateTime suspectTime) throws HACheckerException {
if (agent.getHypervisorType() != Hypervisor.HypervisorType.KVM && agent.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
throw new IllegalStateException(String.format("Calling KVM investigator for non KVM Host of type [%s].", agent.getHypervisorType()));
private boolean hasVMActivityOnHost(Host host, DateTime suspectTime) throws HACheckerException {
if (host.getHypervisorType() != Hypervisor.HypervisorType.KVM && host.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
throw new IllegalStateException(String.format("Calling KVM investigator for non KVM Host of type [%s].", host.getHypervisorType()));
}
boolean activityStatus = true;
HashMap<StoragePool, List<Volume>> poolVolMap = getVolumeUuidOnHost(agent);
for (StoragePool pool : poolVolMap.keySet()) {
activityStatus = verifyActivityOfStorageOnHost(poolVolMap, pool, agent, suspectTime, activityStatus);
HashMap<StoragePool, List<Volume>> poolVolumeMap = getStoragePoolAndVolumeInfoOnHost(host);
for (StoragePool pool : poolVolumeMap.keySet()) {
activityStatus = verifyActivityOfStorageOnHost(poolVolumeMap, pool, host, suspectTime, activityStatus);
if (!activityStatus) {
logger.warn("It seems that the storage pool [{}] does not have activity on {}.", pool, agent);
logger.warn("It seems that the storage pool [{}] does not have activity on {}.", pool, host);
break;
}
}
@ -166,66 +200,64 @@ public class KVMHostActivityChecker extends AdapterBase implements ActivityCheck
return activityStatus;
}
protected boolean verifyActivityOfStorageOnHost(HashMap<StoragePool, List<Volume>> poolVolMap, StoragePool pool, Host agent, DateTime suspectTime, boolean activityStatus) throws HACheckerException, IllegalStateException {
protected boolean verifyActivityOfStorageOnHost(HashMap<StoragePool, List<Volume>> poolVolMap, StoragePool pool, Host host, DateTime suspectTime, boolean activityStatus) throws HACheckerException, IllegalStateException {
List<Volume> volume_list = poolVolMap.get(pool);
final CheckVMActivityOnStoragePoolCommand cmd = new CheckVMActivityOnStoragePoolCommand(agent, pool, volume_list, suspectTime);
final CheckVMActivityOnStoragePoolCommand cmd = new CheckVMActivityOnStoragePoolCommand(host, pool, volume_list, suspectTime);
logger.debug("Checking VM activity for {} on storage pool [{}].", agent.toString(), pool);
logger.debug("Checking VM activity for {} on storage pool [{}].", host.toString(), pool);
try {
Answer answer = storageManager.sendToPool(pool, getNeighbors(agent), cmd);
Answer answer = storageManager.sendToPool(pool, getNeighbors(host), cmd);
if (answer != null) {
activityStatus = !answer.getResult();
logger.debug("{} {} activity on storage pool [{}]", agent.toString(), activityStatus ? "has" : "does not have", pool);
logger.debug("{} {} activity on storage pool [{}]", host.toString(), activityStatus ? "has" : "does not have", pool);
} else {
String message = String.format("Did not get a valid response for VM activity check for %s on storage pool [%s].", agent.toString(), pool);
String message = String.format("Did not get a valid response for VM activity check for %s on storage pool [%s].", host.toString(), pool);
logger.debug(message);
throw new IllegalStateException(message);
}
} catch (StorageUnavailableException e){
String message = String.format("Storage [%s] is unavailable to do the check, probably the %s is not reachable.", pool, agent);
} catch (StorageUnavailableException e) {
String message = String.format("Storage [%s] is unavailable to do the check, probably the %s is not reachable.", pool, host);
logger.warn(message, e);
throw new HACheckerException(message, e);
}
return activityStatus;
}
private HashMap<StoragePool, List<Volume>> getVolumeUuidOnHost(Host agent) {
List<VMInstanceVO> vm_list = vmInstanceDao.listByHostId(agent.getId());
List<VolumeVO> volume_list = new ArrayList<VolumeVO>();
for (VirtualMachine vm : vm_list) {
private HashMap<StoragePool, List<Volume>> getStoragePoolAndVolumeInfoOnHost(Host host) {
List<VMInstanceVO> vmListOnHost = vmInstanceDao.listByHostId(host.getId());
List<VolumeVO> volumeListOnHost = new ArrayList<>();
for (VirtualMachine vm : vmListOnHost) {
logger.debug("Retrieving volumes of VM [{}]...", vm);
List<VolumeVO> vm_volume_list = volumeDao.findByInstance(vm.getId());
volume_list.addAll(vm_volume_list);
List<VolumeVO> volumeListOfVM = volumeDao.findByInstance(vm.getId());
volumeListOnHost.addAll(volumeListOfVM);
}
HashMap<StoragePool, List<Volume>> poolVolMap = new HashMap<StoragePool, List<Volume>>();
for (Volume vol : volume_list) {
StoragePool sp = storagePool.findById(vol.getPoolId());
logger.debug("Retrieving storage pool [{}] of volume [{}]...", sp, vol);
if (!poolVolMap.containsKey(sp)) {
List<Volume> list = new ArrayList<Volume>();
list.add(vol);
HashMap<StoragePool, List<Volume>> poolVolumeMap = new HashMap<>();
for (Volume volume : volumeListOnHost) {
StoragePool pool = storagePoolDao.findById(volume.getPoolId());
logger.debug("Retrieving storage pool [{}] of volume [{}]...", pool, volume);
if (!poolVolumeMap.containsKey(pool)) {
List<Volume> volList = new ArrayList<>();
volList.add(volume);
poolVolMap.put(sp, list);
poolVolumeMap.put(pool, volList);
} else {
poolVolMap.get(sp).add(vol);
poolVolumeMap.get(pool).add(volume);
}
}
return poolVolMap;
return poolVolumeMap;
}
public long[] getNeighbors(Host agent) {
List<Long> neighbors = new ArrayList<Long>();
List<HostVO> cluster_hosts = resourceManager.listHostsInClusterByStatus(agent.getClusterId(), Status.Up);
logger.debug("Retrieving all \"Up\" hosts from cluster [{}]...", clusterDao.findById(agent.getClusterId()));
for (HostVO host : cluster_hosts) {
if (host.getId() == agent.getId() || (host.getHypervisorType() != Hypervisor.HypervisorType.KVM && host.getHypervisorType() != Hypervisor.HypervisorType.LXC)) {
public long[] getNeighbors(Host host) {
List<Long> neighbors = new ArrayList<>();
List<HostVO> clusterHosts = resourceManager.listHostsInClusterByStatus(host.getClusterId(), Status.Up);
logger.debug("Retrieving all \"Up\" hosts from cluster [{}]...", clusterDao.findById(host.getClusterId()));
for (HostVO clusterHost : clusterHosts) {
if (clusterHost.getId() == host.getId() || (clusterHost.getHypervisorType() != Hypervisor.HypervisorType.KVM && clusterHost.getHypervisorType() != Hypervisor.HypervisorType.LXC)) {
continue;
}
neighbors.add(host.getId());
neighbors.add(clusterHost.getId());
}
return ArrayUtils.toPrimitive(neighbors.toArray(new Long[neighbors.size()]));
}
}

View File

@ -3133,7 +3133,7 @@ public class LibvirtComputingResourceTest {
assertNotNull(wrapper);
final Answer answer = wrapper.execute(command, libvirtComputingResourceMock);
assertTrue(answer.getResult());
assertFalse(answer.getResult());
verify(libvirtComputingResourceMock, times(1)).getMonitor();
}

View File

@ -54,13 +54,13 @@ public class SimulatorInvestigator extends AdapterBase implements Investigator {
}
@Override
public Status isAgentAlive(Host agent) {
public Status getHostAgentStatus(Host agent) {
if (agent.getHypervisorType() != HypervisorType.Simulator) {
return null;
}
if (haManager.isHAEligible(agent)) {
return haManager.getHostStatus(agent);
return haManager.getHostStatusFromHAConfig(agent);
}
CheckOnHostCommand cmd = new CheckOnHostCommand(agent);

View File

@ -28,7 +28,7 @@ public class VmwareInvestigator extends AdapterBase implements Investigator {
}
@Override
public Status isAgentAlive(Host agent) {
public Status getHostAgentStatus(Host agent) {
if (agent.getHypervisorType() == HypervisorType.VMware)
return Status.Disconnected;

View File

@ -585,8 +585,8 @@ public class LinstorStorageAdaptor implements StorageAdaptor {
Path propFile = diskPath.getParent().resolve("template.properties");
if (Files.exists(propFile)) {
java.util.Properties templateProps = new java.util.Properties();
try {
templateProps.load(new FileInputStream(propFile.toFile()));
try (FileInputStream in = new FileInputStream(propFile.toFile())) {
templateProps.load(in);
String desc = templateProps.getProperty("description");
if (desc != null && desc.startsWith("SystemVM Template")) {
return true;

View File

@ -228,11 +228,11 @@ public class LinstorStoragePool implements KVMStoragePool {
public String createHeartBeatCommand(HAStoragePool pool, String hostPrivateIp,
boolean hostValidation) {
LOGGER.trace(String.format("Linstor.createHeartBeatCommand: %s, %s, %b", pool.getPoolIp(), hostPrivateIp, hostValidation));
boolean isStorageNodeUp = checkingHeartBeat(pool, null);
boolean isStorageNodeUp = hasHeartBeat(pool, null);
if (!isStorageNodeUp && !hostValidation) {
//restart the host
LOGGER.debug(String.format("The host [%s] will be restarted because the health check failed for the storage pool [%s]", hostPrivateIp, pool.getPool().getType()));
Script cmd = new Script(pool.getPool().getHearthBeatPath(), Duration.millis(HeartBeatUpdateTimeout), LOGGER);
Script cmd = new Script(pool.getPool().getHearthBeatPath(), Duration.millis(HeartBeatUpdateTimeoutInMs), LOGGER);
cmd.add("-c");
cmd.execute();
return "Down";
@ -258,7 +258,7 @@ public class LinstorStoragePool implements KVMStoragePool {
}
@Override
public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) {
public Boolean hasHeartBeat(HAStoragePool pool, HostTO host) {
String hostName;
if (host == null) {
hostName = localNodeName;
@ -274,7 +274,7 @@ public class LinstorStoragePool implements KVMStoragePool {
}
private String executeDrbdSetupStatus(OutputInterpreter.AllLinesParser parser) {
Script sc = new Script("drbdsetup", Duration.millis(HeartBeatUpdateTimeout), LOGGER);
Script sc = new Script("drbdsetup", Duration.millis(HeartBeatUpdateTimeoutInMs), LOGGER);
sc.add("status");
sc.add("--json");
return sc.execute(parser);
@ -329,7 +329,7 @@ public class LinstorStoragePool implements KVMStoragePool {
}
private String executeDrbdEventsNow(OutputInterpreter.AllLinesParser parser) {
Script sc = new Script("drbdsetup", Duration.millis(HeartBeatUpdateTimeout), LOGGER);
Script sc = new Script("drbdsetup", Duration.millis(HeartBeatUpdateTimeoutInMs), LOGGER);
sc.add("events2");
sc.add("--now");
return sc.execute(parser);
@ -369,8 +369,8 @@ public class LinstorStoragePool implements KVMStoragePool {
}
@Override
public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration) {
public Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration) {
LOGGER.trace(String.format("Linstor.vmActivityCheck: %s, %s", pool.getPoolIp(), host.getPrivateNetwork().getIp()));
return checkingHeartBeat(pool, host);
return hasHeartBeat(pool, host);
}
}

View File

@ -198,11 +198,11 @@ public class StorPoolStoragePool implements KVMStoragePool {
@Override
public String createHeartBeatCommand(HAStoragePool primaryStoragePool, String hostPrivateIp, boolean hostValidation) {
boolean isStorageNodeUp = checkingHeartBeat(primaryStoragePool, null);
boolean isStorageNodeUp = hasHeartBeat(primaryStoragePool, null);
if (!isStorageNodeUp && !hostValidation) {
//restart the host
logger.debug(String.format("The host [%s] will be restarted because the health check failed for the storage pool [%s]", hostPrivateIp, primaryStoragePool.getPool().getType()));
Script cmd = new Script(primaryStoragePool.getPool().getHearthBeatPath(), HeartBeatUpdateTimeout, logger);
Script cmd = new Script(primaryStoragePool.getPool().getHearthBeatPath(), HeartBeatUpdateTimeoutInMs, logger);
cmd.add("-c");
cmd.execute();
return "Down";
@ -240,7 +240,7 @@ public class StorPoolStoragePool implements KVMStoragePool {
}
@Override
public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) {
public Boolean hasHeartBeat(HAStoragePool pool, HostTO host) {
boolean isNodeWorking = false;
OutputInterpreter.AllLinesParser parser = new OutputInterpreter.AllLinesParser();
@ -300,8 +300,8 @@ public class StorPoolStoragePool implements KVMStoragePool {
}
@Override
public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUuidListString, String vmActivityCheckPath, long duration) {
return checkingHeartBeat(pool, host);
public Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUuidListString, String vmActivityCheckPath, long duration) {
return hasHeartBeat(pool, host);
}
@Override

View File

@ -75,7 +75,7 @@ fi
#delete VMs on this mountpoint
deleteVMs() {
local mountPoint=$1
vmPids=$(ps aux| grep qemu | grep "$mountPoint" | awk '{print $2}' 2> /dev/null)
vmPids=$(ps aux | grep qemu | grep "$mountPoint" | awk '{print $2}' 2> /dev/null)
if [ $? -gt 0 ]
then
return
@ -93,7 +93,7 @@ deleteVMs() {
}
#checking is there the same nfs server mounted under $MountPoint?
mounts=$(cat /proc/mounts |grep nfs|grep $MountPoint)
mounts=$(cat /proc/mounts | grep nfs | grep $MountPoint)
if [ $? -gt 0 ]
then
# remount it

View File

@ -38,7 +38,7 @@ public class CheckOnAgentInvestigator extends AdapterBase implements Investigato
}
@Override
public Status isAgentAlive(Host agent) {
public Status getHostAgentStatus(Host agent) {
return null;
}

View File

@ -42,6 +42,9 @@ import org.apache.cloudstack.engine.subsystem.api.storage.PrimaryDataStoreDriver
import org.apache.cloudstack.framework.config.ConfigKey;
import org.apache.cloudstack.framework.config.Configurable;
import org.apache.cloudstack.framework.config.dao.ConfigurationDao;
import org.apache.cloudstack.ha.HAConfig;
import org.apache.cloudstack.ha.HAResource;
import org.apache.cloudstack.ha.dao.HAConfigDao;
import org.apache.cloudstack.managed.context.ManagedContext;
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
import org.apache.cloudstack.management.ManagementServerHost;
@ -223,6 +226,8 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
@Inject
ConfigurationDao _configDao;
@Inject
HAConfigDao _haConfigDao;
@Inject
VolumeOrchestrationService volumeMgr;
String _instance;
@ -237,25 +242,53 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
long _timeBetweenCleanups;
String _haTag = null;
protected HighAvailabilityManagerImpl() {
}
private boolean vmHasPendingHAJob(final List<HaWorkVO> pendingHaWorks, final VMInstanceVO vm) {
Optional<HaWorkVO> item = pendingHaWorks.stream()
.filter(h -> h.getInstanceId() == vm.getId())
.reduce((first, second) -> second);
if (item.isPresent() && (item.get().getTimesTried() < _maxRetries ||
!item.get().canScheduleNew(_timeBetweenFailures))) {
logger.debug(String.format("Skipping HA on %s as there is already a running HA job for it", vm));
logger.debug("Skipping HA on {} as there is already a running HA job for it", vm);
return true;
}
return false;
}
protected HighAvailabilityManagerImpl() {
private boolean isHostHAInspectionInProgress(long hostId) {
final HAConfig haConfig = _haConfigDao.findHAResource(hostId, HAResource.ResourceType.Host);
if (haConfig == null || !haConfig.isEnabled()) {
return false;
}
HAConfig.HAState state = haConfig.getState();
logger.debug("Checking Host HA inspection is in progress or not for the host {} from HAConfig, HA state is {}", hostId, state);
if (state == HAConfig.HAState.Suspect || state == HAConfig.HAState.Checking) {
return true;
}
if (state == HAConfig.HAState.Recovered || state == HAConfig.HAState.Available) {
// If the host HA state is Recovered, it indicates that the host has restarted successfully.
// If the host HA state is Available, it means the host has restarted successfully and the recovery waiting period has completed.
// In both states, the agent can connect as soon as the host is ready (and can move to Suspect -> Checking HA state if the agent connection fails again before Fencing).
final HostVO host = _hostDao.findById(hostId);
if (host != null && host.getStatus() != Status.Up) {
logger.debug("{} is in {} status and HA state is {}, considering Host HA inspection is still in progress" +
" until we are sure the host is ready after a recovery wait period and agent is connected/Up", host, host.getStatus(), state);
return true;
}
}
return false;
}
@Override
public Status investigate(final long hostId) {
final HostVO host = _hostDao.findById(hostId);
if (host == null) {
logger.warn("Host with id {} is removed or doesn't exists.", hostId);
return Status.Alert;
}
@ -270,7 +303,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
Status hostState = null;
for (Investigator investigator : investigators) {
hostState = investigator.isAgentAlive(host);
hostState = investigator.getHostAgentStatus(host);
if (hostState != null) {
if (logger.isDebugEnabled()) {
logger.debug("{} was able to determine host {} is in {}", investigator.getName(), host, hostState.toString());
@ -278,7 +311,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
return hostState;
}
if (logger.isDebugEnabled()) {
logger.debug(investigator.getName() + " unable to determine the state of the host. Moving on.");
logger.debug("{} unable to determine the state of the host. Moving on.", investigator.getName());
}
}
@ -570,9 +603,9 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
}
protected Long restart(final HaWorkVO work) {
logger.debug("RESTART with HAWORK");
logger.debug("RESTART with HA WORK");
List<HaWorkVO> items = _haDao.listFutureHaWorkForVm(work.getInstanceId(), work.getId());
if (items.size() > 0) {
if (!items.isEmpty()) {
StringBuilder str = new StringBuilder("Cancelling this work item because newer ones have been scheduled. Work Ids = [");
for (HaWorkVO item : items) {
str.append(item.getId()).append(", ");
@ -583,7 +616,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
}
items = _haDao.listRunningHaWorkForVm(work.getInstanceId());
if (items.size() > 0) {
if (!items.isEmpty()) {
StringBuilder str = new StringBuilder("Waiting because there's HA work being executed on an item currently. Work Ids =[");
for (HaWorkVO item : items) {
str.append(item.getId()).append(", ");
@ -597,21 +630,21 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
VirtualMachine vm = _itMgr.findById(work.getInstanceId());
if (vm == null) {
logger.info("Unable to find vm: " + vmId);
logger.info("Unable to find vm: {}", vmId);
return null;
}
if (checkAndCancelWorkIfNeeded(work)) {
return null;
}
logger.info("HA on " + vm);
logger.info("HA on {}", vm);
if (vm.getState() != work.getPreviousState() || vm.getUpdated() != work.getUpdateTime()) {
logger.info("VM " + vm + " has been changed. Current State = " + vm.getState() + " Previous State = " + work.getPreviousState() + " last updated = " +
vm.getUpdated() + " previous updated = " + work.getUpdateTime());
logger.info("VM {} has been changed. Current State = {} Previous State = {} last updated = {} previous updated = {}",
vm, vm.getState(), work.getPreviousState(), vm.getUpdated(), work.getUpdateTime());
return null;
}
if (vm.getHostId() != null && !vm.getHostId().equals(work.getHostId())) {
logger.info("VM " + vm + " has been changed. Current host id = " + vm.getHostId() + " Previous host id = " + work.getHostId());
logger.info("VM {} has been changed. Current host id = {} Previous host id = {}", vm, vm.getHostId(), work.getHostId());
return null;
}
@ -628,10 +661,13 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
boolean isHostRemoved = false;
if (host == null) {
host = _hostDao.findByIdIncludingRemoved(work.getHostId());
if (host != null) {
logger.debug("VM {} is now no longer on host {} as the host is removed", vm, host);
isHostRemoved = true;
if (host == null) {
logger.debug("VM {} is now no longer on host {}, the host doesn't exist", vm, work.getHostId());
return null;
}
logger.debug("VM {} is now no longer on host {} as the host is removed", vm, host);
isHostRemoved = true;
}
DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId());
@ -652,40 +688,39 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
try
{
alive = investigator.isVmAlive(vm, host);
logger.info(investigator.getName() + " found " + vm + " to be alive? " + alive);
logger.info("{} found {} to be alive? {}", investigator.getName(), vm, alive);
break;
} catch (UnknownVM e) {
logger.info(investigator.getName() + " could not find " + vm);
logger.info("{} could not find {}", investigator.getName(), vm);
}
}
boolean fenced = false;
if (alive == null) {
logger.debug("Fencing off VM that we don't know the state of");
logger.debug("Fencing off VM {} that we don't know the state of", vm);
for (FenceBuilder fb : fenceBuilders) {
Boolean result = fb.fenceOff(vm, host);
logger.info("Fencer " + fb.getName() + " returned " + result);
logger.info("Fencer {} returned {}", fb.getName(), result);
if (result != null && result) {
fenced = true;
break;
}
}
} else if (!alive) {
fenced = true;
} else {
logger.debug("VM {} is found to be alive by {}", vm, investigator.getName());
logger.debug("VM {} is found to be alive by {} on host {}", vm, investigator.getName(), host);
if (host.getStatus() == Status.Up) {
logger.info(vm + " is alive and host is up. No need to restart it.");
logger.info("{} is alive and host {} is up. No need to restart it.", vm, host);
return null;
} else {
logger.debug("Rescheduling because the host is not up but the vm is alive");
logger.debug("Rescheduling because the host {} is not up but the vm {} is alive", host, vm);
return (System.currentTimeMillis() >> 10) + _investigateRetryInterval;
}
}
if (!fenced) {
logger.debug("We were unable to fence off the VM " + vm);
logger.debug("We were unable to fence off the VM {}", vm);
_alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() +
" which was running on host " + hostDesc, "Insufficient capacity to restart VM, name: " + vm.getHostName() + ", id: " + vmId +
" which was running on host " + hostDesc);
@ -728,15 +763,15 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
if (!ForceHA.value() && !vm.isHaEnabled()) {
if (logger.isDebugEnabled()) {
logger.debug("VM is not HA enabled so we're done.");
logger.debug("VM {} is not HA enabled so we're done.", vm);
}
return null; // VM doesn't require HA
}
if ((host == null || host.getRemoved() != null || host.getState() != Status.Up)
if ((host.getRemoved() != null || host.getState() != Status.Up)
&& !volumeMgr.canVmRestartOnAnotherServer(vm.getId())) {
if (logger.isDebugEnabled()) {
logger.debug("VM can not restart on another server.");
logger.debug("VM {} can not restart on another server.", vm);
}
return null;
}
@ -777,13 +812,13 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
if (started != null && started.getState() == VirtualMachine.State.Running) {
String message = String.format("HA starting VM: %s (%s)", started.getHostName(), started.getInstanceName());
HostVO hostVmHasStarted = _hostDao.findById(started.getHostId());
logger.info(String.format("HA is now restarting %s on %s", started, hostVmHasStarted));
logger.info("HA is now restarting {} on {}", started, hostVmHasStarted);
_alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodIdToDeployIn(), message, message);
return null;
}
if (logger.isDebugEnabled()) {
logger.debug("Rescheduling VM " + vm.toString() + " to try again in " + _restartRetryInterval);
logger.debug("Rescheduling VM {} to try again in {}", vm.toString(), _restartRetryInterval);
}
} catch (final InsufficientCapacityException e) {
logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage());
@ -815,6 +850,9 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
if (!CancellableWorkReasonTypes.contains(work.getReasonType())) {
return false;
}
if (isHostHAInspectionInProgress(work.getHostId())) {
return false;
}
Status hostStatus = investigate(work.getHostId());
if (!Status.Up.equals(hostStatus)) {
return false;
@ -825,13 +863,14 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
}
public Long migrate(final HaWorkVO work) {
logger.debug("MIGRATE with HA WORK");
long vmId = work.getInstanceId();
long srcHostId = work.getHostId();
HostVO srcHost = _hostDao.findById(srcHostId);
VMInstanceVO vm = _instanceDao.findById(vmId);
if (vm == null) {
logger.info("Unable to find vm: " + vmId + ", skipping migrate.");
logger.info("Unable to find vm: {}, skipping migrate.", vmId);
return null;
}
if (checkAndCancelWorkIfNeeded(work)) {
@ -840,11 +879,11 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
logger.info("Migration attempt: for {} from {}. Starting attempt: {}/{} times.", vm, srcHost, 1 + work.getTimesTried(), _maxRetries);
if (VirtualMachine.State.Stopped.equals(vm.getState())) {
logger.info(String.format("vm %s is Stopped, skipping migrate.", vm));
logger.info("vm {} is Stopped, skipping migrate.", vm);
return null;
}
if (VirtualMachine.State.Running.equals(vm.getState()) && srcHostId != vm.getHostId()) {
logger.info(String.format("VM %s is running on a different host %s, skipping migration", vm, vm.getHostId()));
logger.info("VM {} is running on a different host {}, skipping migration", vm, vm.getHostId());
return null;
}
@ -879,7 +918,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Destroy, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated(), reasonType);
_haDao.persist(work);
if (logger.isDebugEnabled()) {
logger.debug("Scheduled " + work.toString());
logger.debug("{}}", work.toString());
}
wakeupWorkers();
return true;
@ -897,7 +936,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
}
private void destroyVM(VirtualMachine vm, boolean expunge) throws OperationTimedoutException, AgentUnavailableException {
logger.info("Destroying " + vm.toString());
logger.info("Destroying {}", vm.toString());
if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) {
consoleProxyManager.destroyProxy(vm.getId());
} else if (VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())) {
@ -908,9 +947,10 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
}
protected Long destroyVM(final HaWorkVO work) {
logger.debug("DESTROY with HA WORK");
final VirtualMachine vm = _itMgr.findById(work.getInstanceId());
if (vm == null) {
logger.info("No longer can find VM " + work.getInstanceId() + ". Throwing away " + work);
logger.info("No longer can find VM {}. Throwing away {}", work.getInstanceId(), work);
return null;
}
if (checkAndCancelWorkIfNeeded(work)) {
@ -944,20 +984,21 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
}
protected Long stopVM(final HaWorkVO work) throws ConcurrentOperationException {
logger.debug("STOP with HA WORK");
VirtualMachine vm = _itMgr.findById(work.getInstanceId());
if (vm == null) {
logger.info("No longer can find VM " + work.getInstanceId() + ". Throwing away " + work);
logger.info("No longer can find VM {}. Throwing away {}", work.getInstanceId(), work);
work.setStep(Step.Done);
return null;
}
if (checkAndCancelWorkIfNeeded(work)) {
return null;
}
logger.info("Stopping " + vm);
logger.info("Stopping {}", vm);
try {
if (work.getWorkType() == WorkType.Stop) {
_itMgr.advanceStop(vm.getUuid(), false);
logger.info("Successfully stopped " + vm);
logger.info("Successfully stopped {}", vm);
return null;
} else if (work.getWorkType() == WorkType.CheckStop) {
if ((vm.getState() != work.getPreviousState()) || vm.getUpdated() != work.getUpdateTime() || vm.getHostId() == null ||
@ -969,7 +1010,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
}
_itMgr.advanceStop(vm.getUuid(), false);
logger.info("Stop for " + vm + " was successful");
logger.info("Stop for {} was successful", vm);
return null;
} else if (work.getWorkType() == WorkType.ForceStop) {
if ((vm.getState() != work.getPreviousState()) || vm.getUpdated() != work.getUpdateTime() || vm.getHostId() == null ||
@ -981,13 +1022,13 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
}
_itMgr.advanceStop(vm.getUuid(), true);
logger.info("Stop for " + vm + " was successful");
logger.info("Stop for {} was successful", vm);
return null;
} else {
assert false : "Who decided there's other steps but didn't modify the guy who does the work?";
}
} catch (final ResourceUnavailableException e) {
logger.debug("Agnet is not available" + e.getMessage());
logger.debug("Agent is not available" + e.getMessage());
} catch (OperationTimedoutException e) {
logger.debug("operation timed out: " + e.getMessage());
}
@ -1043,7 +1084,8 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
try {
if (vm != null && !VmHaEnabled.valueIn(vm.getDataCenterId())) {
if (logger.isDebugEnabled()) {
logger.debug(String.format("VM high availability manager is disabled, rescheduling the HA work %s, for the VM %s (id) to retry later in case VM high availability manager is enabled on retry attempt", work, vm.getName(), vm.getId()));
logger.debug("VM high availability manager is disabled, rescheduling the HA work {} for the VM {} ({}) " +
"to retry later in case VM high availability manager is enabled on retry attempt", work, vm.getName(), vm.getId());
}
long nextTime = getRescheduleTime(wt);
rescheduleWork(work, nextTime);
@ -1065,13 +1107,13 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
}
if (nextTime == null) {
logger.info("Completed work " + work + ". Took " + (work.getTimesTried() + 1) + "/" + _maxRetries + " attempts.");
logger.info("Completed work {}. Took {}/{} attempts.", work, work.getTimesTried() + 1, _maxRetries);
work.setStep(Step.Done);
} else {
rescheduleWork(work, nextTime.longValue());
}
} catch (Exception e) {
logger.warn("Encountered unhandled exception during HA process, reschedule work", e);
logger.warn("Encountered unhandled exception during HA process, reschedule work {}", work, e);
long nextTime = getRescheduleTime(wt);
rescheduleWork(work, nextTime);
@ -1085,11 +1127,11 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
} finally {
if (!Step.Done.equals(work.getStep())) {
if (work.getTimesTried() >= _maxRetries) {
logger.warn("Giving up, retried max " + work.getTimesTried() + "/" + _maxRetries + " times for work: " + work);
logger.warn("Giving up, retried max {}/{} times for work: {}", work.getTimesTried(), _maxRetries, work);
work.setStep(Step.Done);
} else {
logger.warn("Rescheduling work " + work + " to try again at " + new Date(work.getTimeToTry() << 10) +
". Finished attempt " + work.getTimesTried() + "/" + _maxRetries + " times.");
logger.warn("Rescheduling work {} to try again at {}. Finished attempt {}/{} times.",
work, new Date(work.getTimeToTry() << 10), work.getTimesTried(), _maxRetries);
}
}
_haDao.update(work.getId(), work);

View File

@ -74,7 +74,7 @@ public class KVMFencer extends AdapterBase implements FenceBuilder {
@Override
public Boolean fenceOff(VirtualMachine vm, Host host) {
if (host.getHypervisorType() != HypervisorType.KVM && host.getHypervisorType() != HypervisorType.LXC) {
logger.warn("Don't know how to fence non kvm hosts " + host.getHypervisorType());
logger.warn("Don't know how to fence non kvm hosts {}", host.getHypervisorType());
return null;
}
@ -97,11 +97,8 @@ public class KVMFencer extends AdapterBase implements FenceBuilder {
FenceAnswer answer;
try {
answer = (FenceAnswer)_agentMgr.send(h.getId(), fence);
} catch (AgentUnavailableException e) {
logger.info("Moving on to the next host because " + h.toString() + " is unavailable", e);
continue;
} catch (OperationTimedoutException e) {
logger.info("Moving on to the next host because " + h.toString() + " is unavailable", e);
} catch (AgentUnavailableException | OperationTimedoutException e) {
logger.info("Moving on to the next host because {} is unavailable", h.toString(), e);
continue;
}
if (answer != null && answer.getResult()) {
@ -115,7 +112,7 @@ public class KVMFencer extends AdapterBase implements FenceBuilder {
"Fencing off host " + host.getId() + " did not succeed after asking " + i + " hosts. " +
"Check Agent logs for more information.");
logger.error("Unable to fence off " + vm.toString() + " on " + host.toString());
logger.error("Unable to fence off {} on {}", vm.toString(), host.toString());
return false;
}

View File

@ -104,7 +104,7 @@ public class ManagementIPSystemVMInvestigator extends AbstractInvestigatorImpl {
}
@Override
public Status isAgentAlive(Host agent) {
public Status getHostAgentStatus(Host agent) {
return null;
}

View File

@ -103,7 +103,7 @@ public class UserVmDomRInvestigator extends AbstractInvestigatorImpl {
}
@Override
public Status isAgentAlive(Host agent) {
public Status getHostAgentStatus(Host agent) {
if (logger.isDebugEnabled()) {
logger.debug("checking if agent ({}) is alive", agent);
}

View File

@ -46,7 +46,7 @@ public class XenServerInvestigator extends AdapterBase implements Investigator {
}
@Override
public Status isAgentAlive(Host agent) {
public Status getHostAgentStatus(Host agent) {
if (agent.getHypervisorType() != HypervisorType.XenServer) {
return null;
}
@ -74,7 +74,7 @@ public class XenServerInvestigator extends AdapterBase implements Investigator {
@Override
public boolean isVmAlive(VirtualMachine vm, Host host) throws UnknownVM {
Status status = isAgentAlive(host);
Status status = getHostAgentStatus(host);
if (status == null) {
throw new UnknownVM();
}

View File

@ -3036,7 +3036,7 @@ public class ManagementServerImpl extends MutualExclusiveIdsManagerBase implemen
final String hypervisor = cmd.getHypervisor();
final String hypervisorVersion = cmd.getHypervisorVersion();
//throw exception if hypervisor name is not passed, but version is
//throw exception if hypervisor name is not passed, but a version is
if (hypervisorVersion != null && (hypervisor == null || hypervisor.isEmpty())) {
throw new InvalidParameterValueException("Hypervisor version parameter cannot be used without specifying a hypervisor : XenServer, KVM or VMware");
}
@ -3054,7 +3054,7 @@ public class ManagementServerImpl extends MutualExclusiveIdsManagerBase implemen
final SearchCriteria<GuestOSHypervisorVO> sc = sb.create();
if (id != null) {
sc.setParameters("id", SearchCriteria.Op.EQ, id);
sc.setParameters("id", id);
}
if (osTypeId != null) {

View File

@ -765,6 +765,13 @@ public class VMSnapshotManagerImpl extends MutualExclusiveIdsManagerBase impleme
"In order to revert to a Snapshot without memory you need to first stop the Instance.");
}
if (userVm.getState() == VirtualMachine.State.Running && vmSnapshotVo.getType() == VMSnapshot.Type.Disk) {
throw new InvalidParameterValueException(
"Reverting to the Instance Snapshot is not allowed for running Instances as this would result in an Instance state change. " +
"For running Instances only Snapshots with memory can be reverted. " +
"In order to revert to a Snapshot without memory you need to first stop the Instance.");
}
if (userVm.getState() == VirtualMachine.State.Stopped && vmSnapshotVo.getType() == VMSnapshot.Type.DiskAndMemory) {
throw new InvalidParameterValueException(
"Reverting to the Instance Snapshot is not allowed for stopped Instances when the Snapshot contains memory as this would result in an Instance state change. " +

View File

@ -67,11 +67,16 @@ public interface HAManager extends HAConfigManager {
"The number of pending fence operations per management server. This setting determines the size of the size of the FENCE queue.", true);
boolean transitionHAState(final HAConfig.Event event, final HAConfig haConfig);
HAProvider getHAProvider(final String name);
HAResourceCounter getHACounter(final Long resourceId, final HAResource.ResourceType resourceType);
void purgeHACounter(final Long resourceId, final HAResource.ResourceType resourceType);
boolean isHAEligible(final HAResource resource);
Boolean isVMAliveOnHost(final Host host) throws Investigator.UnknownVM;
Status getHostStatus(final Host host);
Status getHostStatusFromHAConfig(final Host host);
}

View File

@ -139,9 +139,7 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
public synchronized void purgeHACounter(final Long resourceId, final HAResource.ResourceType resourceType) {
final String key = resourceCounterKey(resourceId, resourceType);
if (haCounterMap.containsKey(key)) {
haCounterMap.remove(key);
}
haCounterMap.remove(key);
}
public boolean transitionHAState(final HAConfig.Event event, final HAConfig haConfig) {
@ -248,6 +246,7 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
}
private boolean isHAEnabledForCluster(final HAResource resource) {
// HA is enabled by default when cluster details doesn't exist
if (resource == null || resource.getClusterId() == null) {
return true;
}
@ -259,14 +258,10 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
if (resource == null || resource.getId() < 1L) {
return false;
}
HAResource.ResourceType resourceType = null;
if (resource instanceof Host) {
resourceType = HAResource.ResourceType.Host;
}
if (resourceType == null) {
if (!(resource instanceof Host)) {
return false;
}
final HAConfig haConfig = haConfigDao.findHAResource(resource.getId(), resourceType);
final HAConfig haConfig = haConfigDao.findHAResource(resource.getId(), HAResource.ResourceType.Host);
return haConfig != null && haConfig.isEnabled()
&& haConfig.getState() != HAConfig.HAState.Disabled
&& haConfig.getState() != HAConfig.HAState.Ineligible;
@ -317,19 +312,23 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
throw new Investigator.UnknownVM();
}
public Status getHostStatus(final Host host) {
public Status getHostStatusFromHAConfig(final Host host) {
final HAConfig haConfig = haConfigDao.findHAResource(host.getId(), HAResource.ResourceType.Host);
if (haConfig != null) {
if (haConfig.getState() == HAConfig.HAState.Fenced) {
logger.debug("HA: Agent [{}] is available/suspect/checking Up.", host);
return Status.Down;
} else if (haConfig.getState() == HAConfig.HAState.Degraded || haConfig.getState() == HAConfig.HAState.Recovering || haConfig.getState() == HAConfig.HAState.Fencing) {
logger.debug("HA: Agent [{}] is disconnected. State: {}, {}.", host, haConfig.getState(), haConfig.getState().getDescription());
return Status.Disconnected;
}
return Status.Up;
if (haConfig == null) {
logger.warn("HA: Agent [{}] config is not available.", host);
return Status.Unknown;
}
return Status.Unknown;
if (haConfig.getState() == HAConfig.HAState.Fenced) {
logger.debug("HA: Agent [{}] is fenced.", host);
return Status.Down;
}
if (haConfig.getState() == HAConfig.HAState.Degraded || haConfig.getState() == HAConfig.HAState.Recovering || haConfig.getState() == HAConfig.HAState.Fencing) {
logger.debug("HA: Agent [{}] is disconnected. State: {}, {}.", host, haConfig.getState(), haConfig.getState().getDescription());
return Status.Disconnected;
}
logger.debug("HA: Agent [{}] is considered Up (HA state can be Available/Suspect/Checking/Recovered). State: {}, {}.", host, haConfig.getState(), haConfig.getState().getDescription());
return Status.Up;
}
//////////////////////////////////////////////////////
@ -511,9 +510,14 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
// Attempt recovery
if (newState == HAConfig.HAState.Recovering) {
if (counter.getRecoveryCounter() >= (Long) (haProvider.getConfigValue(HAProviderConfig.MaxRecoveryAttempts, resource))) {
long recoveryCounter = counter.getRecoveryCounter();
Long maxRecoveryAttempts = (Long) (haProvider.getConfigValue(HAProviderConfig.MaxRecoveryAttempts, resource));
if (recoveryCounter >= maxRecoveryAttempts) {
logger.debug("Recovery attempts have reached the configured limit: {} for the resource [{}].", maxRecoveryAttempts, resource);
return false;
}
logger.debug("Recovery attempt #{} for the resource [{}]. Max recovery attempts configured is {}.", recoveryCounter + 1, resource, maxRecoveryAttempts);
final RecoveryTask task = ComponentContext.inject(new RecoveryTask(resource, haProvider, haConfig,
HAProviderConfig.RecoveryTimeout, recoveryExecutor));
final Future<Boolean> recoveryFuture = recoveryExecutor.submit(task);
@ -536,20 +540,20 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
return false;
}
logger.debug(String.format("HA state pre-transition:: new state=[%s], old state=[%s], for resource id=[%s], status=[%s], ha config state=[%s]." , newState, oldState, haConfig.getResourceId(), status, haConfig.getState()));
logger.debug("HA state pre-transition:: new state=[{}], old state=[{}], for resource id=[{}], status=[{}], ha config state=[{}].", newState, oldState, haConfig.getResourceId(), status, haConfig.getState());
if (status && haConfig.getState() != newState) {
logger.warn(String.format("HA state pre-transition:: HA state is not equal to transition state, HA state=[%s], new state=[%s].", haConfig.getState(), newState));
logger.warn("HA state pre-transition:: HA state is not equal to transition state, HA state=[{}], new state=[{}].", haConfig.getState(), newState);
}
return processHAStateChange(haConfig, newState, status);
}
@Override
public boolean postStateTransitionEvent(final StateMachine2.Transition<HAConfig.HAState, HAConfig.Event> transition, final HAConfig haConfig, final boolean status, final Object opaque) {
logger.debug(String.format("HA state post-transition:: new state=[%s], old state=[%s], for resource id=[%s], status=[%s], ha config state=[%s].", transition.getToState(), transition.getCurrentState(), haConfig.getResourceId(), status, haConfig.getState()));
logger.debug("HA state post-transition:: new state=[{}], old state=[{}], for resource id=[{}], status=[{}], ha config state=[{}].", transition.getToState(), transition.getCurrentState(), haConfig.getResourceId(), status, haConfig.getState());
if (status && haConfig.getState() != transition.getToState()) {
logger.warn(String.format("HA state post-transition:: HA state is not equal to transition state, HA state=[%s], new state=[%s].", haConfig.getState(), transition.getToState()));
logger.warn("HA state post-transition:: HA state is not equal to transition state, HA state=[{}], new state=[{}].", haConfig.getState(), transition.getToState());
}
return processHAStateChange(haConfig, transition.getToState(), status);
}
@ -645,7 +649,7 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
try {
logger.debug("HA health check task is running...");
final List<HAConfig> haConfigList = new ArrayList<HAConfig>(haConfigDao.listAll());
final List<HAConfig> haConfigList = new ArrayList<>(haConfigDao.listAll());
for (final HAConfig haConfig : haConfigList) {
currentHaConfig = haConfig;
@ -676,8 +680,8 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
HAProviderConfig.HealthCheckTimeout, healthCheckExecutor));
healthCheckExecutor.submit(task);
break;
default:
break;
default:
break;
}
final HAResourceCounter counter = getHACounter(haConfig.getResourceId(), haConfig.getResourceType());
@ -695,16 +699,22 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
}
if (haConfig.getState() == HAConfig.HAState.Recovering) {
if (counter.getRecoveryCounter() >= (Long) (haProvider.getConfigValue(HAProviderConfig.MaxRecoveryAttempts, resource))) {
long recoveryCounter = counter.getRecoveryCounter();
Long maxRecoveryAttempts = (Long) (haProvider.getConfigValue(HAProviderConfig.MaxRecoveryAttempts, resource));
if (recoveryCounter >= maxRecoveryAttempts) {
logger.debug("Recovery attempts have reached the max limit: {} for the resource [{}].", maxRecoveryAttempts, resource);
transitionHAState(HAConfig.Event.RecoveryOperationThresholdExceeded, haConfig);
} else {
logger.debug("Retry recovery for the resource [{}]. Max recovery attempts configured is {}.", resource, maxRecoveryAttempts);
transitionHAState(HAConfig.Event.RetryRecovery, haConfig);
}
}
if (haConfig.getState() == HAConfig.HAState.Recovered) {
counter.markRecoveryStarted();
if (counter.canExitRecovery((Long)(haProvider.getConfigValue(HAProviderConfig.RecoveryWaitTimeout, resource)))) {
Long recoveryWaitTimeout = (Long)(haProvider.getConfigValue(HAProviderConfig.RecoveryWaitTimeout, resource));
logger.debug("Recovery started for the resource [{}], wait period configured to become Available is {} secs", resource, recoveryWaitTimeout);
if (counter.canExitRecovery(recoveryWaitTimeout)) {
if (transitionHAState(HAConfig.Event.RecoveryWaitPeriodTimeout, haConfig)) {
counter.markRecoveryCompleted();
}
@ -717,7 +727,7 @@ public final class HAManagerImpl extends ManagerBase implements HAManager, Clust
}
} catch (Throwable t) {
if (currentHaConfig != null) {
logger.error(String.format("Error trying to perform health checks in HA manager [%s].", currentHaConfig.getHaProvider()), t);
logger.error("Error trying to perform health checks in HA manager [{}].", currentHaConfig.getHaProvider(), t);
} else {
logger.error("Error trying to perform health checks in HA manager.", t);
}

View File

@ -36,6 +36,10 @@ public final class HAResourceCounter {
return activityCheckCounter.get();
}
public long getActivityCheckFailureCounter() {
return activityCheckFailureCounter.get();
}
public long getRecoveryCounter() {
return recoveryOperationCounter.get();
}
@ -66,7 +70,7 @@ public final class HAResourceCounter {
firstHealthCheckFailureTimestamp = null;
}
public boolean hasActivityThresholdExceeded(final double failureRatio) {
public boolean hasActivityFailureThresholdExceeded(final double failureRatio) {
return activityCheckFailureCounter.get() > (activityCheckCounter.get() * failureRatio);
}

View File

@ -62,6 +62,8 @@ public class ActivityCheckTask extends BaseHATask {
return;
}
long activityCounter = counter.getActivityCheckCounter();
logger.debug("Activity check #{}, result: {} for the resource {}. Max activity checks configured is {}", activityCounter + 1, result, getResource(), maxActivityChecks);
counter.incrActivityCounter(!result);
if (counter.getActivityCheckCounter() < maxActivityChecks) {
@ -69,7 +71,9 @@ public class ActivityCheckTask extends BaseHATask {
return;
}
if (counter.hasActivityThresholdExceeded(activityCheckFailureRatio)) {
long activityCheckFailureCount = counter.getActivityCheckFailureCounter();
logger.debug("{} activity checks failed out of {} checks performed for the resource {}. Failure threshold configured is {}", activityCheckFailureCount, maxActivityChecks, getResource(), activityCheckFailureRatio);
if (counter.hasActivityFailureThresholdExceeded(activityCheckFailureRatio)) {
haManager.transitionHAState(HAConfig.Event.ActivityCheckFailureOverThresholdRatio, haConfig);
} else {
if (haManager.transitionHAState(HAConfig.Event.ActivityCheckFailureUnderThresholdRatio, haConfig)) {

View File

@ -97,7 +97,7 @@ public abstract class BaseHATask implements Callable<Boolean> {
result = future.get(timeout, TimeUnit.SECONDS);
}
} catch (InterruptedException | ExecutionException e) {
logger.warn("Exception occurred while running " + getTaskType() + " on a resource: " + e.getMessage(), e.getCause());
logger.warn("Exception occurred while running {} on a resource: {}", getTaskType(), e.getMessage(), e.getCause());
throwable = e.getCause();
} catch (TimeoutException e) {
logger.trace("{} operation timed out for resource: {}", getTaskType(), resource);

View File

@ -35,6 +35,7 @@ import org.apache.cloudstack.engine.orchestration.service.VolumeOrchestrationSer
import org.apache.cloudstack.engine.subsystem.api.storage.DataStoreProviderManager;
import org.apache.cloudstack.framework.config.ConfigKey;
import org.apache.cloudstack.framework.config.dao.ConfigurationDao;
import org.apache.cloudstack.ha.dao.HAConfigDao;
import org.apache.cloudstack.managed.context.ManagedContext;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@ -118,6 +119,8 @@ public class HighAvailabilityManagerImplTest {
@Mock
ConfigurationDao _configDao;
@Mock
HAConfigDao _haConfigDao;
@Mock
VolumeOrchestrationService volumeMgr;
@Mock
ConsoleProxyManager consoleProxyManager;
@ -362,7 +365,7 @@ public class HighAvailabilityManagerImplTest {
investigators.add(investigator);
highAvailabilityManager.setInvestigators(investigators);
// Mock isAgentAlive to return host status as Down
Mockito.when(investigator.isAgentAlive(hostVO)).thenReturn(Status.Down);
Mockito.when(investigator.getHostAgentStatus(hostVO)).thenReturn(Status.Down);
ConfigKey<Boolean> haEnabled = Mockito.mock(ConfigKey.class);
highAvailabilityManager.VmHaEnabled = haEnabled;

View File

@ -169,10 +169,10 @@ import com.cloud.vm.ImportVMTaskVO;
import com.cloud.vm.NicProfile;
import com.cloud.vm.UserVmManager;
import com.cloud.vm.UserVmVO;
import com.cloud.vm.VmDetailConstants;
import com.cloud.vm.VMInstanceDetailVO;
import com.cloud.vm.VMInstanceVO;
import com.cloud.vm.VirtualMachine;
import com.cloud.vm.VmDetailConstants;
import com.cloud.vm.dao.NicDao;
import com.cloud.vm.dao.UserVmDao;
import com.cloud.vm.dao.VMInstanceDao;

View File

@ -231,21 +231,19 @@ public class RedfishClient {
}
protected HttpResponse retryHttpRequest(String url, HttpRequestBase httpReq, HttpClient client) {
logger.warn(String.format("Failed to execute HTTP %s request [URL: %s]. Executing the request again.", httpReq.getMethod(), url));
logger.warn("Failed to execute HTTP {} request [URL: {}]. Executing the request again.", httpReq.getMethod(), url);
HttpResponse response = null;
for (int attempt = 1; attempt < redfishRequestMaxRetries + 1; attempt++) {
try {
TimeUnit.SECONDS.sleep(WAIT_FOR_REQUEST_RETRY);
logger.debug(String.format("HTTP %s request retry attempt %d/%d [URL: %s].", httpReq.getMethod(), attempt, redfishRequestMaxRetries, url));
logger.debug("HTTP {} request retry attempt {}/{} [URL: {}].", httpReq.getMethod(), attempt, redfishRequestMaxRetries, url);
response = client.execute(httpReq);
break;
} catch (IOException | InterruptedException e) {
if (attempt == redfishRequestMaxRetries) {
throw new RedfishException(String.format("Failed to execute HTTP %s request retry attempt %d/%d [URL: %s] due to exception %s", httpReq.getMethod(), attempt, redfishRequestMaxRetries,url, e));
} else {
logger.warn(
String.format("Failed to execute HTTP %s request retry attempt %d/%d [URL: %s] due to exception %s", httpReq.getMethod(), attempt, redfishRequestMaxRetries,
url, e));
logger.warn("Failed to execute HTTP {} request retry attempt {}/{} [URL: {}] due to exception {}", httpReq.getMethod(), attempt, redfishRequestMaxRetries, url, e);
}
}
}
@ -312,7 +310,7 @@ public class RedfishClient {
throw new RedfishException(String.format("Failed to execute System power command for host by performing '%s' request on URL '%s' and host address '%s'. The expected HTTP status code is '%s' but it got '%s'.",
HttpPost.METHOD_NAME, url, hostAddress, EXPECTED_HTTP_STATUS, statusCode));
}
logger.debug(String.format("Sending ComputerSystem.Reset Command '%s' to host '%s' with request '%s %s'", resetCommand, hostAddress, HttpPost.METHOD_NAME, url));
logger.debug("Sending ComputerSystem.Reset Command '{}' to host '{}' with request '{} {}'", resetCommand, hostAddress, HttpPost.METHOD_NAME, url);
}
/**
@ -330,7 +328,7 @@ public class RedfishClient {
String systemId = processGetSystemIdResponse(response);
logger.debug(String.format("Retrieved System ID '%s' with request '%s: %s'", systemId, HttpGet.METHOD_NAME, url));
logger.debug("Retrieved System ID '{}' with request '{}: {}'", systemId, HttpGet.METHOD_NAME, url);
return systemId;
}
@ -384,7 +382,7 @@ public class RedfishClient {
}
RedfishPowerState powerState = processGetSystemRequestResponse(response);
logger.debug(String.format("Retrieved System power state '%s' with request '%s: %s'", powerState, HttpGet.METHOD_NAME, url));
logger.debug("Retrieved System power state '{}' with request '{}: {}'", powerState, HttpGet.METHOD_NAME, url);
return powerState;
}