mirror of https://github.com/apache/cloudstack.git
server: prevent duplicate HA works and alerts (#10624)
Signed-off-by: Abhishek Kumar <abhishek.mrt22@gmail.com>
This commit is contained in:
parent
1f8442eb69
commit
919c9797cc
|
|
@ -38,9 +38,6 @@ import java.util.concurrent.locks.ReentrantLock;
|
|||
import javax.inject.Inject;
|
||||
import javax.naming.ConfigurationException;
|
||||
|
||||
import com.cloud.configuration.Config;
|
||||
import com.cloud.utils.NumbersUtil;
|
||||
import com.cloud.utils.db.GlobalLock;
|
||||
import org.apache.cloudstack.agent.lb.IndirectAgentLB;
|
||||
import org.apache.cloudstack.ca.CAManager;
|
||||
import org.apache.cloudstack.engine.orchestration.service.NetworkOrchestrationService;
|
||||
|
|
@ -54,6 +51,7 @@ import org.apache.cloudstack.outofbandmanagement.dao.OutOfBandManagementDao;
|
|||
import org.apache.cloudstack.utils.identity.ManagementServerNode;
|
||||
import org.apache.cloudstack.utils.reflectiontostringbuilderutils.ReflectionToStringBuilderUtils;
|
||||
import org.apache.commons.lang3.BooleanUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.log4j.MDC;
|
||||
|
||||
|
|
@ -82,6 +80,7 @@ import com.cloud.agent.api.UnsupportedAnswer;
|
|||
import com.cloud.agent.transport.Request;
|
||||
import com.cloud.agent.transport.Response;
|
||||
import com.cloud.alert.AlertManager;
|
||||
import com.cloud.configuration.Config;
|
||||
import com.cloud.configuration.ManagementServiceConfiguration;
|
||||
import com.cloud.dc.ClusterVO;
|
||||
import com.cloud.dc.DataCenterVO;
|
||||
|
|
@ -105,11 +104,13 @@ import com.cloud.resource.Discoverer;
|
|||
import com.cloud.resource.ResourceManager;
|
||||
import com.cloud.resource.ResourceState;
|
||||
import com.cloud.resource.ServerResource;
|
||||
import com.cloud.utils.NumbersUtil;
|
||||
import com.cloud.utils.Pair;
|
||||
import com.cloud.utils.component.ManagerBase;
|
||||
import com.cloud.utils.concurrency.NamedThreadFactory;
|
||||
import com.cloud.utils.db.DB;
|
||||
import com.cloud.utils.db.EntityManager;
|
||||
import com.cloud.utils.db.GlobalLock;
|
||||
import com.cloud.utils.db.QueryBuilder;
|
||||
import com.cloud.utils.db.SearchCriteria.Op;
|
||||
import com.cloud.utils.db.TransactionLegacy;
|
||||
|
|
@ -124,7 +125,6 @@ import com.cloud.utils.nio.Link;
|
|||
import com.cloud.utils.nio.NioServer;
|
||||
import com.cloud.utils.nio.Task;
|
||||
import com.cloud.utils.time.InaccurateClock;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
/**
|
||||
* Implementation of the Agent Manager. This class controls the connection to the agents.
|
||||
|
|
@ -208,6 +208,11 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
|
|||
protected final ConfigKey<Boolean> CheckTxnBeforeSending = new ConfigKey<Boolean>("Developer", Boolean.class, "check.txn.before.sending.agent.commands", "false",
|
||||
"This parameter allows developers to enable a check to see if a transaction wraps commands that are sent to the resource. This is not to be enabled on production systems.", true);
|
||||
|
||||
public static final List<Host.Type> HOST_DOWN_ALERT_UNSUPPORTED_HOST_TYPES = Arrays.asList(
|
||||
Host.Type.SecondaryStorage,
|
||||
Host.Type.ConsoleProxy
|
||||
);
|
||||
|
||||
@Override
|
||||
public boolean configure(final String name, final Map<String, Object> params) throws ConfigurationException {
|
||||
|
||||
|
|
@ -901,7 +906,10 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
|
|||
if (determinedState == Status.Down) {
|
||||
final String message = "Host is down: " + host.getId() + "-" + host.getName() + ". Starting HA on the VMs";
|
||||
s_logger.error(message);
|
||||
if (host.getType() != Host.Type.SecondaryStorage && host.getType() != Host.Type.ConsoleProxy) {
|
||||
if (Status.Down.equals(host.getStatus())) {
|
||||
s_logger.debug(String.format("Skipping sending alert for %s as it already in %s state",
|
||||
host, host.getStatus()));
|
||||
} else if (!HOST_DOWN_ALERT_UNSUPPORTED_HOST_TYPES.contains(host.getType())) {
|
||||
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host down, " + host.getId(), message);
|
||||
}
|
||||
event = Status.Event.HostDown;
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ import java.util.Date;
|
|||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.ScheduledExecutorService;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
|
@ -41,6 +42,7 @@ import org.apache.cloudstack.framework.config.dao.ConfigurationDao;
|
|||
import org.apache.cloudstack.managed.context.ManagedContext;
|
||||
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
|
||||
import org.apache.cloudstack.management.ManagementServerHost;
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.log4j.NDC;
|
||||
|
||||
|
|
@ -71,7 +73,6 @@ import com.cloud.hypervisor.Hypervisor.HypervisorType;
|
|||
import com.cloud.network.VpcVirtualNetworkApplianceService;
|
||||
import com.cloud.resource.ResourceManager;
|
||||
import com.cloud.server.ManagementServer;
|
||||
import com.cloud.service.ServiceOfferingVO;
|
||||
import com.cloud.service.dao.ServiceOfferingDao;
|
||||
import com.cloud.storage.Storage.StoragePoolType;
|
||||
import com.cloud.storage.StorageManager;
|
||||
|
|
@ -223,6 +224,18 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
long _timeBetweenCleanups;
|
||||
String _haTag = null;
|
||||
|
||||
private boolean vmHasPendingHAJob(final List<HaWorkVO> pendingHaWorks, final VMInstanceVO vm) {
|
||||
Optional<HaWorkVO> item = pendingHaWorks.stream()
|
||||
.filter(h -> h.getInstanceId() == vm.getId())
|
||||
.reduce((first, second) -> second);
|
||||
if (item.isPresent() && (item.get().getTimesTried() < _maxRetries ||
|
||||
!item.get().canScheduleNew(_timeBetweenFailures))) {
|
||||
s_logger.debug(String.format("Skipping HA on %s as there is already a running HA job for it", vm));
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
protected HighAvailabilityManagerImpl() {
|
||||
}
|
||||
|
||||
|
|
@ -265,28 +278,37 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
s_logger.warn("Scheduling restart for VMs on host " + host.getId() + "-" + host.getName());
|
||||
|
||||
final List<VMInstanceVO> vms = _instanceDao.listByHostId(host.getId());
|
||||
final List<HaWorkVO> pendingHaWorks = _haDao.listPendingHAWorkForHost(host.getId());
|
||||
final DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId());
|
||||
|
||||
// send an email alert that the host is down
|
||||
StringBuilder sb = null;
|
||||
List<VMInstanceVO> reorderedVMList = new ArrayList<VMInstanceVO>();
|
||||
if ((vms != null) && !vms.isEmpty()) {
|
||||
int skippedHAVms = 0;
|
||||
if (CollectionUtils.isNotEmpty(vms)) {
|
||||
sb = new StringBuilder();
|
||||
sb.append(" Starting HA on the following VMs:");
|
||||
// collect list of vm names for the alert email
|
||||
for (int i = 0; i < vms.size(); i++) {
|
||||
VMInstanceVO vm = vms.get(i);
|
||||
for (VMInstanceVO vm : vms) {
|
||||
if (vmHasPendingHAJob(pendingHaWorks, vm)) {
|
||||
skippedHAVms++;
|
||||
continue;
|
||||
}
|
||||
if (vm.getType() == VirtualMachine.Type.User) {
|
||||
reorderedVMList.add(vm);
|
||||
} else {
|
||||
reorderedVMList.add(0, vm);
|
||||
}
|
||||
if (vm.isHaEnabled()) {
|
||||
sb.append(" " + vm.getHostName());
|
||||
sb.append(" ").append(vm.getHostName());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (reorderedVMList.isEmpty() && skippedHAVms > 0 && skippedHAVms == vms.size()) {
|
||||
s_logger.debug(String.format(
|
||||
"Skipping sending alert for %s as it is suspected to be a duplicate of a recent alert", host));
|
||||
return;
|
||||
}
|
||||
// send an email alert that the host is down, include VMs
|
||||
HostPodVO podVO = _podDao.findById(host.getPodId());
|
||||
String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName();
|
||||
|
|
@ -294,7 +316,6 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
|
|||
"Host [" + hostDesc + "] is down." + ((sb != null) ? sb.toString() : ""));
|
||||
|
||||
for (VMInstanceVO vm : reorderedVMList) {
|
||||
ServiceOfferingVO vmOffering = _serviceOfferingDao.findById(vm.getServiceOfferingId());
|
||||
if (_itMgr.isRootVolumeOnLocalStorage(vm.getId())) {
|
||||
if (s_logger.isDebugEnabled()){
|
||||
s_logger.debug("Skipping HA on vm " + vm + ", because it uses local storage. Its fate is tied to the host.");
|
||||
|
|
|
|||
|
|
@ -85,4 +85,6 @@ public interface HighAvailabilityDao extends GenericDao<HaWorkVO, Long> {
|
|||
List<HaWorkVO> listPendingHaWorkForVm(long vmId);
|
||||
|
||||
List<HaWorkVO> listPendingMigrationsForVm(long vmId);
|
||||
|
||||
List<HaWorkVO> listPendingHAWorkForHost(long hostId);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -19,7 +19,6 @@ package com.cloud.ha.dao;
|
|||
import java.util.Date;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
|
|
@ -260,4 +259,17 @@ public class HighAvailabilityDaoImpl extends GenericDaoBase<HaWorkVO, Long> impl
|
|||
|
||||
return update(vo, sc);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<HaWorkVO> listPendingHAWorkForHost(long hostId) {
|
||||
SearchBuilder<HaWorkVO> sb = createSearchBuilder();
|
||||
sb.and("hostId", sb.entity().getHostId(), Op.EQ);
|
||||
sb.and("type", sb.entity().getWorkType(), Op.EQ);
|
||||
sb.and("step", sb.entity().getStep(), Op.NIN);
|
||||
SearchCriteria<HaWorkVO> sc = sb.create();
|
||||
sc.setParameters("hostId", hostId);
|
||||
sc.setParameters("type", WorkType.HA);
|
||||
sc.setParameters("step", Step.Done, Step.Cancelled, Step.Error);
|
||||
return listBy(sc);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -62,7 +62,6 @@ import com.cloud.hypervisor.Hypervisor.HypervisorType;
|
|||
import com.cloud.network.VpcVirtualNetworkApplianceService;
|
||||
import com.cloud.resource.ResourceManager;
|
||||
import com.cloud.server.ManagementServer;
|
||||
import com.cloud.service.ServiceOfferingVO;
|
||||
import com.cloud.service.dao.ServiceOfferingDao;
|
||||
import com.cloud.storage.StorageManager;
|
||||
import com.cloud.storage.dao.GuestOSCategoryDao;
|
||||
|
|
@ -214,7 +213,6 @@ public class HighAvailabilityManagerImplTest {
|
|||
Mockito.when(_dcDao.findById(Mockito.anyLong())).thenReturn(Mockito.mock(DataCenterVO.class));
|
||||
Mockito.when(_haDao.findPreviousHA(Mockito.anyLong())).thenReturn(Arrays.asList(Mockito.mock(HaWorkVO.class)));
|
||||
Mockito.when(_haDao.persist((HaWorkVO)Mockito.anyObject())).thenReturn(Mockito.mock(HaWorkVO.class));
|
||||
Mockito.when(_serviceOfferingDao.findById(vm1.getServiceOfferingId())).thenReturn(Mockito.mock(ServiceOfferingVO.class));
|
||||
|
||||
highAvailabilityManager.scheduleRestartForVmsOnHost(hostVO, true);
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue