server: prevent duplicate HA works and alerts (#10624)

Signed-off-by: Abhishek Kumar <abhishek.mrt22@gmail.com>
This commit is contained in:
Abhishek Kumar 2025-05-06 14:12:30 +05:30 committed by GitHub
parent 1f8442eb69
commit 919c9797cc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 56 additions and 15 deletions

View File

@ -38,9 +38,6 @@ import java.util.concurrent.locks.ReentrantLock;
import javax.inject.Inject;
import javax.naming.ConfigurationException;
import com.cloud.configuration.Config;
import com.cloud.utils.NumbersUtil;
import com.cloud.utils.db.GlobalLock;
import org.apache.cloudstack.agent.lb.IndirectAgentLB;
import org.apache.cloudstack.ca.CAManager;
import org.apache.cloudstack.engine.orchestration.service.NetworkOrchestrationService;
@ -54,6 +51,7 @@ import org.apache.cloudstack.outofbandmanagement.dao.OutOfBandManagementDao;
import org.apache.cloudstack.utils.identity.ManagementServerNode;
import org.apache.cloudstack.utils.reflectiontostringbuilderutils.ReflectionToStringBuilderUtils;
import org.apache.commons.lang3.BooleanUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.apache.log4j.MDC;
@ -82,6 +80,7 @@ import com.cloud.agent.api.UnsupportedAnswer;
import com.cloud.agent.transport.Request;
import com.cloud.agent.transport.Response;
import com.cloud.alert.AlertManager;
import com.cloud.configuration.Config;
import com.cloud.configuration.ManagementServiceConfiguration;
import com.cloud.dc.ClusterVO;
import com.cloud.dc.DataCenterVO;
@ -105,11 +104,13 @@ import com.cloud.resource.Discoverer;
import com.cloud.resource.ResourceManager;
import com.cloud.resource.ResourceState;
import com.cloud.resource.ServerResource;
import com.cloud.utils.NumbersUtil;
import com.cloud.utils.Pair;
import com.cloud.utils.component.ManagerBase;
import com.cloud.utils.concurrency.NamedThreadFactory;
import com.cloud.utils.db.DB;
import com.cloud.utils.db.EntityManager;
import com.cloud.utils.db.GlobalLock;
import com.cloud.utils.db.QueryBuilder;
import com.cloud.utils.db.SearchCriteria.Op;
import com.cloud.utils.db.TransactionLegacy;
@ -124,7 +125,6 @@ import com.cloud.utils.nio.Link;
import com.cloud.utils.nio.NioServer;
import com.cloud.utils.nio.Task;
import com.cloud.utils.time.InaccurateClock;
import org.apache.commons.lang3.StringUtils;
/**
* Implementation of the Agent Manager. This class controls the connection to the agents.
@ -208,6 +208,11 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
protected final ConfigKey<Boolean> CheckTxnBeforeSending = new ConfigKey<Boolean>("Developer", Boolean.class, "check.txn.before.sending.agent.commands", "false",
"This parameter allows developers to enable a check to see if a transaction wraps commands that are sent to the resource. This is not to be enabled on production systems.", true);
public static final List<Host.Type> HOST_DOWN_ALERT_UNSUPPORTED_HOST_TYPES = Arrays.asList(
Host.Type.SecondaryStorage,
Host.Type.ConsoleProxy
);
@Override
public boolean configure(final String name, final Map<String, Object> params) throws ConfigurationException {
@ -901,7 +906,10 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
if (determinedState == Status.Down) {
final String message = "Host is down: " + host.getId() + "-" + host.getName() + ". Starting HA on the VMs";
s_logger.error(message);
if (host.getType() != Host.Type.SecondaryStorage && host.getType() != Host.Type.ConsoleProxy) {
if (Status.Down.equals(host.getStatus())) {
s_logger.debug(String.format("Skipping sending alert for %s as it already in %s state",
host, host.getStatus()));
} else if (!HOST_DOWN_ALERT_UNSUPPORTED_HOST_TYPES.contains(host.getType())) {
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host down, " + host.getId(), message);
}
event = Status.Event.HostDown;

View File

@ -21,6 +21,7 @@ import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
@ -41,6 +42,7 @@ import org.apache.cloudstack.framework.config.dao.ConfigurationDao;
import org.apache.cloudstack.managed.context.ManagedContext;
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
import org.apache.cloudstack.management.ManagementServerHost;
import org.apache.commons.collections.CollectionUtils;
import org.apache.log4j.Logger;
import org.apache.log4j.NDC;
@ -71,7 +73,6 @@ import com.cloud.hypervisor.Hypervisor.HypervisorType;
import com.cloud.network.VpcVirtualNetworkApplianceService;
import com.cloud.resource.ResourceManager;
import com.cloud.server.ManagementServer;
import com.cloud.service.ServiceOfferingVO;
import com.cloud.service.dao.ServiceOfferingDao;
import com.cloud.storage.Storage.StoragePoolType;
import com.cloud.storage.StorageManager;
@ -223,6 +224,18 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
long _timeBetweenCleanups;
String _haTag = null;
private boolean vmHasPendingHAJob(final List<HaWorkVO> pendingHaWorks, final VMInstanceVO vm) {
Optional<HaWorkVO> item = pendingHaWorks.stream()
.filter(h -> h.getInstanceId() == vm.getId())
.reduce((first, second) -> second);
if (item.isPresent() && (item.get().getTimesTried() < _maxRetries ||
!item.get().canScheduleNew(_timeBetweenFailures))) {
s_logger.debug(String.format("Skipping HA on %s as there is already a running HA job for it", vm));
return true;
}
return false;
}
protected HighAvailabilityManagerImpl() {
}
@ -265,28 +278,37 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
s_logger.warn("Scheduling restart for VMs on host " + host.getId() + "-" + host.getName());
final List<VMInstanceVO> vms = _instanceDao.listByHostId(host.getId());
final List<HaWorkVO> pendingHaWorks = _haDao.listPendingHAWorkForHost(host.getId());
final DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId());
// send an email alert that the host is down
StringBuilder sb = null;
List<VMInstanceVO> reorderedVMList = new ArrayList<VMInstanceVO>();
if ((vms != null) && !vms.isEmpty()) {
int skippedHAVms = 0;
if (CollectionUtils.isNotEmpty(vms)) {
sb = new StringBuilder();
sb.append(" Starting HA on the following VMs:");
// collect list of vm names for the alert email
for (int i = 0; i < vms.size(); i++) {
VMInstanceVO vm = vms.get(i);
for (VMInstanceVO vm : vms) {
if (vmHasPendingHAJob(pendingHaWorks, vm)) {
skippedHAVms++;
continue;
}
if (vm.getType() == VirtualMachine.Type.User) {
reorderedVMList.add(vm);
} else {
reorderedVMList.add(0, vm);
}
if (vm.isHaEnabled()) {
sb.append(" " + vm.getHostName());
sb.append(" ").append(vm.getHostName());
}
}
}
if (reorderedVMList.isEmpty() && skippedHAVms > 0 && skippedHAVms == vms.size()) {
s_logger.debug(String.format(
"Skipping sending alert for %s as it is suspected to be a duplicate of a recent alert", host));
return;
}
// send an email alert that the host is down, include VMs
HostPodVO podVO = _podDao.findById(host.getPodId());
String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName();
@ -294,7 +316,6 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
"Host [" + hostDesc + "] is down." + ((sb != null) ? sb.toString() : ""));
for (VMInstanceVO vm : reorderedVMList) {
ServiceOfferingVO vmOffering = _serviceOfferingDao.findById(vm.getServiceOfferingId());
if (_itMgr.isRootVolumeOnLocalStorage(vm.getId())) {
if (s_logger.isDebugEnabled()){
s_logger.debug("Skipping HA on vm " + vm + ", because it uses local storage. Its fate is tied to the host.");

View File

@ -85,4 +85,6 @@ public interface HighAvailabilityDao extends GenericDao<HaWorkVO, Long> {
List<HaWorkVO> listPendingHaWorkForVm(long vmId);
List<HaWorkVO> listPendingMigrationsForVm(long vmId);
List<HaWorkVO> listPendingHAWorkForHost(long hostId);
}

View File

@ -19,7 +19,6 @@ package com.cloud.ha.dao;
import java.util.Date;
import java.util.List;
import org.apache.log4j.Logger;
import org.springframework.stereotype.Component;
@ -260,4 +259,17 @@ public class HighAvailabilityDaoImpl extends GenericDaoBase<HaWorkVO, Long> impl
return update(vo, sc);
}
@Override
public List<HaWorkVO> listPendingHAWorkForHost(long hostId) {
SearchBuilder<HaWorkVO> sb = createSearchBuilder();
sb.and("hostId", sb.entity().getHostId(), Op.EQ);
sb.and("type", sb.entity().getWorkType(), Op.EQ);
sb.and("step", sb.entity().getStep(), Op.NIN);
SearchCriteria<HaWorkVO> sc = sb.create();
sc.setParameters("hostId", hostId);
sc.setParameters("type", WorkType.HA);
sc.setParameters("step", Step.Done, Step.Cancelled, Step.Error);
return listBy(sc);
}
}

View File

@ -62,7 +62,6 @@ import com.cloud.hypervisor.Hypervisor.HypervisorType;
import com.cloud.network.VpcVirtualNetworkApplianceService;
import com.cloud.resource.ResourceManager;
import com.cloud.server.ManagementServer;
import com.cloud.service.ServiceOfferingVO;
import com.cloud.service.dao.ServiceOfferingDao;
import com.cloud.storage.StorageManager;
import com.cloud.storage.dao.GuestOSCategoryDao;
@ -214,7 +213,6 @@ public class HighAvailabilityManagerImplTest {
Mockito.when(_dcDao.findById(Mockito.anyLong())).thenReturn(Mockito.mock(DataCenterVO.class));
Mockito.when(_haDao.findPreviousHA(Mockito.anyLong())).thenReturn(Arrays.asList(Mockito.mock(HaWorkVO.class)));
Mockito.when(_haDao.persist((HaWorkVO)Mockito.anyObject())).thenReturn(Mockito.mock(HaWorkVO.class));
Mockito.when(_serviceOfferingDao.findById(vm1.getServiceOfferingId())).thenReturn(Mockito.mock(ServiceOfferingVO.class));
highAvailabilityManager.scheduleRestartForVmsOnHost(hostVO, true);
}