server: destroy ssvm, cpvm on last host maintenance (#4644)

* server: destroy ssvm, cpvm on last host maintenance

When the only (or last) UP host enters maintenance, just stopping the SSVM and CPVM leaves those VMs behind on the hypervisor side. Since these system VMs will be recreated anyway, they can be destroyed instead.
Fixes #3719

Signed-off-by: Abhishek Kumar <abhishek.mrt22@gmail.com>

* fix methods

Signed-off-by: Abhishek Kumar <abhishek.mrt22@gmail.com>

* immediately destroy systemvms

Signed-off-by: Abhishek Kumar <abhishek.mrt22@gmail.com>

* fix destroy

Added a bypassHostMaintenance flag to the Command.java class to allow a command to be handled by the host agent even when the host is in maintenance.
Flag is set true only for delete commands for ssvm and cpvm.

Signed-off-by: Abhishek Kumar <abhishek.mrt22@gmail.com>

* unit test fix

Signed-off-by: Abhishek Kumar <abhishek.mrt22@gmail.com>

* fix missing return statement

Signed-off-by: Abhishek Kumar <abhishek.mrt22@gmail.com>

* fix

The VM should be stopped with cleanup before calling expunge; otherwise the server may throw an error while the host is in the PrepareForMaintenance state.

Signed-off-by: Abhishek Kumar <abhishek.mrt22@gmail.com>

* refactor

Signed-off-by: Abhishek Kumar <abhishek.mrt22@gmail.com>

* rename

Signed-off-by: Abhishek Kumar <abhishek.mrt22@gmail.com>

* refactor

Signed-off-by: Abhishek Kumar <abhishek.mrt22@gmail.com>
This commit is contained in:
Abhishek Kumar 2021-05-14 23:16:15 +05:30 committed by GitHub
parent 755791089d
commit dc91a1fd4d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 140 additions and 56 deletions

View File

@ -37,6 +37,7 @@ public abstract class Command {
@LogLevel(Log4jLevel.Trace)
protected Map<String, String> contextMap = new HashMap<String, String>();
private int wait; //in second
private boolean bypassHostMaintenance = false;
protected Command() {
this.wait = 0;
@ -74,6 +75,14 @@ public abstract class Command {
return true;
}
/**
 * Tells whether this command has been flagged to bypass host maintenance.
 *
 * @return the current value of the bypass flag; {@code false} unless it was
 *         changed through {@link #setBypassHostMaintenance(boolean)}
 */
public boolean isBypassHostMaintenance() {
    return this.bypassHostMaintenance;
}
/**
 * Sets the flag that marks this command as allowed to bypass host maintenance.
 *
 * @param bypassHostMaintenance new value of the bypass flag
 */
public void setBypassHostMaintenance(boolean bypassHostMaintenance) {
this.bypassHostMaintenance = bypassHostMaintenance;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;

View File

@ -31,10 +31,7 @@ import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import com.cloud.agent.api.ModifySshKeysCommand;
import com.cloud.agent.api.ModifyStoragePoolCommand;
import org.apache.cloudstack.agent.lb.SetupMSListCommand;
import com.cloud.agent.api.RollingMaintenanceCommand;
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
import org.apache.log4j.Logger;
@ -48,10 +45,13 @@ import com.cloud.agent.api.CleanupNetworkRulesCmd;
import com.cloud.agent.api.Command;
import com.cloud.agent.api.MaintainCommand;
import com.cloud.agent.api.MigrateCommand;
import com.cloud.agent.api.ModifySshKeysCommand;
import com.cloud.agent.api.ModifyStoragePoolCommand;
import com.cloud.agent.api.ModifyTargetsCommand;
import com.cloud.agent.api.PingTestCommand;
import com.cloud.agent.api.PvlanSetupCommand;
import com.cloud.agent.api.ReadyCommand;
import com.cloud.agent.api.RollingMaintenanceCommand;
import com.cloud.agent.api.SetupCommand;
import com.cloud.agent.api.ShutdownCommand;
import com.cloud.agent.api.StartCommand;
@ -167,7 +167,7 @@ public abstract class AgentAttache {
if (_maintenance) {
for (final Command cmd : cmds) {
if (Arrays.binarySearch(s_commandsAllowedInMaintenanceMode, cmd.getClass().toString()) < 0) {
if (Arrays.binarySearch(s_commandsAllowedInMaintenanceMode, cmd.getClass().toString()) < 0 && !cmd.isBypassHostMaintenance()) {
throw new AgentUnavailableException("Unable to send " + cmd.getClass().toString() + " because agent " + _name + " is in maintenance mode", _id);
}
}

View File

@ -519,6 +519,11 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
advanceExpunge(vm);
}
/**
 * Decides whether expunge commands issued for the given VM may be flagged to
 * bypass host maintenance.
 *
 * @param vm the VM being expunged
 * @return {@code true} only when the VM is a secondary storage VM or a console proxy
 */
private boolean expungeCommandCanBypassHostMaintenance(VirtualMachine vm) {
    if (VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())) {
        return true;
    }
    return VirtualMachine.Type.ConsoleProxy.equals(vm.getType());
}
protected void advanceExpunge(VMInstanceVO vm) throws ResourceUnavailableException, OperationTimedoutException, ConcurrentOperationException {
if (vm == null || vm.getRemoved() != null) {
if (s_logger.isDebugEnabled()) {
@ -565,6 +570,7 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
final Commands cmds = new Commands(Command.OnError.Stop);
for (final Command volumeExpungeCommand : volumeExpungeCommands) {
volumeExpungeCommand.setBypassHostMaintenance(expungeCommandCanBypassHostMaintenance(vm));
cmds.addCommand(volumeExpungeCommand);
}
@ -606,10 +612,12 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
if (hostId != null) {
final Commands cmds = new Commands(Command.OnError.Stop);
for (final Command command : finalizeExpungeCommands) {
command.setBypassHostMaintenance(expungeCommandCanBypassHostMaintenance(vm));
cmds.addCommand(command);
}
if (nicExpungeCommands != null) {
for (final Command command : nicExpungeCommands) {
command.setBypassHostMaintenance(expungeCommandCanBypassHostMaintenance(vm));
cmds.addCommand(command);
}
}

View File

@ -18,14 +18,14 @@
*/
package org.apache.cloudstack.storage.datastore.driver;
import static com.cloud.utils.NumbersUtil.toHumanReadableSize;
import java.util.HashMap;
import java.util.Map;
import java.util.UUID;
import javax.inject.Inject;
import org.apache.log4j.Logger;
import org.apache.cloudstack.engine.subsystem.api.storage.ChapInfo;
import org.apache.cloudstack.engine.subsystem.api.storage.CopyCommandResult;
import org.apache.cloudstack.engine.subsystem.api.storage.CreateCmdResult;
@ -53,6 +53,7 @@ import org.apache.cloudstack.storage.datastore.db.StoragePoolVO;
import org.apache.cloudstack.storage.to.SnapshotObjectTO;
import org.apache.cloudstack.storage.to.TemplateObjectTO;
import org.apache.cloudstack.storage.volume.VolumeObject;
import org.apache.log4j.Logger;
import com.cloud.agent.api.Answer;
import com.cloud.agent.api.storage.ResizeVolumeAnswer;
@ -70,16 +71,17 @@ import com.cloud.storage.ResizeVolumePayload;
import com.cloud.storage.Storage;
import com.cloud.storage.StorageManager;
import com.cloud.storage.StoragePool;
import com.cloud.storage.Volume;
import com.cloud.storage.dao.DiskOfferingDao;
import com.cloud.storage.dao.SnapshotDao;
import com.cloud.storage.dao.VMTemplateDao;
import com.cloud.storage.dao.VolumeDao;
import com.cloud.storage.snapshot.SnapshotManager;
import com.cloud.template.TemplateManager;
import com.cloud.vm.VMInstanceVO;
import com.cloud.vm.VirtualMachine;
import com.cloud.vm.dao.VMInstanceDao;
import static com.cloud.utils.NumbersUtil.toHumanReadableSize;
public class CloudStackPrimaryDataStoreDriverImpl implements PrimaryDataStoreDriver {
@Override
public Map<String, String> getCapabilities() {
@ -211,10 +213,22 @@ public class CloudStackPrimaryDataStoreDriverImpl implements PrimaryDataStoreDri
}
}
/**
 * Decides whether a command operating on the given data object may be flagged
 * to bypass host maintenance.
 *
 * @param data the data object the command targets
 * @return {@code true} only when {@code data} is a volume attached to an existing
 *         VM that is a secondary storage VM or a console proxy; {@code false} otherwise
 */
private boolean commandCanBypassHostMaintenance(DataObject data) {
    // Only volumes are considered; every other data object type is rejected.
    if (!DataObjectType.VOLUME.equals(data.getType())) {
        return false;
    }
    Volume volume = (Volume)data;
    // A volume without an owning instance cannot belong to a system VM.
    if (volume.getInstanceId() == null) {
        return false;
    }
    VMInstanceVO vm = vmDao.findById(volume.getInstanceId());
    if (vm == null) {
        return false;
    }
    if (VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())) {
        return true;
    }
    return VirtualMachine.Type.ConsoleProxy.equals(vm.getType());
}
@Override
public void deleteAsync(DataStore dataStore, DataObject data, AsyncCompletionCallback<CommandResult> callback) {
DeleteCommand cmd = new DeleteCommand(data.getTO());
cmd.setBypassHostMaintenance(commandCanBypassHostMaintenance(data));
CommandResult result = new CommandResult();
try {
EndPoint ep = null;

View File

@ -1006,6 +1006,13 @@ public class ConsoleProxyManagerImpl extends ManagerBase implements ConsoleProxy
}
public boolean isZoneReady(Map<Long, ZoneHostInfo> zoneHostInfoMap, long dataCenterId) {
List <HostVO> hosts = _hostDao.listByDataCenterId(dataCenterId);
if (CollectionUtils.isEmpty(hosts)) {
if (s_logger.isDebugEnabled()) {
s_logger.debug("Zone " + dataCenterId + " has no host available which is enabled and in Up state");
}
return false;
}
ZoneHostInfo zoneHostInfo = zoneHostInfoMap.get(dataCenterId);
if (zoneHostInfo != null && isZoneHostReady(zoneHostInfo)) {
VMTemplateVO template = _templateDao.findSystemVMReadyTemplate(dataCenterId, HypervisorType.Any);

View File

@ -16,9 +16,32 @@
// under the License.
package com.cloud.ha;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import javax.inject.Inject;
import javax.naming.ConfigurationException;
import org.apache.cloudstack.engine.orchestration.service.VolumeOrchestrationService;
import org.apache.cloudstack.framework.config.ConfigKey;
import org.apache.cloudstack.framework.config.Configurable;
import org.apache.cloudstack.framework.config.dao.ConfigurationDao;
import org.apache.cloudstack.managed.context.ManagedContext;
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
import org.apache.cloudstack.management.ManagementServerHost;
import org.apache.log4j.Logger;
import org.apache.log4j.NDC;
import com.cloud.agent.AgentManager;
import com.cloud.alert.AlertManager;
import com.cloud.cluster.ClusterManagerListener;
import com.cloud.consoleproxy.ConsoleProxyManager;
import com.cloud.dc.ClusterDetailsDao;
import com.cloud.dc.DataCenterVO;
import com.cloud.dc.HostPodVO;
@ -46,37 +69,16 @@ import com.cloud.service.dao.ServiceOfferingDao;
import com.cloud.storage.StorageManager;
import com.cloud.storage.dao.GuestOSCategoryDao;
import com.cloud.storage.dao.GuestOSDao;
import com.cloud.storage.secondary.SecondaryStorageVmManager;
import com.cloud.user.AccountManager;
import com.cloud.utils.component.ManagerBase;
import com.cloud.utils.concurrency.NamedThreadFactory;
import com.cloud.utils.exception.CloudRuntimeException;
import com.cloud.vm.VMInstanceVO;
import com.cloud.vm.VirtualMachine;
import com.cloud.vm.VirtualMachine.State;
import com.cloud.vm.VirtualMachineManager;
import com.cloud.vm.VirtualMachineProfile;
import com.cloud.vm.dao.VMInstanceDao;
import org.apache.cloudstack.engine.orchestration.service.VolumeOrchestrationService;
import org.apache.cloudstack.framework.config.ConfigKey;
import org.apache.cloudstack.framework.config.Configurable;
import org.apache.cloudstack.framework.config.dao.ConfigurationDao;
import org.apache.cloudstack.managed.context.ManagedContext;
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
import org.apache.cloudstack.management.ManagementServerHost;
import org.apache.log4j.Logger;
import org.apache.log4j.NDC;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import javax.inject.Inject;
import javax.naming.ConfigurationException;
/**
* HighAvailabilityManagerImpl coordinates the HA process. VMs are registered with the HA Manager for HA. The request is stored
@ -125,9 +127,12 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
HostPodDao _podDao;
@Inject
ClusterDetailsDao _clusterDetailsDao;
@Inject
ServiceOfferingDao _serviceOfferingDao;
@Inject
private ConsoleProxyManager consoleProxyManager;
@Inject
private SecondaryStorageVmManager secondaryStorageVmManager;
long _serverId;
@ -680,31 +685,51 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
_haDao.delete(vm.getId(), WorkType.Destroy);
}
/**
 * Stops the VM through {@code _itMgr.advanceStop} with the second argument set
 * to {@code true}, but only when the supplied state is {@code Running}.
 *
 * @param vm    the VM to stop
 * @param state the state used to decide whether a stop is needed
 * @throws OperationTimedoutException   propagated from the stop call
 * @throws ResourceUnavailableException propagated from the stop call
 */
private void stopVMWithCleanup(VirtualMachine vm, VirtualMachine.State state) throws OperationTimedoutException, ResourceUnavailableException {
    if (!VirtualMachine.State.Running.equals(state)) {
        // Nothing to stop for any state other than Running.
        return;
    }
    _itMgr.advanceStop(vm.getUuid(), true);
}
/**
 * Destroys the given VM using the manager that matches its type.
 * Console proxies go through {@code consoleProxyManager}, secondary storage VMs
 * through {@code secondaryStorageVmManager}, and every other VM through
 * {@code _itMgr.destroy}.
 *
 * @param vm      the VM to destroy
 * @param expunge forwarded to {@code _itMgr.destroy}; not used on the console
 *                proxy and secondary storage VM paths
 * @throws OperationTimedoutException propagated from the destroy call
 * @throws AgentUnavailableException  propagated from the destroy call
 */
private void destroyVM(VirtualMachine vm, boolean expunge) throws OperationTimedoutException, AgentUnavailableException {
    s_logger.info("Destroying " + vm.toString());
    if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) {
        consoleProxyManager.destroyProxy(vm.getId());
        return;
    }
    if (VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())) {
        secondaryStorageVmManager.destroySecStorageVm(vm.getId());
        return;
    }
    _itMgr.destroy(vm.getUuid(), expunge);
}
protected Long destroyVM(final HaWorkVO work) {
final VirtualMachine vm = _itMgr.findById(work.getInstanceId());
s_logger.info("Destroying " + vm.toString());
if (vm == null) {
s_logger.info("No longer can find VM " + work.getInstanceId() + ". Throwing away " + work);
return null;
}
boolean expunge = VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())
|| VirtualMachine.Type.ConsoleProxy.equals(vm.getType());
if (!expunge && VirtualMachine.State.Destroyed.equals(work.getPreviousState())) {
s_logger.info("VM " + vm.getUuid() + " already in " + vm.getState() + " state. Throwing away " + work);
return null;
}
try {
if (vm.getState() != State.Destroyed) {
s_logger.info("VM is no longer in Destroyed state " + vm.toString());
return null;
}
if (vm.getHostId() != null) {
_itMgr.destroy(vm.getUuid(), false);
s_logger.info("Successfully destroy " + vm);
stopVMWithCleanup(vm, work.getPreviousState());
if (!VirtualMachine.State.Expunging.equals(work.getPreviousState())) {
destroyVM(vm, expunge);
return null;
} else {
if (s_logger.isDebugEnabled()) {
s_logger.debug(vm + " has already been stopped");
}
return null;
s_logger.info("VM " + vm.getUuid() + " still in " + vm.getState() + " state.");
}
} catch (final AgentUnavailableException e) {
s_logger.debug("Agnet is not available" + e.getMessage());
s_logger.debug("Agent is not available" + e.getMessage());
} catch (OperationTimedoutException e) {
s_logger.debug("operation timed out: " + e.getMessage());
} catch (ConcurrentOperationException e) {
s_logger.debug("concurrent operation: " + e.getMessage());
} catch (ResourceUnavailableException e) {
s_logger.debug("Resource unavailable: " + e.getMessage());
}
return (System.currentTimeMillis() >> 10) + _stopRetryInterval;
@ -793,9 +818,8 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
case Stop:
case CheckStop:
case ForceStop:
return ((System.currentTimeMillis() >> 10) + _stopRetryInterval);
case Destroy:
return ((System.currentTimeMillis() >> 10) + _restartRetryInterval);
return ((System.currentTimeMillis() >> 10) + _stopRetryInterval);
}
return 0;
}

View File

@ -16,6 +16,8 @@
// under the License.
package com.cloud.resource;
import static com.cloud.configuration.ConfigurationManagerImpl.SET_HOST_DOWN_TO_MAINTENANCE;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
@ -180,9 +182,6 @@ import com.cloud.vm.dao.UserVmDetailsDao;
import com.cloud.vm.dao.VMInstanceDao;
import com.google.gson.Gson;
import static com.cloud.configuration.ConfigurationManagerImpl.SET_HOST_DOWN_TO_MAINTENANCE;
@Component
public class ResourceManagerImpl extends ManagerBase implements ResourceManager, ResourceService, Manager {
private static final Logger s_logger = Logger.getLogger(ResourceManagerImpl.class);
@ -1229,6 +1228,19 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
return _hostDao.updateResourceState(currentState, event, nextState, host);
}
/**
 * Handles a VM that is not being migrated off a host entering maintenance.
 * Secondary storage VMs and console proxies are scheduled for destruction;
 * every other VM is scheduled for a forced stop.
 *
 * @param host the host entering maintenance
 * @param vm   the VM on that host
 */
private void handleVmForLastHostOrWithVGpu(final HostVO host, final VMInstanceVO vm) {
    final boolean isSsvmOrCpvm = VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())
            || VirtualMachine.Type.ConsoleProxy.equals(vm.getType());
    if (!isSsvmOrCpvm) {
        // Migration is not supported for VGPU VMs, and the last host has no migration target: stop the VM.
        s_logger.error(String.format("Maintenance: No hosts available for migrations. Scheduling shutdown for VM %s instead of migration.", vm.getUuid()));
        _haMgr.scheduleStop(vm, host.getId(), WorkType.ForceStop);
        return;
    }
    // SSVM/CPVM: destroy instead of stopping.
    s_logger.error(String.format("Maintenance: VM is of type %s. Destroying VM %s (ID: %s) immediately instead of migration.", vm.getType().toString(), vm.getInstanceName(), vm.getUuid()));
    _haMgr.scheduleDestroy(vm, host.getId());
}
private boolean doMaintain(final long hostId) {
final HostVO host = _hostDao.findById(hostId);
s_logger.info("Maintenance: attempting maintenance of host " + host.getUuid());
@ -1266,10 +1278,7 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
for (final VMInstanceVO vm : vms) {
if (hosts == null || hosts.isEmpty() || !answer.getMigrate()
|| _serviceOfferingDetailsDao.findDetail(vm.getServiceOfferingId(), GPU.Keys.vgpuType.toString()) != null) {
// Migration is not supported for VGPU Vms so stop them.
// for the last host in this cluster, stop all the VMs
s_logger.error("Maintenance: No hosts available for migrations. Scheduling shutdown instead of migrations.");
_haMgr.scheduleStop(vm, hostId, WorkType.ForceStop);
handleVmForLastHostOrWithVGpu(host, vm);
} else if (HypervisorType.LXC.equals(host.getHypervisorType()) && VirtualMachine.Type.User.equals(vm.getType())){
//Migration is not supported for LXC Vms. Schedule restart instead.
_haMgr.scheduleRestart(vm, false);
@ -1417,7 +1426,7 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
* on a host. We need to track the various VM states on each run and accordingly transit to the
* appropriate state.
*
* We change states as follws -
* We change states as follows -
* 1. If there are no VMs in running, migrating, starting, stopping, error, unknown states we can move
* to maintenance state. Note that there cannot be incoming migrations as the API Call prepare for
* maintenance checks incoming migrations before starting.

View File

@ -44,6 +44,7 @@ import org.mockito.runners.MockitoJUnitRunner;
import com.cloud.agent.AgentManager;
import com.cloud.alert.AlertManager;
import com.cloud.consoleproxy.ConsoleProxyManager;
import com.cloud.dc.ClusterDetailsDao;
import com.cloud.dc.DataCenterVO;
import com.cloud.dc.HostPodVO;
@ -64,6 +65,7 @@ import com.cloud.service.dao.ServiceOfferingDao;
import com.cloud.storage.StorageManager;
import com.cloud.storage.dao.GuestOSCategoryDao;
import com.cloud.storage.dao.GuestOSDao;
import com.cloud.storage.secondary.SecondaryStorageVmManager;
import com.cloud.user.AccountManager;
import com.cloud.vm.VMInstanceVO;
import com.cloud.vm.VirtualMachine;
@ -112,6 +114,10 @@ public class HighAvailabilityManagerImplTest {
@Mock
VolumeOrchestrationService volumeMgr;
@Mock
ConsoleProxyManager consoleProxyManager;
@Mock
SecondaryStorageVmManager secondaryStorageVmManager;
@Mock
HostVO hostVO;
HighAvailabilityManagerImpl highAvailabilityManager;

View File

@ -811,6 +811,13 @@ public class SecondaryStorageManagerImpl extends ManagerBase implements Secondar
}
public boolean isZoneReady(Map<Long, ZoneHostInfo> zoneHostInfoMap, long dataCenterId) {
List <HostVO> hosts = _hostDao.listByDataCenterId(dataCenterId);
if (CollectionUtils.isEmpty(hosts)) {
if (s_logger.isDebugEnabled()) {
s_logger.debug("Zone " + dataCenterId + " has no host available which is enabled and in Up state");
}
return false;
}
ZoneHostInfo zoneHostInfo = zoneHostInfoMap.get(dataCenterId);
if (zoneHostInfo != null && (zoneHostInfo.getFlags() & RunningHostInfoAgregator.ZoneHostInfo.ROUTING_HOST_MASK) != 0) {
VMTemplateVO template = _templateDao.findSystemVMReadyTemplate(dataCenterId, HypervisorType.Any);