// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. package com.cloud.ha; import static org.apache.cloudstack.framework.config.ConfigKey.Scope.Cluster; import com.cloud.deploy.DeploymentPlanner; import com.cloud.host.HostVO; import com.cloud.host.Status; import com.cloud.storage.Storage.StoragePoolType; import com.cloud.utils.component.Manager; import com.cloud.vm.VMInstanceVO; import org.apache.cloudstack.framework.config.ConfigKey; import java.util.List; /** * HighAvailabilityManager checks to make sure the VMs are running fine. */ public interface HighAvailabilityManager extends Manager { List LIBVIRT_STORAGE_POOL_TYPES_WITH_HA_SUPPORT = List.of(StoragePoolType.NetworkFilesystem, StoragePoolType.SharedMountPoint); ConfigKey ForceHA = new ConfigKey<>("Advanced", Boolean.class, "force.ha", "false", "Force High-Availability to happen even if the VM says no.", true, Cluster); ConfigKey HAWorkers = new ConfigKey<>("Advanced", Integer.class, "ha.workers", "5", "The number of High-Availability worker threads.", true, Cluster); ConfigKey InvestigateRetryInterval = new ConfigKey<>("Advanced", Integer.class, "investigate.retry.interval", "60", "The time (in seconds) between VM pings when the agent is disconnected.", true, Cluster); ConfigKey MigrateRetryInterval = new ConfigKey<>("Advanced", Integer.class, "migrate.retry.interval", "120", "The time (in seconds) between migration retries.", true, Cluster); ConfigKey RestartRetryInterval = new ConfigKey<>("Advanced", Integer.class, "restart.retry.interval", "600", "The time (in seconds) between retries to restart a VM.", true, Cluster); ConfigKey StopRetryInterval = new ConfigKey<>("Advanced", Integer.class, "stop.retry.interval", "600", "The time in seconds between retries to stop or destroy a VM.", true, Cluster); ConfigKey TimeBetweenCleanup = new ConfigKey<>("Advanced", Long.class, "time.between.cleanup", "86400", "The time in seconds to wait before the" + " cleanup thread runs for the different HA-Worker-Threads. The cleanup thread finds all the work items " + "that were successful and is now ready to be purged from the database (table: op_ha_work).", true, Cluster); ConfigKey MaxRetries = new ConfigKey<>("Advanced", Integer.class, "max.retries", "5", "The number of times to try a restart for the different Work-Types: " + "Migrating - VMs off of a host, Destroy - a VM, Stop - a VM for storage pool migration purposes," + " CheckStop - checks if a VM has been stopped, ForceStop - force a VM to stop even if the " + "states don't allow it, Destroy - a VM and HA - restart a VM.", true, Cluster); ConfigKey TimeToSleep = new ConfigKey<>("Advanced", Long.class, "time.to.sleep", "60", "The time in seconds to sleep before checking the database (table: op_ha_work) " + "for new working types (Migration, Stop, CheckStop, ForceStop, Destroy and HA), if no work items are found.", true, Cluster); ConfigKey TimeBetweenFailures = new ConfigKey<>("Advanced", Long.class, "time.between.failures", "3600", "The time in seconds before try to cleanup all the VMs" + " which are registered for the HA event that were successful and are now ready to be purged.", true, Cluster); ConfigKey KvmHAFenceHostIfHeartbeatFailsOnStorage = new ConfigKey<>("Advanced", Boolean.class, "kvm.ha.fence.on.storage.heartbeat.failure", "false", "Proceed fencing the host even the heartbeat failed for only one storage pool", false, ConfigKey.Scope.Zone); enum WorkType { Migration, // Migrating VMs off of a host. Stop, // Stops a VM for storage pool migration purposes. This should be obsolete now. CheckStop, // Checks if a VM has been stopped. ForceStop, // Force a VM to stop even if the states don't allow it. Use this only if you know the VM is stopped on the physical hypervisor. Destroy, // Destroy a VM. HA; // Restart a VM. } enum ReasonType { Unknown, HostMaintenance, HostDown, HostDegraded; } enum Step { Scheduled, Investigating, Fencing, Stopping, Restarting, Migrating, Cancelled, Done, Error, } /** * Investigate why a host has disconnected and migrate the VMs on it * if necessary. * * @param hostId - the id of the host that has disconnected. */ Status investigate(long hostId); /** * Restart a vm that has gone away due to various reasons. Whether a * VM is restarted depends on various reasons. * 1. Is the VM really dead. This method will try to find out. * 2. Is the VM HA enabled? If not, the VM is simply stopped. * * All VMs that enter HA mode is not allowed to be operated on until it * has been determined that the VM is dead. * * @param vm the vm that has gone away. * @param investigate must be investigated before we do anything with this vm. */ void scheduleRestart(VMInstanceVO vm, boolean investigate); void scheduleRestart(VMInstanceVO vm, boolean investigate, ReasonType reasonType); void cancelDestroy(VMInstanceVO vm, Long hostId); boolean scheduleDestroy(VMInstanceVO vm, long hostId, ReasonType reasonType); /** * Schedule restarts for all vms running on the host. * @param host host. * @param investigate whether to investigate * @param reasonType reason for HA work */ void scheduleRestartForVmsOnHost(HostVO host, boolean investigate, ReasonType reasonType); /** * Schedule the vm for migration. * * @param vm * @return true if schedule worked. */ boolean scheduleMigration(VMInstanceVO vm); boolean scheduleMigration(VMInstanceVO vm, ReasonType reasonType); List findTakenMigrationWork(); /** * Schedules a work item to stop a VM. This method schedules a work * item to do one of three things. * * 1. Perform a regular stop of a VM: WorkType.Stop * 2. Perform a force stop of a VM: WorkType.ForceStop * 3. Check if a VM has been stopped: WorkType.CheckStop * * @param vm virtual machine to stop. * @param hostId the id of the host the virtual machine is on. * @param type which type of stop is requested. */ boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type); boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type, ReasonType reasonType); void cancelScheduledMigrations(HostVO host); boolean hasPendingHaWork(long vmId); boolean hasPendingMigrationsWork(long vmId); /** * @return */ String getHaTag(); DeploymentPlanner getHAPlanner(); int expungeWorkItemsByVmList(List vmIds, Long batchSize); }