// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package com.cloud.ha;

import static org.apache.cloudstack.framework.config.ConfigKey.Scope.Cluster;

import com.cloud.deploy.DeploymentPlanner;
import com.cloud.host.HostVO;
import com.cloud.host.Status;
import com.cloud.storage.Storage.StoragePoolType;
import com.cloud.utils.component.Manager;
import com.cloud.vm.VMInstanceVO;
import org.apache.cloudstack.framework.config.ConfigKey;

import java.util.List;

/**
|
|
* HighAvailabilityManager checks to make sure the VMs are running fine.
|
|
*/
|
|
public interface HighAvailabilityManager extends Manager {
|
|
|
|
List<StoragePoolType> LIBVIRT_STORAGE_POOL_TYPES_WITH_HA_SUPPORT = List.of(StoragePoolType.NetworkFilesystem, StoragePoolType.SharedMountPoint);
|
|
|
|
ConfigKey<Boolean> ForceHA = new ConfigKey<>("Advanced", Boolean.class, "force.ha", "false",
|
|
"Force High-Availability to happen even if the VM says no.", true, Cluster);
|
|
|
|
ConfigKey<Integer> HAWorkers = new ConfigKey<>("Advanced", Integer.class, "ha.workers", "5",
|
|
"The number of High-Availability worker threads.", true, Cluster);
|
|
|
|
ConfigKey<Integer> InvestigateRetryInterval = new ConfigKey<>("Advanced", Integer.class, "investigate.retry.interval",
|
|
"60", "The time (in seconds) between VM pings when the agent is disconnected.", true, Cluster);
|
|
|
|
ConfigKey<Integer> MigrateRetryInterval = new ConfigKey<>("Advanced", Integer.class, "migrate.retry.interval",
|
|
"120", "The time (in seconds) between migration retries.", true, Cluster);
|
|
|
|
ConfigKey<Integer> RestartRetryInterval = new ConfigKey<>("Advanced", Integer.class, "restart.retry.interval",
|
|
"600", "The time (in seconds) between retries to restart a VM.", true, Cluster);
|
|
|
|
ConfigKey<Integer> StopRetryInterval = new ConfigKey<>("Advanced", Integer.class, "stop.retry.interval",
|
|
"600", "The time in seconds between retries to stop or destroy a VM.", true, Cluster);
|
|
|
|
ConfigKey<Long> TimeBetweenCleanup = new ConfigKey<>("Advanced", Long.class,
|
|
"time.between.cleanup", "86400", "The time in seconds to wait before the"
|
|
+ " cleanup thread runs for the different HA-Worker-Threads. The cleanup thread finds all the work items "
|
|
+ "that were successful and is now ready to be purged from the database (table: op_ha_work).",
|
|
true, Cluster);
|
|
|
|
ConfigKey<Integer> MaxRetries = new ConfigKey<>("Advanced", Integer.class, "max.retries",
|
|
"5", "The number of times to try a restart for the different Work-Types: "
|
|
+ "Migrating - VMs off of a host, Destroy - a VM, Stop - a VM for storage pool migration purposes,"
|
|
+ " CheckStop - checks if a VM has been stopped, ForceStop - force a VM to stop even if the "
|
|
+ "states don't allow it, Destroy - a VM and HA - restart a VM.", true, Cluster);
|
|
|
|
ConfigKey<Long> TimeToSleep = new ConfigKey<>("Advanced", Long.class, "time.to.sleep",
|
|
"60", "The time in seconds to sleep before checking the database (table: op_ha_work) "
|
|
+ "for new working types (Migration, Stop, CheckStop, ForceStop, Destroy and HA), if no work items are found.",
|
|
true, Cluster);
|
|
|
|
ConfigKey<Long> TimeBetweenFailures = new ConfigKey<>("Advanced", Long.class,
|
|
"time.between.failures", "3600", "The time in seconds before try to cleanup all the VMs"
|
|
+ " which are registered for the HA event that were successful and are now ready to be purged.",
|
|
true, Cluster);
|
|
|
|
ConfigKey<Boolean> KvmHAFenceHostIfHeartbeatFailsOnStorage = new ConfigKey<>("Advanced", Boolean.class, "kvm.ha.fence.on.storage.heartbeat.failure", "false",
|
|
"Proceed fencing the host even the heartbeat failed for only one storage pool", false, ConfigKey.Scope.Zone);
|
|
|
|
enum WorkType {
|
|
Migration, // Migrating VMs off of a host.
|
|
Stop, // Stops a VM for storage pool migration purposes. This should be obsolete now.
|
|
CheckStop, // Checks if a VM has been stopped.
|
|
ForceStop, // Force a VM to stop even if the states don't allow it. Use this only if you know the VM is stopped on the physical hypervisor.
|
|
Destroy, // Destroy a VM.
|
|
HA; // Restart a VM.
|
|
}
|
|
|
|
enum ReasonType {
|
|
Unknown,
|
|
HostMaintenance,
|
|
HostDown,
|
|
HostDegraded;
|
|
}
|
|
|
|
enum Step {
|
|
Scheduled, Investigating, Fencing, Stopping, Restarting, Migrating, Cancelled, Done, Error,
|
|
}
|
|
|
|
/**
|
|
* Investigate why a host has disconnected and migrate the VMs on it
|
|
* if necessary.
|
|
*
|
|
* @param hostId - the id of the host that has disconnected.
|
|
*/
|
|
Status investigate(long hostId);
|
|
|
|
/**
|
|
* Restart a vm that has gone away due to various reasons. Whether a
|
|
* VM is restarted depends on various reasons.
|
|
* 1. Is the VM really dead. This method will try to find out.
|
|
* 2. Is the VM HA enabled? If not, the VM is simply stopped.
|
|
*
|
|
* All VMs that enter HA mode is not allowed to be operated on until it
|
|
* has been determined that the VM is dead.
|
|
*
|
|
* @param vm the vm that has gone away.
|
|
* @param investigate must be investigated before we do anything with this vm.
|
|
*/
|
|
void scheduleRestart(VMInstanceVO vm, boolean investigate);
|
|
void scheduleRestart(VMInstanceVO vm, boolean investigate, ReasonType reasonType);
|
|
|
|
void cancelDestroy(VMInstanceVO vm, Long hostId);
|
|
|
|
boolean scheduleDestroy(VMInstanceVO vm, long hostId, ReasonType reasonType);
|
|
|
|
/**
|
|
* Schedule restarts for all vms running on the host.
|
|
* @param host host.
|
|
* @param investigate whether to investigate
|
|
* @param reasonType reason for HA work
|
|
*/
|
|
void scheduleRestartForVmsOnHost(HostVO host, boolean investigate, ReasonType reasonType);
|
|
|
|
/**
|
|
* Schedule the vm for migration.
|
|
*
|
|
* @param vm
|
|
* @return true if schedule worked.
|
|
*/
|
|
boolean scheduleMigration(VMInstanceVO vm);
|
|
boolean scheduleMigration(VMInstanceVO vm, ReasonType reasonType);
|
|
|
|
List<VMInstanceVO> findTakenMigrationWork();
|
|
|
|
/**
|
|
* Schedules a work item to stop a VM. This method schedules a work
|
|
* item to do one of three things.
|
|
*
|
|
* 1. Perform a regular stop of a VM: WorkType.Stop
|
|
* 2. Perform a force stop of a VM: WorkType.ForceStop
|
|
* 3. Check if a VM has been stopped: WorkType.CheckStop
|
|
*
|
|
* @param vm virtual machine to stop.
|
|
* @param hostId the id of the host the virtual machine is on.
|
|
* @param type which type of stop is requested.
|
|
*/
|
|
boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type);
|
|
boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type, ReasonType reasonType);
|
|
|
|
void cancelScheduledMigrations(HostVO host);
|
|
|
|
boolean hasPendingHaWork(long vmId);
|
|
|
|
boolean hasPendingMigrationsWork(long vmId);
|
|
/**
|
|
* @return
|
|
*/
|
|
String getHaTag();
|
|
|
|
DeploymentPlanner getHAPlanner();
|
|
int expungeWorkItemsByVmList(List<Long> vmIds, Long batchSize);
|
|
}
|