cloudstack/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
package com.cloud.ha;

import static org.apache.cloudstack.framework.config.ConfigKey.Scope.Cluster;

import com.cloud.deploy.DeploymentPlanner;
import com.cloud.host.HostVO;
import com.cloud.host.Status;
import com.cloud.storage.Storage.StoragePoolType;
import com.cloud.utils.component.Manager;
import com.cloud.vm.VMInstanceVO;
import org.apache.cloudstack.framework.config.ConfigKey;

import java.util.List;

/**
 * Manager contract for High-Availability (HA): it checks that VMs are running
 * fine and exposes the operations used to schedule, query and cancel HA work
 * items (restart, migration, stop, destroy).
 *
 * <p>The interface also declares the configuration keys, work types, reasons
 * and processing steps shared by HA implementations.
 */
public interface HighAvailabilityManager extends Manager {

    /** Storage pool types named by this constant as having HA support with libvirt. */
    List<StoragePoolType> LIBVIRT_STORAGE_POOL_TYPES_WITH_HA_SUPPORT =
            List.of(StoragePoolType.NetworkFilesystem, StoragePoolType.SharedMountPoint);

    // ------------------------------------------------------------------
    // Configuration keys
    // ------------------------------------------------------------------

    /** Setting {@code force.ha}, default {@code false}. */
    ConfigKey<Boolean> ForceHA = new ConfigKey<>(
            "Advanced",
            Boolean.class,
            "force.ha",
            "false",
            "Force High-Availability to happen even if the VM says no.",
            true,
            Cluster);

    /** Setting {@code ha.workers}, default {@code 5}. */
    ConfigKey<Integer> HAWorkers = new ConfigKey<>(
            "Advanced",
            Integer.class,
            "ha.workers",
            "5",
            "The number of High-Availability worker threads.",
            true,
            Cluster);

    /** Setting {@code investigate.retry.interval}, default {@code 60} (seconds). */
    ConfigKey<Integer> InvestigateRetryInterval = new ConfigKey<>(
            "Advanced",
            Integer.class,
            "investigate.retry.interval",
            "60",
            "The time (in seconds) between VM pings when the agent is disconnected.",
            true,
            Cluster);

    /** Setting {@code migrate.retry.interval}, default {@code 120} (seconds). */
    ConfigKey<Integer> MigrateRetryInterval = new ConfigKey<>(
            "Advanced",
            Integer.class,
            "migrate.retry.interval",
            "120",
            "The time (in seconds) between migration retries.",
            true,
            Cluster);

    /** Setting {@code restart.retry.interval}, default {@code 600} (seconds). */
    ConfigKey<Integer> RestartRetryInterval = new ConfigKey<>(
            "Advanced",
            Integer.class,
            "restart.retry.interval",
            "600",
            "The time (in seconds) between retries to restart a VM.",
            true,
            Cluster);

    /** Setting {@code stop.retry.interval}, default {@code 600} (seconds). */
    ConfigKey<Integer> StopRetryInterval = new ConfigKey<>(
            "Advanced",
            Integer.class,
            "stop.retry.interval",
            "600",
            "The time in seconds between retries to stop or destroy a VM.",
            true,
            Cluster);

    /** Setting {@code time.between.cleanup}, default {@code 86400} (seconds). */
    ConfigKey<Long> TimeBetweenCleanup = new ConfigKey<>(
            "Advanced",
            Long.class,
            "time.between.cleanup",
            "86400",
            "The time in seconds to wait before the cleanup thread runs for the different HA-Worker-Threads. "
                    + "The cleanup thread finds all the work items that were successful and is now ready to be purged "
                    + "from the database (table: op_ha_work).",
            true,
            Cluster);

    /** Setting {@code max.retries}, default {@code 5}. */
    ConfigKey<Integer> MaxRetries = new ConfigKey<>(
            "Advanced",
            Integer.class,
            "max.retries",
            "5",
            "The number of times to try a restart for the different Work-Types: "
                    + "Migrating - VMs off of a host, "
                    + "Destroy - a VM, "
                    + "Stop - a VM for storage pool migration purposes, "
                    + "CheckStop - checks if a VM has been stopped, "
                    + "ForceStop - force a VM to stop even if the states don't allow it, "
                    + "Destroy - a VM and HA - restart a VM.",
            true,
            Cluster);

    /** Setting {@code time.to.sleep}, default {@code 60} (seconds). */
    ConfigKey<Long> TimeToSleep = new ConfigKey<>(
            "Advanced",
            Long.class,
            "time.to.sleep",
            "60",
            "The time in seconds to sleep before checking the database (table: op_ha_work) for new working types "
                    + "(Migration, Stop, CheckStop, ForceStop, Destroy and HA), if no work items are found.",
            true,
            Cluster);

    /** Setting {@code time.between.failures}, default {@code 3600} (seconds). */
    ConfigKey<Long> TimeBetweenFailures = new ConfigKey<>(
            "Advanced",
            Long.class,
            "time.between.failures",
            "3600",
            "The time in seconds before try to cleanup all the VMs which are registered for the HA event "
                    + "that were successful and are now ready to be purged.",
            true,
            Cluster);

    /**
     * Setting {@code kvm.ha.fence.on.storage.heartbeat.failure}, default {@code false}.
     * Unlike the other keys declared here it is registered with the Zone scope.
     */
    ConfigKey<Boolean> KvmHAFenceHostIfHeartbeatFailsOnStorage = new ConfigKey<>(
            "Advanced",
            Boolean.class,
            "kvm.ha.fence.on.storage.heartbeat.failure",
            "false",
            "Proceed fencing the host even the heartbeat failed for only one storage pool",
            false,
            ConfigKey.Scope.Zone);

    // ------------------------------------------------------------------
    // Nested types
    // ------------------------------------------------------------------

    /** Kinds of work item the HA manager can be asked to carry out. */
    enum WorkType {
        /** Migrating VMs off of a host. */
        Migration,
        /** Stops a VM for storage pool migration purposes.  This should be obsolete now. */
        Stop,
        /** Checks if a VM has been stopped. */
        CheckStop,
        /**
         * Force a VM to stop even if the states don't allow it.  Use this only
         * if you know the VM is stopped on the physical hypervisor.
         */
        ForceStop,
        /** Destroy a VM. */
        Destroy,
        /** Restart a VM. */
        HA
    }

    /** Reason attached to a piece of HA work. */
    enum ReasonType {
        Unknown,
        HostMaintenance,
        HostDown,
        HostDegraded
    }

    /** Steps a work item can be in. */
    enum Step {
        Scheduled,
        Investigating,
        Fencing,
        Stopping,
        Restarting,
        Migrating,
        Cancelled,
        Done,
        Error
    }

    // ------------------------------------------------------------------
    // Operations
    // ------------------------------------------------------------------

    /**
     * Looks into why a host has disconnected and, if necessary, migrates the
     * VMs that are on it.
     *
     * @param hostId the id of the host that has disconnected.
     * @return a host {@link Status}. NOTE(review): presumably the status
     *         established by the investigation - confirm with the implementation.
     */
    Status investigate(long hostId);

    /**
     * Restarts a VM that has gone away.  Whether the VM is actually restarted
     * depends on:
     * <ol>
     *   <li>Is the VM really dead?  This method will try to find out.</li>
     *   <li>Is the VM HA enabled?  If not, the VM is simply stopped.</li>
     * </ol>
     *
     * <p>A VM that enters HA mode is not allowed to be operated on until it
     * has been determined that the VM is dead.
     *
     * @param vm the vm that has gone away.
     * @param investigate must be investigated before we do anything with this vm.
     */
    void scheduleRestart(VMInstanceVO vm, boolean investigate);

    /**
     * Overload of {@link #scheduleRestart(VMInstanceVO, boolean)} that also
     * carries the reason for the HA work.
     *
     * @param vm the vm that has gone away.
     * @param investigate must be investigated before we do anything with this vm.
     * @param reasonType reason for HA work.
     */
    void scheduleRestart(VMInstanceVO vm, boolean investigate, ReasonType reasonType);

    /**
     * NOTE(review): presumably cancels destroy work previously scheduled for
     * the VM - the contract is not stated in this file, confirm with the
     * implementation.
     *
     * @param vm the vm concerned.
     * @param hostId id of a host (boxed; whether {@code null} is accepted is
     *               not specified here).
     */
    void cancelDestroy(VMInstanceVO vm, Long hostId);

    /**
     * NOTE(review): presumably schedules a {@link WorkType#Destroy} work item
     * for the VM - confirm with the implementation.
     *
     * @param vm the vm concerned.
     * @param hostId id of a host.
     * @param reasonType reason for HA work.
     * @return a boolean whose meaning is not documented in this file
     *         (presumably whether scheduling worked).
     */
    boolean scheduleDestroy(VMInstanceVO vm, long hostId, ReasonType reasonType);

    /**
     * Schedules restarts for all vms running on the host.
     *
     * @param host host.
     * @param investigate whether to investigate.
     * @param reasonType reason for HA work.
     */
    void scheduleRestartForVmsOnHost(HostVO host, boolean investigate, ReasonType reasonType);

    /**
     * Schedules the vm for migration.
     *
     * @param vm the vm to schedule for migration.
     * @return true if schedule worked.
     */
    boolean scheduleMigration(VMInstanceVO vm);

    /**
     * Overload of {@link #scheduleMigration(VMInstanceVO)} that also carries
     * the reason for the HA work.
     *
     * @param vm the vm to schedule for migration.
     * @param reasonType reason for HA work.
     * @return true if schedule worked.
     */
    boolean scheduleMigration(VMInstanceVO vm, ReasonType reasonType);

    /**
     * NOTE(review): presumably lists the VMs whose migration work items have
     * been taken - confirm with the implementation.
     *
     * @return a list of VM instances.
     */
    List<VMInstanceVO> findTakenMigrationWork();

    /**
     * Schedules a work item to stop a VM.  The work item does one of three
     * things, selected by {@code type}:
     * <ol>
     *   <li>Perform a regular stop of a VM: {@link WorkType#Stop}</li>
     *   <li>Perform a force stop of a VM: {@link WorkType#ForceStop}</li>
     *   <li>Check if a VM has been stopped: {@link WorkType#CheckStop}</li>
     * </ol>
     *
     * @param vm virtual machine to stop.
     * @param hostId the id of the host the virtual machine is on.
     * @param type which type of stop is requested.
     * @return a boolean whose meaning is not documented in this file
     *         (presumably whether scheduling worked).
     */
    boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type);

    /**
     * Overload of {@link #scheduleStop(VMInstanceVO, long, WorkType)} that
     * also carries the reason for the HA work.
     *
     * @param vm virtual machine to stop.
     * @param hostId the id of the host the virtual machine is on.
     * @param type which type of stop is requested.
     * @param reasonType reason for HA work.
     * @return a boolean whose meaning is not documented in this file
     *         (presumably whether scheduling worked).
     */
    boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type, ReasonType reasonType);

    /**
     * NOTE(review): presumably cancels the migrations scheduled for the given
     * host - confirm with the implementation.
     *
     * @param host host.
     */
    void cancelScheduledMigrations(HostVO host);

    /**
     * NOTE(review): presumably reports whether HA work is pending for the VM -
     * confirm with the implementation.
     *
     * @param vmId id of the VM.
     */
    boolean hasPendingHaWork(long vmId);

    /**
     * NOTE(review): presumably reports whether migration work is pending for
     * the VM - confirm with the implementation.
     *
     * @param vmId id of the VM.
     */
    boolean hasPendingMigrationsWork(long vmId);

    /**
     * @return the HA tag as a string. NOTE(review): what the tag is applied to
     *         is not stated in this file.
     */
    String getHaTag();

    /**
     * @return the {@link DeploymentPlanner} associated with HA.
     */
    DeploymentPlanner getHAPlanner();

    /**
     * NOTE(review): presumably removes the work items belonging to the listed
     * VMs, in batches - confirm with the implementation.
     *
     * @param vmIds ids of the VMs.
     * @param batchSize batch size (boxed; whether {@code null} is accepted is
     *                  not specified here).
     * @return an int, presumably the number of work items expunged.
     */
    int expungeWorkItemsByVmList(List<Long> vmIds, Long batchSize);
}