This commit is contained in:
James Peru Mmbono 2026-05-12 08:17:30 +01:00 committed by GitHub
commit 25c553532f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 154 additions and 10 deletions

View File

@ -310,6 +310,31 @@ iscsi.session.cleanup.enabled=false
# This parameter specifies if the host must be rebooted when something goes wrong with the heartbeat.
#reboot.host.and.alert.management.on.heartbeat.timeout=true
# Action taken by kvmheartbeat.sh / kvmspheartbeat.sh when a storage heartbeat
# write fails persistently. Supersedes the legacy boolean property
# 'reboot.host.and.alert.management.on.heartbeat.timeout' when set to a non-default value.
#
# Allowed values:
# hard-reboot - immediate sysrq-trigger reboot (default; 'reboot' kept as alias).
# Required default for setups where a stale NFSv3 mount can prevent
# a graceful shutdown from completing.
# graceful-reboot - 'systemctl reboot' instead of sysrq; allows VMs to stop cleanly.
# Use only if a stale storage mount cannot block shutdown.
# restart-agent - restart cloudstack-agent only; running VMs are preserved.
# log-only - log + alert; take no automatic action (admin must investigate).
# custom - invoke the script at 'kvm.heartbeat.fence.custom.script' (see below).
# Script is called with one positional arg: the heartbeat script name
# (e.g. 'kvmheartbeat.sh'). Falls back to hard-reboot if missing or
# not executable.
#
# The non-default values are recommended for setups using LINSTOR/DRBD or any local
# storage with replication, where transient I/O contention can cause a heartbeat
# write to time out without the host actually being unhealthy.
#kvm.heartbeat.fence.action=hard-reboot
# Path to the operator-supplied script invoked when kvm.heartbeat.fence.action=custom.
#kvm.heartbeat.fence.custom.script=/etc/cloudstack/agent/heartbeat-fence-custom.sh
# Enables manually setting CPU's topology on KVM's VM.
#enable.manually.setting.cpu.topology.on.kvm.vm=true

View File

@ -598,6 +598,44 @@ public class AgentProperties{
public static final Property<Boolean> REBOOT_HOST_AND_ALERT_MANAGEMENT_ON_HEARTBEAT_TIMEOUT
= new Property<>("reboot.host.and.alert.management.on.heartbeat.timeout", true);
/**
* Action taken by the KVM agent's storage heartbeat scripts (kvmheartbeat.sh / kvmspheartbeat.sh)
* when a heartbeat write fails persistently. Allowed values:
* <ul>
* <li>{@code hard-reboot} (default; {@code reboot} accepted as alias): immediate
* sysrq-trigger reboot. Required default for setups where a stale NFSv3 mount can
* prevent a graceful shutdown from completing.</li>
* <li>{@code graceful-reboot}: {@code systemctl reboot} instead of sysrq; allows VMs
* to stop cleanly. Use only if a stale storage mount cannot block shutdown.</li>
* <li>{@code restart-agent}: restart cloudstack-agent only; running VMs preserved.</li>
* <li>{@code log-only}: log + alert; take no automatic action (admin must investigate).</li>
* <li>{@code custom}: invoke the script at {@link #KVM_HEARTBEAT_FENCE_CUSTOM_SCRIPT}
* (default {@code /etc/cloudstack/agent/heartbeat-fence-custom.sh}). The script is
* called with one argument: the heartbeat script name (e.g. {@code kvmheartbeat.sh}).
* If the script is missing or not executable, falls back to {@code hard-reboot}.</li>
* </ul>
* Any value not listed above is handled by the heartbeat scripts as {@code hard-reboot}.<br>
* The non-default values are recommended for setups using LINSTOR/DRBD or other replicated
* local storage, where transient I/O contention can cause a heartbeat write to time out
* without the host actually being unhealthy.<br>
* Read by the heartbeat shell scripts directly from agent.properties.<br>
* Data type: String.<br>
* Default value: {@code hard-reboot}
*/
public static final Property<String> KVM_HEARTBEAT_FENCE_ACTION
= new Property<>("kvm.heartbeat.fence.action", "hard-reboot");
/**
* Path to the operator-supplied script invoked when
* {@link #KVM_HEARTBEAT_FENCE_ACTION} is set to {@code custom}. The script must be
* executable and is called with a single positional argument: the heartbeat script name
* that triggered the fence (e.g. {@code kvmheartbeat.sh}). If the script is missing or
* not executable, the heartbeat scripts fall back to {@code hard-reboot}. Read by the
* heartbeat shell scripts directly from agent.properties.<br>
* Data type: String.<br>
* Default value: {@code /etc/cloudstack/agent/heartbeat-fence-custom.sh}
*/
public static final Property<String> KVM_HEARTBEAT_FENCE_CUSTOM_SCRIPT
= new Property<>("kvm.heartbeat.fence.custom.script", "/etc/cloudstack/agent/heartbeat-fence-custom.sh");
/**
* Enables manually setting CPU's topology on KVM's VM. <br>
* Data type: Boolean.<br>

View File

@ -0,0 +1,85 @@
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Shared fence-action helper for kvmheartbeat.sh and kvmspheartbeat.sh.
# Sourced by both scripts; do not invoke directly.
#
# Usage from caller:
# source "$(dirname "$0")/kvmha-fence.sh"
# fence_action "kvmheartbeat.sh" # script name passed for log tagging
# Location of the agent properties file; callers may pre-set AGENT_PROPS to override.
AGENT_PROPS="${AGENT_PROPS:-/etc/cloudstack/agent/agent.properties}"

# _fence_read_prop KEY_REGEX
#
# Print the value assigned to a key in $AGENT_PROPS, with leading and trailing
# whitespace removed. KEY_REGEX is an extended regex matching the key (dots
# must be escaped by the caller). Commented-out lines are ignored because the
# key has to be the first non-blank token on the line. When the key is assigned
# more than once, the last assignment wins. Prints nothing if the file is not
# readable or the key is absent.
_fence_read_prop() {
  local key_regex="$1"
  [ -r "$AGENT_PROPS" ] || return 0
  grep -E "^[[:space:]]*${key_regex}[[:space:]]*=" "$AGENT_PROPS" \
    | tail -n 1 \
    | cut -d= -f2- \
    | sed 's/^[[:space:]]*//;s/[[:space:]]*$//'
}

# _fence_hard_reboot
#
# Flush what can be flushed, then reboot immediately through sysrq. 'sync' is
# backgrounded so that a hung storage mount cannot block the reboot itself.
# Exits the calling script with the status of the sysrq write.
_fence_hard_reboot() {
  sync &
  sleep 5
  echo b > /proc/sysrq-trigger
  exit $?
}

# fence_action [SOURCE_SCRIPT]
#
# Perform the fence action configured by 'kvm.heartbeat.fence.action' in
# $AGENT_PROPS after a heartbeat write failure. This function always terminates
# the calling script via 'exit'; it never returns.
#
# Arguments:
#   SOURCE_SCRIPT  name of the calling heartbeat script, used to tag log lines
#                  and passed to the custom script (default: kvmha).
#
# Actions:
#   log-only         log and exit 0.
#   restart-agent    'systemctl restart cloudstack-agent'; exit with its status.
#   graceful-reboot  'systemctl reboot'; exit with its status.
#   custom           run the script named by 'kvm.heartbeat.fence.custom.script'
#                    with SOURCE_SCRIPT as its only argument and exit with its
#                    status. If that path is not an executable regular file,
#                    fall back to hard-reboot.
#   hard-reboot      (default; 'reboot' is an alias) sysrq-trigger reboot.
#   anything else    logged as unrecognized, then treated as hard-reboot.
fence_action() {
  local source_script="${1:-kvmha}"
  local action="hard-reboot"
  local custom_script="/etc/cloudstack/agent/heartbeat-fence-custom.sh"
  local val cval

  # The action is a single token, so strip every whitespace character from it.
  val=$(_fence_read_prop 'kvm\.heartbeat\.fence\.action' | tr -d '[:space:]')
  [ -n "$val" ] && action="$val"
  # The script path may legitimately contain inner spaces; only the ends are trimmed.
  cval=$(_fence_read_prop 'kvm\.heartbeat\.fence\.custom\.script')
  [ -n "$cval" ] && custom_script="$cval"

  case "$action" in
    log-only)
      /usr/bin/logger -t heartbeat "${source_script}: heartbeat write to storage failed; fence action 'log-only' selected — taking no automatic action. Operator must investigate."
      exit 0
      ;;
    restart-agent)
      /usr/bin/logger -t heartbeat "${source_script}: heartbeat write to storage failed; fence action 'restart-agent' — restarting cloudstack-agent (running VMs preserved)."
      sync &
      sleep 2
      systemctl restart cloudstack-agent
      exit $?
      ;;
    graceful-reboot)
      /usr/bin/logger -t heartbeat "${source_script}: heartbeat write to storage failed; fence action 'graceful-reboot' — rebooting via systemctl (allows running VMs to stop cleanly)."
      sync &
      sleep 5
      systemctl reboot
      exit $?
      ;;
    custom)
      # '-x' alone is also true for a searchable directory, which would then
      # fail to exec and leave the host unfenced; require a regular file too.
      if [ -f "$custom_script" ] && [ -x "$custom_script" ]; then
        /usr/bin/logger -t heartbeat "${source_script}: heartbeat write to storage failed; fence action 'custom' — running ${custom_script}."
        sync &
        sleep 2
        "$custom_script" "$source_script"
        exit $?
      else
        /usr/bin/logger -t heartbeat "${source_script}: heartbeat write to storage failed; fence action 'custom' selected but ${custom_script} is missing or not executable — falling back to hard-reboot."
        _fence_hard_reboot
      fi
      ;;
    hard-reboot|reboot)
      # 'reboot' kept as alias for back-compat with pre-existing deployments.
      /usr/bin/logger -t heartbeat "${source_script} will reboot system because it was unable to write the heartbeat to the storage."
      _fence_hard_reboot
      ;;
    *)
      # Fail safe: a mistyped value still fences the host, but leave a trace of
      # the bad value so the operator can see why the configured action was ignored.
      /usr/bin/logger -t heartbeat "${source_script}: unrecognized kvm.heartbeat.fence.action '${action}' — falling back to hard-reboot."
      /usr/bin/logger -t heartbeat "${source_script} will reboot system because it was unable to write the heartbeat to the storage."
      _fence_hard_reboot
      ;;
  esac
}

View File

@ -156,11 +156,9 @@ then
exit 0
elif [ "$cflag" == "1" ]
then
/usr/bin/logger -t heartbeat "kvmheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage."
sync &
sleep 5
echo b > /proc/sysrq-trigger
exit $?
# shellcheck disable=SC1091
. "$(dirname "$0")/kvmha-fence.sh"
fence_action "kvmheartbeat.sh"
else
write_hbLog
exit $?

View File

@ -58,9 +58,7 @@ deleteVMs() {
if [ "$cflag" == "1" ]
then
/usr/bin/logger -t heartbeat "kvmspheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage."
sync &
sleep 5
echo b > /proc/sysrq-trigger
exit $?
# shellcheck disable=SC1091
. "$(dirname "$0")/kvmha-fence.sh"
fence_action "kvmspheartbeat.sh"
fi