mirror of https://github.com/apache/cloudstack.git
KVM: make storage heartbeat fence action configurable
The KVM agent's storage heartbeat scripts (kvmheartbeat.sh and
kvmspheartbeat.sh) hard-code an immediate kernel-level reboot via
'echo b > /proc/sysrq-trigger' when a heartbeat write to primary storage
times out. This bypasses all OS-level shutdown protections, drops every
running VM on the host instantly, and triggers HA cascades onto
surviving hosts.
For NFS shared storage the binary "heartbeat-write-failed = host-is-dead"
heuristic is reasonable. For LINSTOR/DRBD or other replicated local
storage, the same disk serves application I/O, replication I/O and
heartbeat I/O simultaneously - so a transient I/O contention spike can
time out the heartbeat write without the host actually being unhealthy.
The result is false-positive sysrq fencing.
Adds a new agent.properties option:
kvm.heartbeat.fence.action = reboot | graceful-reboot
| restart-agent | log-only
Default value is "reboot" so existing deployments keep their current
behavior. Operators on replicated storage backends can choose a less
destructive action:
- graceful-reboot: 'systemctl reboot' instead of sysrq, allowing VMs
a chance to shut down cleanly
- restart-agent: restart cloudstack-agent only, preserving running VMs
- log-only: log + alert, no automatic action
The existing 'reboot.host.and.alert.management.on.heartbeat.timeout'
boolean continues to function as a complete Java-side bypass.
Refs: https://github.com/apache/cloudstack/issues/13089
This commit is contained in:
parent
30e6c224bd
commit
d603b260c4
|
|
@ -310,6 +310,22 @@ iscsi.session.cleanup.enabled=false
|
|||
# This parameter specifies if the host must be rebooted when something goes wrong with the heartbeat.
|
||||
#reboot.host.and.alert.management.on.heartbeat.timeout=true
|
||||
|
||||
# Action taken by kvmheartbeat.sh / kvmspheartbeat.sh when a storage heartbeat
|
||||
# write fails persistently. Only consulted when the boolean
|
||||
# 'reboot.host.and.alert.management.on.heartbeat.timeout' is true; setting that to false still bypasses the fence action entirely.
|
||||
#
|
||||
# Allowed values:
|
||||
# reboot - immediate sysrq-trigger reboot (default; original behavior; also used for any unrecognized value)
|
||||
# graceful-reboot - 'systemctl reboot' instead of sysrq; allows VMs to stop cleanly
|
||||
# restart-agent - restart cloudstack-agent only; running VMs are preserved
|
||||
# log-only - log + alert; take no automatic action (admin must investigate)
|
||||
#
|
||||
# The 'graceful-reboot', 'restart-agent', and 'log-only' actions are recommended
|
||||
# for setups using LINSTOR/DRBD or any local storage with replication, where
|
||||
# transient I/O contention can cause a heartbeat write to time out without the
|
||||
# host actually being unhealthy.
|
||||
#kvm.heartbeat.fence.action=reboot
|
||||
|
||||
# Enables manually setting CPU's topology on KVM's VM.
|
||||
#enable.manually.setting.cpu.topology.on.kvm.vm=true
|
||||
|
||||
|
|
|
|||
|
|
@ -598,6 +598,25 @@ public class AgentProperties{
|
|||
public static final Property<Boolean> REBOOT_HOST_AND_ALERT_MANAGEMENT_ON_HEARTBEAT_TIMEOUT
|
||||
= new Property<>("reboot.host.and.alert.management.on.heartbeat.timeout", true);
|
||||
|
||||
/**
|
||||
* Action taken by the KVM agent's storage heartbeat scripts (kvmheartbeat.sh / kvmspheartbeat.sh)
|
||||
* when a heartbeat write fails persistently. Allowed values:
|
||||
* <ul>
|
||||
* <li>{@code reboot} (default) — immediate sysrq-trigger reboot; original behavior</li>
|
||||
* <li>{@code graceful-reboot} — {@code systemctl reboot} instead of sysrq, lets VMs stop cleanly</li>
|
||||
* <li>{@code restart-agent} — restart cloudstack-agent only; running VMs preserved</li>
|
||||
* <li>{@code log-only} — log + alert, no automatic action</li>
|
||||
* </ul>
|
||||
* The non-default values are recommended for setups using LINSTOR/DRBD or other replicated
|
||||
* local storage, where transient I/O contention can cause a heartbeat write to time out
|
||||
* without the host actually being unhealthy.<br>
|
||||
* Read by the heartbeat shell scripts directly from agent.properties.<br>
|
||||
* Data type: String.<br>
|
||||
* Default value: {@code reboot} (the scripts also fall back to it for any unrecognized value)
|
||||
*/
|
||||
public static final Property<String> KVM_HEARTBEAT_FENCE_ACTION
|
||||
= new Property<>("kvm.heartbeat.fence.action", "reboot");
|
||||
|
||||
/**
|
||||
* Enables manually setting CPU's topology on KVM's VM. <br>
|
||||
* Data type: Boolean.<br>
|
||||
|
|
|
|||
|
|
@ -156,11 +156,43 @@ then
|
|||
exit 0
|
||||
elif [ "$cflag" == "1" ]
|
||||
then
|
||||
/usr/bin/logger -t heartbeat "kvmheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage."
|
||||
sync &
|
||||
sleep 5
|
||||
echo b > /proc/sysrq-trigger
|
||||
exit $?
|
||||
# Read fence action from agent.properties (default: reboot for backward compatibility).
|
||||
# Allowed values: reboot | graceful-reboot | restart-agent | log-only
|
||||
AGENT_PROPS="/etc/cloudstack/agent/agent.properties"
|
||||
FENCE_ACTION="reboot"
|
||||
if [ -r "$AGENT_PROPS" ]; then
|
||||
val=$(grep -E '^[[:space:]]*kvm\.heartbeat\.fence\.action[[:space:]]*=' "$AGENT_PROPS" | tail -n 1 | cut -d= -f2- | tr -d '[:space:]')
|
||||
[ -n "$val" ] && FENCE_ACTION="$val"
|
||||
fi
|
||||
|
||||
case "$FENCE_ACTION" in
|
||||
log-only)
|
||||
/usr/bin/logger -t heartbeat "kvmheartbeat.sh: heartbeat write to storage failed; fence action 'log-only' selected — taking no automatic action. Operator must investigate."
|
||||
exit 0
|
||||
;;
|
||||
restart-agent)
|
||||
/usr/bin/logger -t heartbeat "kvmheartbeat.sh: heartbeat write to storage failed; fence action 'restart-agent' — restarting cloudstack-agent (running VMs preserved)."
|
||||
sync &
|
||||
sleep 2
|
||||
systemctl restart cloudstack-agent
|
||||
exit $?
|
||||
;;
|
||||
graceful-reboot)
|
||||
/usr/bin/logger -t heartbeat "kvmheartbeat.sh: heartbeat write to storage failed; fence action 'graceful-reboot' — rebooting via systemctl (allows running VMs to stop cleanly)."
|
||||
sync &
|
||||
sleep 5
|
||||
systemctl reboot
|
||||
exit $?
|
||||
;;
|
||||
reboot|*)
|
||||
# Original behavior (also the fallback for any unrecognized value): immediate kernel-level reboot via sysrq-trigger
|
||||
/usr/bin/logger -t heartbeat "kvmheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage."
|
||||
sync &
|
||||
sleep 5
|
||||
echo b > /proc/sysrq-trigger
|
||||
exit $?
|
||||
;;
|
||||
esac
|
||||
else
|
||||
write_hbLog
|
||||
exit $?
|
||||
|
|
|
|||
|
|
@ -58,9 +58,41 @@ deleteVMs() {
|
|||
|
||||
if [ "$cflag" == "1" ]
|
||||
then
|
||||
/usr/bin/logger -t heartbeat "kvmspheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage."
|
||||
sync &
|
||||
sleep 5
|
||||
echo b > /proc/sysrq-trigger
|
||||
exit $?
|
||||
# Read fence action from agent.properties (default: reboot for backward compatibility).
|
||||
# Allowed values: reboot | graceful-reboot | restart-agent | log-only
|
||||
AGENT_PROPS="/etc/cloudstack/agent/agent.properties"
|
||||
FENCE_ACTION="reboot"
|
||||
if [ -r "$AGENT_PROPS" ]; then
|
||||
val=$(grep -E '^[[:space:]]*kvm\.heartbeat\.fence\.action[[:space:]]*=' "$AGENT_PROPS" | tail -n 1 | cut -d= -f2- | tr -d '[:space:]')
|
||||
[ -n "$val" ] && FENCE_ACTION="$val"
|
||||
fi
|
||||
|
||||
case "$FENCE_ACTION" in
|
||||
log-only)
|
||||
/usr/bin/logger -t heartbeat "kvmspheartbeat.sh: heartbeat write to storage failed; fence action 'log-only' selected — taking no automatic action. Operator must investigate."
|
||||
exit 0
|
||||
;;
|
||||
restart-agent)
|
||||
/usr/bin/logger -t heartbeat "kvmspheartbeat.sh: heartbeat write to storage failed; fence action 'restart-agent' — restarting cloudstack-agent (running VMs preserved)."
|
||||
sync &
|
||||
sleep 2
|
||||
systemctl restart cloudstack-agent
|
||||
exit $?
|
||||
;;
|
||||
graceful-reboot)
|
||||
/usr/bin/logger -t heartbeat "kvmspheartbeat.sh: heartbeat write to storage failed; fence action 'graceful-reboot' — rebooting via systemctl (allows running VMs to stop cleanly)."
|
||||
sync &
|
||||
sleep 5
|
||||
systemctl reboot
|
||||
exit $?
|
||||
;;
|
||||
reboot|*)
|
||||
# Original behavior (also the fallback for any unrecognized value): immediate kernel-level reboot via sysrq-trigger
|
||||
/usr/bin/logger -t heartbeat "kvmspheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage."
|
||||
sync &
|
||||
sleep 5
|
||||
echo b > /proc/sysrq-trigger
|
||||
exit $?
|
||||
;;
|
||||
esac
|
||||
fi
|
||||
|
|
|
|||
Loading…
Reference in New Issue