From c6e53f6cc6e6363276dd591eefe7515584130737 Mon Sep 17 00:00:00 2001 From: Rohit Yadav Date: Tue, 30 Oct 2018 15:13:59 +0530 Subject: [PATCH] kvm: reset KVM host on heartbeat failure (#2984) In actual testing, I could see that the kvmheartbeat.sh script fails on NFS server failure and only stops the agent. Any HA VMs could then be launched on different hosts, and recovery of the NFS server could lead to a state where an HA-enabled VM runs on two hosts, which can potentially cause disk corruption. In most cases, VM disk corruption will be worse than VM downtime. I've kept the sleep interval between check rounds but reduced it from 15s to 10s. The change in behaviour was introduced in #2722. Signed-off-by: Rohit Yadav --- .../kvm/src/com/cloud/hypervisor/kvm/resource/KVMHABase.java | 2 +- scripts/vm/hypervisor/kvm/kvmheartbeat.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHABase.java b/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHABase.java index f180848a8d5..9aa55f89dc8 100644 --- a/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHABase.java +++ b/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHABase.java @@ -35,7 +35,7 @@ public class KVMHABase { protected long _heartBeatUpdateTimeout = 60000; protected long _heartBeatUpdateFreq = 60000; protected long _heartBeatUpdateMaxTries = 5; - protected long _heartBeatUpdateRetrySleep = 15000; + protected long _heartBeatUpdateRetrySleep = 10000; public static enum PoolType { PrimaryStorage, SecondaryStorage diff --git a/scripts/vm/hypervisor/kvm/kvmheartbeat.sh b/scripts/vm/hypervisor/kvm/kvmheartbeat.sh index 30ca72a2aa9..df2e54db85a 100755 --- a/scripts/vm/hypervisor/kvm/kvmheartbeat.sh +++ b/scripts/vm/hypervisor/kvm/kvmheartbeat.sh @@ -155,10 +155,10 @@ then exit 0 elif [ "$cflag" == "1" ] then - /usr/bin/logger -t heartbeat "kvmheartbeat.sh stopped cloudstack-agent because it was unable to write 
the heartbeat to the storage." + /usr/bin/logger -t heartbeat "kvmheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage." sync & sleep 5 - service cloudstack-agent stop + echo b > /proc/sysrq-trigger exit $? else write_hbLog