From e8b6f6658280f858e6c15a8b4e5ac4b74eff4490 Mon Sep 17 00:00:00 2001
From: Brenn Oosterbaan
Date: Tue, 5 Mar 2013 15:38:24 +0100
Subject: [PATCH] =?UTF-8?q?Review=209647:=20In=20some=20storage=20failure?=
 =?UTF-8?q?=20scenario=E2=80=99s=20the=20NFS=20timeout=20can=20cause=20wri?=
 =?UTF-8?q?ting=20the=20heartbeat=20to=20take=20longer=20than=20expected.?=
 =?UTF-8?q?=20By=20comparing=20the=20last=20successful=20heartbeat=20epoch?=
 =?UTF-8?q?=20with=20the=20current=20epoch=20we=20check=20if=20the=20timeo?=
 =?UTF-8?q?ut=20value=20has=20been=20met.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../vm/hypervisor/xenserver/xenheartbeat.sh   | 33 +++++++++++--------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/scripts/vm/hypervisor/xenserver/xenheartbeat.sh b/scripts/vm/hypervisor/xenserver/xenheartbeat.sh
index 5edacf7e39a..dd876ba4b79 100755
--- a/scripts/vm/hypervisor/xenserver/xenheartbeat.sh
+++ b/scripts/vm/hypervisor/xenserver/xenheartbeat.sh
@@ -36,7 +36,7 @@ fi
 if [ ! -z $3 ]; then
   interval=$3
 else
-  interval=10
+  interval=5
 fi
 
 if [ $interval -gt $2 ]; then
@@ -45,10 +45,9 @@ if [ $interval -gt $2 ]; then
 fi
 
 file=/opt/xensource/bin/heartbeat
-maxtries=$(($2 / $interval))
-tries=1
+lastdate=$(($(date +%s) + $interval))
 
-while [ $tries -le $maxtries ]
+while [ $(date +%s) -lt $(($lastdate + $2)) ]
 do
   sleep $interval
 
@@ -57,6 +56,14 @@ do
     continue
   fi
 
+  # test heartbeat file
+  dirs=$(cat $file | grep "sr-mount\|VG_XenStorage")
+  if [ ! -n "$dirs" ];then
+    /usr/bin/logger -t heartbeat "Problem with heartbeat, no iSCSI or NFS mount defined in $file!"
+    lastdate=$(date +%s)
+    continue
+  fi
+
   # for iscsi
   dirs=$(cat $file | grep VG_XenStorage)
   for dir in $dirs
@@ -65,13 +72,13 @@ do
       hb=$dir/hb-$1
       date +%s | dd of=$hb count=100 bs=1 2>/dev/null
       if [ $? -ne 0 ]; then
-        /usr/bin/logger -t heartbeat "Potential problem with $hb: not reachable since $(($tries * $interval)) seconds"
-        tries=$(($tries + 1))
+        /usr/bin/logger -t heartbeat "Potential problem with $hb: not reachable since $(($(date +%s) - $lastdate)) seconds"
       else
-        tries=1
+        lastdate=$(date +%s)
       fi
     else
-      /usr/bin/logger -t heartbeat "Heartbeat dir not found for $dir"
+      /usr/bin/logger -t heartbeat "Potential problem with heartbeat, dir not found for $dir"
+      lastdate=$(date +%s)
       sed -i /${dir##/*/}/d $file
     fi
   done
@@ -85,17 +92,17 @@ do
       hb=$dir/hb-$1
       date +%s | dd of=$hb count=100 bs=1 2>/dev/null
       if [ $? -ne 0 ]; then
-        /usr/bin/logger -t heartbeat "Potential problem with $hb: not reachable since $(($tries * $interval)) seconds"
-        tries=$(($tries + 1))
+        /usr/bin/logger -t heartbeat "Potential problem with $hb: not reachable since $(($(date +%s) - $lastdate)) seconds"
       else
-        tries=1
+        lastdate=$(date +%s)
       fi
     else
-      /usr/bin/logger -t heartbeat "Heartbeat mount not found for $dir"
+      /usr/bin/logger -t heartbeat "Potential problem with heartbeat, mount not found for $dir"
+      lastdate=$(date +%s)
       sed -i /${dir##/*/}/d $file
     fi
   done
 done
 
-/usr/bin/logger -t heartbeat "Problem with $hb: not reachable for $2 seconds, rebooting system!"
+/usr/bin/logger -t heartbeat "Problem with $hb: not reachable for $(($(date +%s) - $lastdate)) seconds, rebooting system!"
 reboot -f