From ab0bce2a1baa4e199a447e3f2d74ca93c026a037 Mon Sep 17 00:00:00 2001 From: Rohit Yadav Date: Thu, 15 Mar 2018 16:32:18 +0530 Subject: [PATCH] CLOUDSTACK-10296: Find time difference from last timestamp (#2458) This fixes a difference issue in the rVR heartbeat check script raised recently on dev@. Reduce logging to avoid filling the ramdisk. Make checkrouter return fault state when keepalived is not running. Signed-off-by: Rohit Yadav --- systemvm/debian/opt/cloud/bin/checkrouter.sh | 7 ++ .../cloud/templates/check_heartbeat.sh.templ | 80 ++++++++++--------- 2 files changed, 49 insertions(+), 38 deletions(-) diff --git a/systemvm/debian/opt/cloud/bin/checkrouter.sh b/systemvm/debian/opt/cloud/bin/checkrouter.sh index 0a9041bfbd1..bb6c9f8a0f2 100755 --- a/systemvm/debian/opt/cloud/bin/checkrouter.sh +++ b/systemvm/debian/opt/cloud/bin/checkrouter.sh @@ -17,6 +17,13 @@ # under the License. STATUS=UNKNOWN + +if [ "$(systemctl is-active keepalived)" != "active" ] +then + echo "Status: FAULT" + exit +fi + ROUTER_TYPE=$(cat /etc/cloudstack/cmdline.json | grep type | awk '{print $2;}' | sed -e 's/[,\"]//g') if [ "$ROUTER_TYPE" = "router" ] then diff --git a/systemvm/debian/opt/cloud/templates/check_heartbeat.sh.templ b/systemvm/debian/opt/cloud/templates/check_heartbeat.sh.templ index 2ab9abaa8a4..62a2b180e6c 100755 --- a/systemvm/debian/opt/cloud/templates/check_heartbeat.sh.templ +++ b/systemvm/debian/opt/cloud/templates/check_heartbeat.sh.templ @@ -16,48 +16,52 @@ # specific language governing permissions and limitations # under the License. 
-ROUTER_BIN_PATH=/ramdisk/rrouter -ROUTER_LOG=${ROUTER_BIN_PATH}/keepalived.log +ROUTER_BIN_PATH="/ramdisk/rrouter" +ROUTER_LOG="${ROUTER_BIN_PATH}/keepalived.log" STRIKE_FILE="$ROUTER_BIN_PATH/keepalived.strikes" +TS_FILE="$ROUTER_BIN_PATH/keepalived.ts" +CT_FILE="$ROUTER_BIN_PATH/keepalived.ct" -if [ -e $ROUTER_BIN_PATH/keepalived.ts2 ] +checktime=$(date +%s) +hbtime=$(cat $TS_FILE) +diff=$(($checktime - $hbtime)) + +lastcheck=0 +if [ -e $CT_FILE ] then - thistime=$(cat $ROUTER_BIN_PATH/keepalived.ts) - lasttime=$(cat $ROUTER_BIN_PATH/keepalived.ts2) - diff=$(($lasttime - $thistime)) - s=0 - if [ $diff -ge 10 ] - then - if [ -e $STRIKE_FILE ] - then - s=`cat $STRIKE_FILE 2>/dev/null` - fi - s=$(($s+1)) - echo $s > $STRIKE_FILE - else - if [ -e $STRIKE_FILE ] - then - rm $STRIKE_FILE - echo keepalived.strikes file was removed! >> $ROUTER_LOG - else - echo keepalived.strikes file does not exist! >> $ROUTER_LOG - fi - fi - #3 strikes rule - if [ $s -gt 2 ] - then - echo Keepalived process is dead! >> $ROUTER_LOG - systemctl stop keepalived >> $ROUTER_LOG 2>&1 - systemctl stop conntrackd >> $ROUTER_LOG 2>&1 + lastcheck=$(cat $CT_FILE 2>/dev/null) +fi +checkdiff=$(($checktime - $lastcheck)) +if [ $checkdiff -ge 0 ] && [ $checkdiff -lt 30 ] +then + exit +fi +echo $checktime > $CT_FILE - #Set fault so we have the same effect as a KeepaliveD fault. 
- python /opt/cloud/bin/master.py --fault - - pkill -9 keepalived >> $ROUTER_LOG 2>&1 - pkill -9 conntrackd >> $ROUTER_LOG 2>&1 - echo Status: FAULT \(keepalived process is dead\) >> $ROUTER_LOG - exit +s=0 +if [ $diff -gt 10 ] +then + if [ -e $STRIKE_FILE ] + then + s=$(cat $STRIKE_FILE 2>/dev/null) fi + s=$(($s+1)) + echo $s > $STRIKE_FILE + echo "Check time: $checktime, last heartbeat time: $hbtime, time diff: $diff, strike count: $s" >> $ROUTER_LOG +else + rm -f $STRIKE_FILE fi -cp $ROUTER_BIN_PATH/keepalived.ts $ROUTER_BIN_PATH/keepalived.ts2 +if [ $s -gt 3 ] +then + systemctl stop --now keepalived >> $ROUTER_LOG 2>&1 + systemctl stop --now conntrackd >> $ROUTER_LOG 2>&1 + + #Set fault so we have the same effect as a KeepaliveD fault. + python /opt/cloud/bin/master.py --fault + + pkill -9 keepalived >> $ROUTER_LOG 2>&1 || true + pkill -9 conntrackd >> $ROUTER_LOG 2>&1 || true + echo Status: FAULT \(keepalived process is dead\) >> $ROUTER_LOG + exit +fi