From c1ed540af8e714d4ff719b1c441e571baca8bdf3 Mon Sep 17 00:00:00 2001 From: Jayapal Date: Tue, 26 Nov 2013 09:49:22 +0530 Subject: [PATCH] Monitoring python script organized into more methods --- .../debian/config/root/monitorServices.py | 157 +++++++++--------- 1 file changed, 80 insertions(+), 77 deletions(-) diff --git a/systemvm/patches/debian/config/root/monitorServices.py b/systemvm/patches/debian/config/root/monitorServices.py index 4e1b7e09716..3646c81923b 100755 --- a/systemvm/patches/debian/config/root/monitorServices.py +++ b/systemvm/patches/debian/config/root/monitorServices.py @@ -135,6 +135,49 @@ def isPidMatchPidFile(pidfile, pids): fd.close() return StatusCodes.FAILED +def checkProcessRunningStatus(process_name, pidFile): + printd("checking the process " + process_name) + cmd = '' + pids = [] + cmd = 'pidof ' + process_name + printd(cmd) + + #cmd = 'service ' + process_name + ' status' + pout = Popen(cmd, shell=True, stdout=PIPE) + exitStatus = pout.wait() + temp_out = pout.communicate()[0] + + #check there is only one pid or not + if exitStatus == 0: + pids = temp_out.split(' ') + printd("pid(s) of process %s are %s " %(process_name, pids)) + + #there is more than one process so match the pid file + #if not matched set pidFileMatched=False + printd("Checking pid file") + if isPidMatchPidFile(pidFile, pids) == StatusCodes.SUCCESS: + return True,pids; + + printd("pid of exit status %s" %exitStatus) + + return False,pids; + +def restartService(service_name): + + cmd = 'service ' + service_name + ' restart' + cout = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT) + return_val = cout.wait() + + if return_val == 0: + printd("The service " + service_name +" recovered successfully ") + msg="The process " +service_name+" is recovered successfully " + raisealert(Log.INFO,msg,service_name) + return True + else: + printd("process restart failed ....") + + return False + def checkProcessStatus( process ): @@ -152,56 +195,28 @@ def checkProcessStatus( process ): if process_name is None: printd ("\n Invalid Process Name") return StatusCodes.INVALID_INP - else: - printd("checking the process " + process_name) - cmd = 'pidof ' + process_name - printd(cmd) - #cmd = 'service ' + process_name + ' status' - pout = Popen(cmd, shell=True, stdout=PIPE) - exitStatus = pout.wait() - temp_out = pout.communicate()[0] - #check there is only one pid or not - if exitStatus == 0: - pids = temp_out.split(' ') - msg="pids: " +temp_out; - printd(msg) + status, pids = checkProcessRunningStatus(process_name, pidfile) - #there is more than one process so match the pid file - #if not matched set pidFileMatched=False - printd("Checking pid file") - if isPidMatchPidFile(pidfile, pids) == StatusCodes.SUCCESS: - pidFileMatched = True; - else: - pidFileMatched = False; - - if exitStatus == 0 and pidFileMatched == True: + if status == True: printd("The process is running ....") return StatusCodes.RUNNING else: - printd('exit status:'+str(exitStatus)) - msg="The process " + process_name +" is not running trying recover " - printd(msg) + printd("Process %s is not running trying to recover" %process_name) #Retry the process state for few seconds + for i in range(1, Config.RETRY_ITERATIONS): - pout = Popen(cmd, shell=True, stdout=PIPE) - exitStatus = pout.wait() - temp_out = pout.communicate()[0] + time.sleep(Config.SLEEP_SEC) if i < Config.RETRY_FOR_RESTART: # this is just for trying few more times - if exitStatus == 0: - pids = temp_out.split(' ') - if isPidMatchPidFile(pidfile, pids) == StatusCodes.SUCCESS: - pidFileMatched = True; - printd("pid file is matched ...") - raisealert(Log.ALERT, "The process detected as running", process_name) - break - else: - printd("pid file is not matched ...") - pidFileMatched = False; - time.sleep(Config.SLEEP_SEC) - continue + status, pids = checkProcessRunningStatus(process_name, pidfile) + if status == True: + raisealert(Log.ALERT, "The process detected as running", process_name) + break + else: + printd("Process %s is not running checking the status again..." %process_name) + continue else: msg="The process " +process_name+" is not running trying recover " raisealert(Log.INFO,process_name,msg) @@ -213,25 +228,10 @@ def checkProcessStatus( process ): printd(cmd) Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT) - cmd = 'service ' + service_name + ' restart' - - time.sleep(Config.SLEEP_SEC) - #return_val= check_call(cmd , shell=True) - - cout = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT) - return_val = cout.wait() - - if return_val == 0: - printd("The process" + process_name +" recovered successfully ") - msg="The process " +process_name+" is recovered successfully " - raisealert(Log.INFO,msg,process_name) - - break; + if restartService(service_name) == True: + break else: - #retry restarting the process for few tries - printd("process restart failing trying again ....") - restartFailed=True - time.sleep(Config.SLEEP_SEC) + restartFailed = True continue #for end here @@ -255,6 +255,7 @@ def monitProcess( processes_info ): dict_unmonit={} umonit_update={} + unMonitPs=False if not path.isfile(Config.UNMONIT_PS_FILE): printd('Unmonit File not exist') @@ -265,42 +266,48 @@ def monitProcess( processes_info ): #time for noting process down time csec = repr(time.time()).split('.')[0] - unMonitPs=False - for process,properties in processes_info.items(): #skip the process it its time stamp less than Config.MONIT_AFTER_MINS - printd ("checking the process %s \n" %process) + printd ("checking the service %s \n" %process) if not is_emtpy(dict_unmonit): if dict_unmonit.has_key(process): ts = dict_unmonit[process] - printd("Time difference=%s" %str(int(csec) - int(ts))) - tmin = (int(csec) - int(ts) )/60 - if ( int(csec) - int(ts) )/60 < Config.MONIT_AFTER_MINS: - raisealert(Log.ALERT, "The %s get monitor after %s minutes " %(process, Config.MONIT_AFTER_MINS)) - printd('process will be monitored after %s min' %(str(int(Config.MONIT_AFTER_MINS) - tmin))) - unMonitPs=True + if checkPsTimeStampForMonitor (csec, ts, properties) == False: + unMonitPs = True continue if checkProcessStatus( properties) != StatusCodes.RUNNING: - printd( "\n Process %s is not Running"%process) + printd( "\n Service %s is not Running"%process) #add this process into unmonit list - printd ("updating the process for unmonit %s\n" %process) + printd ("updating the service for unmonit %s\n" %process) umonit_update[process]=csec - #if dict is not empty write to file else delete it if not is_emtpy(umonit_update): writePsListToUnmonitFile(umonit_update) else: if is_emtpy(umonit_update) and unMonitPs == False: #delete file it is there - if path.isfile(Config.UNMONIT_PS_FILE): - printd("Removing the file %s" %Config.UNMONIT_PS_FILE) - os.remove(Config.UNMONIT_PS_FILE) + removeFile(Config.UNMONIT_PS_FILE) +def checkPsTimeStampForMonitor(csec,ts, process): + printd("Time difference=%s" %str(int(csec) - int(ts))) + tmin = (int(csec) - int(ts) )/60 + + if ( int(csec) - int(ts) )/60 < Config.MONIT_AFTER_MINS: + raisealert(Log.ALERT, "The %s get monitor after %s minutes " %(process, Config.MONIT_AFTER_MINS)) + printd('process will be monitored after %s min' %(str(int(Config.MONIT_AFTER_MINS) - tmin))) + return False + + return True + +def removeFile(fileName): + if path.isfile(fileName): + printd("Removing the file %s" %fileName) + os.remove(fileName) def loadPsFromUnMonitFile(): @@ -358,18 +365,14 @@ def main(): ''' Step1 : Get Config ''' - printd("monitoring started") temp_dict = getConfig() - ''' Step2: Monitor and Raise Alert ''' - #raisealert(Log.INFO, 'Monit started') monitProcess( temp_dict ) - if __name__ == "__main__": main()