Überwachen von Core-Dateien

Immer wenn ein Dienst auf dem Gateway abstürzt, wird eine Core-Datei (Speicherabbild) erzeugt. Die vom Orchestrator generierten Diagnosepakete sollten so bald wie möglich nach der Erzeugung einer Core-Datei abgerufen werden, um die Core-Datei herunterzuladen und die zugehörigen Protokolle dem VMware-Support zur Verfügung zu stellen.
Das folgende Beispiel zeigt ein Python-Skript, das nach neuen Core-Dateien sucht:
#! /usr/bin/env python
import subprocess, traceback, os, os.path,glob,datetime,time,sys,re
from pynag.Plugins import PluginHelper,ok,warning,critical,unknown
from subprocess import Popen, PIPE
import time
import os
import commands
import json
# Shared pynag PluginHelper instance; the rest of the script reports its
# Nagios status, summary text and exit through this object.
helper = PluginHelper()
# Presumably parses the plugin's command-line options -- confirm against pynag docs.
helper.parse_arguments()

def diag_check(log_path="/var/log/mgd.log"):
    """Report whether the management log records a diag bundle upload.

    A line counts as an upload record when it contains whitespace followed by
    ``Uploading diag-<four-digit year>-``.

    Args:
        log_path: Log file to scan. Defaults to ``/var/log/mgd.log``.

    Returns:
        True if at least one line records an upload, False otherwise
        (including when the log file is missing or unreadable).
    """
    # Any 20xx year is accepted: the previous pattern was pinned to 201[0-9]
    # and therefore stopped matching bundles uploaded from 2020 onwards.
    upload_re = re.compile(br"\s+Uploading diag-20[0-9]{2}-")
    try:
        # Binary mode avoids decode errors on stray bytes in the log and
        # behaves the same on Python 2 and Python 3.
        with open(log_path, "rb") as log:
            # Every line is checked; the old "^" anchor without MULTILINE
            # only ever looked at the first grep hit.
            return any(upload_re.search(entry) for entry in log)
    except (IOError, OSError):
        # An unreadable log is treated as "no upload seen", matching the
        # empty grep output the shell pipeline used to produce.
        return False

def vco_vcg_version(info_path="/opt/vc/.gateway.info"):
    """Return the gateway name, its build number and the primary VCO address.

    The name and VCO address are read from the JSON document at *info_path*.
    The build number is taken from the third whitespace-separated field of
    the "rev" line printed by ``gwd -v``.

    Args:
        info_path: Gateway info JSON file. Defaults to
            ``/opt/vc/.gateway.info``.

    Returns:
        A ``(vcg, build_number, vco)`` tuple.

    Raises:
        IOError/OSError: if *info_path* cannot be opened.
        ValueError: if *info_path* does not contain valid JSON.
        KeyError: if an expected key is missing from the JSON document.
    """
    with open(info_path) as data:
        d = json.load(data)
    gateway_info = d["gatewayInfo"]
    vcg = gateway_info["name"]

    # Fallback used when gwd cannot be queried. Previously build_number was
    # left unbound in that case and the return statement raised
    # UnboundLocalError.
    build_number = gateway_info.get("buildNumber", "unknown")

    # subprocess replaces the Python-2-only "commands" module; stderr is
    # merged into stdout just like commands.getstatusoutput did.
    proc = subprocess.Popen(
        "sudo /opt/vc/sbin/gwd -v 2>&1 | grep rev",
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    fields = proc.communicate()[0].split()
    # Guard the index so an unexpectedly short "rev" line keeps the fallback
    # instead of raising IndexError.
    if proc.returncode == 0 and len(fields) > 2:
        build_number = fields[2]

    vco = d["configuration"]["managementPlane"]["data"]["managementPlaneProxy"]["primary"]
    return vcg, build_number, vco

# Flag files shared between runs of this check. Both hold "0" or "1".
status_file = "/tmp/coredump_status_file"
warning_file = "/tmp/warning_file"

# Create the critical-state flag file on first use and hand it to nagios.
if not os.path.isfile(status_file) and not os.access(status_file, os.R_OK):
    os.system("touch /tmp/coredump_status_file")
    os.system("chown nagios:nagios /tmp/coredump_status_file")

# Create the warning flag file on first use. The readability test now looks
# at warning_file itself; it used to test status_file, which was created just
# above, so a missing warning_file was never created here.
if not os.path.isfile(warning_file) and not os.access(warning_file, os.R_OK):
    os.system("touch /tmp/warning_file")
    os.system("chown nagios:nagios /tmp/warning_file")

# NOTE(review): this condition tests warning_file/status_file rather than
# /tmp/crashlist.txt and is deliberately left as it was. Creating an empty
# crash list here would pre-empt the "find" seeding done by the core scan
# further down, so every pre-existing core would be reported as new on the
# first run. Confirm the intended first-run behaviour before changing it.
if not os.path.isfile(warning_file) and not os.access(status_file, os.R_OK):
    os.system("touch /tmp/crashlist.txt")
    os.system("chown nagios:nagios /tmp/crashlist.txt")

# Shell commands used further down to read the two flag files.
command = "cat /tmp/coredump_status_file"
command1 = "cat /tmp/warning_file"

# Hand any root-owned state file back to the nagios user so later runs
# executed as nagios can still rewrite it.
files = ["crashlist.txt","warning_file","coredump_status_file","coredump_message"]
for item in files:
    state_path = "/tmp/" + item
    if not os.path.isfile(state_path):
        continue
    if os.stat(state_path).st_uid == 0:
        commands.getstatusoutput("sudo chown nagios:nagios " + state_path)

# Replay phase: while the status flag holds "1", re-emit the stored crash
# report as CRITICAL instead of scanning again.
status, output = commands.getstatusoutput(command)
if output == "1":
    os.system("chown nagios:nagios /tmp/coredump_message")
    with open("/tmp/coredump_message", "r") as data:
        status_message = data.read()
    mtime = os.path.getmtime("/tmp/coredump_status_file")
    cur_time = time.time()
    flag_age = int(cur_time) - int(mtime)
    # Once the flag is at least 300 seconds old, clear it so the next run
    # goes back to scanning.
    if flag_age >= 300:
        os.system('echo -n "0" > /tmp/coredump_status_file')
    helper.status(critical)
    helper.add_summary(status_message)
    helper.exit()
    sys.exit(0)

# Scan /velocloud/core for core archives: delete expired ones and, for every
# archive not yet recorded in /tmp/crashlist.txt, record it and append its
# "Thread 1" backtrace to the status message.
status_message = ""
newcore = 0
try:
    crashlistpath = '/tmp/crashlist.txt'
    if not os.path.isfile(crashlistpath) and not os.access(crashlistpath, os.R_OK):
        # No crash list yet: seed it with the archives already on disk so
        # they are not reported as new. The pattern is quoted so the shell
        # passes it to find instead of expanding it in the current directory.
        os.system("find /velocloud/core/ -name '*core.tgz' > /tmp/crashlist.txt")

    with open(crashlistpath, "a+") as f:
        # "a+" may leave the read position at end of file; rewind so the
        # recorded entries are actually read.
        f.seek(0)
        # Exact per-line membership: a substring test on the whole file text
        # would treat a path as known when it merely appears inside a longer one.
        oldcrashlist = set(f.read().splitlines())
        corelist = glob.glob("/velocloud/core/*core.tgz")
        corecount = len(corelist)
        if corecount > 0:
            max_age = datetime.timedelta(hours=42*24)
            for line in corelist:
                file_modified = datetime.datetime.fromtimestamp(os.path.getmtime(line))
                if datetime.datetime.now() - file_modified > max_age:
                    os.remove(line)
                    # The archive is gone; do not try to unpack or report it.
                    continue
                if line in oldcrashlist:
                    continue
                newcore += 1
                status_message += '\n' + "Core:" + str(newcore) + " " + line.rsplit('/', 1)[1] + " "
                f.write(line + '\n')
                # Extract the *.txt members; tar -v prints one extracted name per line.
                cmd1 = "tar -xvf " + line + " -C /tmp  --wildcards --no-anchored '*.txt' "
                crash = subprocess.Popen(cmd1, shell=True, stdout=subprocess.PIPE,
                                         universal_newlines=True)
                # communicate() drains the pipe while waiting; wait() followed
                # by a read can block forever once the pipe buffer fills.
                extracted = crash.communicate()[0]
                for line1 in extracted.splitlines():
                    # Keep only the frame lines of "Thread 1", stripped of
                    # addresses and argument lists.
                    btcmd = "awk '/^Thread 1 /,/^----/' /tmp/" + line1 + " | egrep '^#' | sed 's/ 0x0.* in //' | sed 's/ (.*/ /'"
                    bt = subprocess.Popen(btcmd, shell=True, stdout=subprocess.PIPE,
                                          universal_newlines=True)
                    status_message += '\n' + bt.communicate()[0]
        else:
            helper.status(ok)
            status_message = "No Core file"

except Exception as e:
    # Any failure during the scan is reported to Nagios as UNKNOWN.
    traceback.print_exc()
    helper.exit(summary="Nagios check could not complete", long_output=str(e), exit_code=unknown, perfdata='')

# Turn the scan result into a Nagios state.
if newcore > 0:
    # New cores: store the report, raise both flags and exit CRITICAL.
    output = vco_vcg_version()
    header = "%s;    VCG_Build_Number:%s;     VCO:%s\n" % output
    status_message = header + str(newcore) + " New Core\n" + status_message
    with open("/tmp/coredump_message", "w") as data:
        data.write(status_message)
    os.system('echo -n "1" > /tmp/warning_file')
    os.system('echo -n "1" > /tmp/coredump_status_file')
    helper.status(critical)
    helper.add_summary(status_message)
    helper.exit()
    sys.exit(0)

elif corecount:
    # Only already-recorded cores are present: OK, and clear the status flag.
    helper.status(ok)
    status_message = str(corecount) + " old core file found in /velocloud/core"
    os.system('echo -n "0" > /tmp/coredump_status_file')

# Reminder phase: while the warning flag holds "1", keep returning WARNING
# until a diag bundle upload shows up in the log or 10800 seconds (3 hours)
# have passed since the reminder started.
status, output_warn = commands.getstatusoutput(command1)
if output_warn == "1":
    helper.status(warning)
    status_message = "Please generate gateway diag bundle from the VCO if required"
    result = diag_check()
    if not result:
        # No upload logged yet: start the reminder timer on first pass.
        if not os.path.isfile("/tmp/coredump_start_time"):
            os.system("touch /tmp/coredump_start_time")
            os.system("chown nagios:nagios /tmp/coredump_start_time")
            start_time = time.time()
            with open("/tmp/coredump_start_time", "w") as data:
                data.write(str(start_time))
        end_time = time.time()
        cmd = "cat /tmp/coredump_start_time"
        status, start_time = commands.getstatusoutput(cmd)
        total_time = end_time - float(start_time)
        if total_time > 10800:
            result = True
    if result:
        os.system('echo -n "0" > /tmp/warning_file')
        # The timer file only exists if an earlier run found no upload; when
        # the upload is already logged on the first pass there is nothing to
        # delete, and an unconditional os.remove raised OSError.
        if os.path.isfile("/tmp/coredump_start_time"):
            os.remove("/tmp/coredump_start_time")
        helper.status(warning)
        status_message = "Please generate the diagbundle for the last crash. if it is taken already, please ignore this message"

helper.add_summary(status_message)
helper.exit()