Whenever a service crashes on the Gateway, a core file is generated. As soon as possible after a core file appears, generate a diagnostic bundle from the Orchestrator and retrieve it, so that the core file can be downloaded and the associated logs can be provided to VMware Support.

The following example is a Python 2 script, written as a Nagios plugin, that checks for recent core files:

#! /usr/bin/env python
import subprocess, traceback, os, os.path,glob,datetime,time,sys,re
from pynag.Plugins import PluginHelper,ok,warning,critical,unknown
from subprocess import Popen, PIPE
import time
import os
import commands
import json
# Build the pynag plugin helper used for all status reporting below, and let
# it parse the command-line arguments passed to this check.
helper = PluginHelper()
helper.parse_arguments()

def diag_check(log_path="/var/log/mgd.log"):
    """Report whether the management log records a diagnostic-bundle upload.

    Scans every line of *log_path* for ``Uploading diag-<year>-`` preceded by
    whitespace, where ``<year>`` is any four-digit year starting with ``20``.

    The previous implementation only accepted the years 2010-2019, so uploads
    from 2020 onwards were never detected.  It also tested only the first line
    returned by ``grep``, because its ``^``-anchored pattern was searched
    without ``re.MULTILINE``.

    Args:
        log_path: Log file to scan; defaults to ``/var/log/mgd.log``.

    Returns:
        True if a matching line is found.  False otherwise, including when the
        log file is missing or unreadable (the same outcome as the old
        ``grep`` producing no output).
    """
    # Bytes pattern + binary read: identical behaviour on Python 2 and 3, and
    # no decode errors on stray non-text bytes in the log.
    upload_re = re.compile(br"\sUploading diag-20[0-9]{2}-")
    try:
        with open(log_path, "rb") as log:
            return any(upload_re.search(line) for line in log)
    except (IOError, OSError):
        return False

def vco_vcg_version(info_path="/opt/vc/.gateway.info",
                    version_cmd="sudo /opt/vc/sbin/gwd -v 2>&1 | grep rev"):
    """Return ``(gateway name, build number, primary Orchestrator)``.

    The gateway name and the primary management-plane proxy are read from the
    JSON document at *info_path*.  The build number is the third
    whitespace-separated field printed by *version_cmd*.

    Previously a failing version command left ``build_number`` unassigned and
    the ``return`` raised ``UnboundLocalError``.  Now the build number falls
    back to ``gatewayInfo.buildNumber`` from the JSON document if present,
    otherwise to the string ``"unknown"``.

    Args:
        info_path: Path of the gateway info JSON file.
        version_cmd: Shell command whose output carries the build number.

    Returns:
        A 3-tuple ``(vcg, build_number, vco)``.

    Raises:
        IOError/OSError: if *info_path* cannot be opened.
        ValueError: if *info_path* does not contain valid JSON.
        KeyError: if the expected name / proxy keys are missing.
    """
    with open(info_path) as data:
        info = json.load(data)
    gateway = info["gatewayInfo"]
    vcg = gateway["name"]
    vco = info["configuration"]["managementPlane"]["data"]["managementPlaneProxy"]["primary"]

    # Default used whenever the version command fails or prints too little.
    build_number = gateway.get("buildNumber", "unknown")

    # stderr is merged into stdout, matching commands.getstatusoutput().
    proc = subprocess.Popen(version_cmd, shell=True, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT, universal_newlines=True)
    fields = proc.communicate()[0].split()
    if proc.returncode == 0 and len(fields) > 2:
        build_number = fields[2]
    return vcg, build_number, vco

# Flag files that carry state between runs of this check.
status_file = "/tmp/coredump_status_file"
warning_file = "/tmp/warning_file"

# Create each missing flag file and hand it to the nagios user.  Every file is
# tested on its own: the earlier code guarded the warning-file creation with a
# readability test on the *status* file, which had just been created, so the
# warning file was never set up here.
#
# /tmp/crashlist.txt is deliberately NOT pre-created.  The core scan further
# down seeds it with the cores already on disk when the file is missing; an
# empty file created here would skip that baseline and report every existing
# core as new.
for state_file in (status_file, warning_file):
    if not os.path.isfile(state_file):
        os.system("touch " + state_file)
        os.system("chown nagios:nagios " + state_file)

# Shell commands used further down to read the two flag files.
command = "cat /tmp/coredump_status_file"
command1 = "cat /tmp/warning_file"

# Any state file that is currently owned by root (uid 0) is handed back to the
# nagios user via sudo.
files = ["crashlist.txt","warning_file","coredump_status_file","coredump_message"]
for state_name in files:
    state_path = "/tmp/" + state_name
    if not os.path.isfile(state_path):
        continue
    if os.stat(state_path).st_uid == 0:
        commands.getstatusoutput("sudo chown nagios:nagios " + state_path)

# A status flag of "1" means an earlier run found a new core: keep reporting
# CRITICAL with the message that run saved, without rescanning.
status, output = commands.getstatusoutput(command)
if output == "1":
    os.system("chown nagios:nagios /tmp/coredump_message")
    with open("/tmp/coredump_message", "r") as data:
        status_message = data.read()

    # Once the flag file is at least 300 seconds old, reset it to "0" so that
    # the next run goes through the normal scan again.
    flag_mtime = int(os.path.getmtime("/tmp/coredump_status_file"))
    now = int(time.time())
    if now - flag_mtime >= 300:
        os.system('echo -n "0" > /tmp/coredump_status_file')

    helper.status(critical)
    helper.add_summary(status_message)
    helper.exit()
    sys.exit(0)

# Scan /velocloud/core for core archives, compare them with the list of cores
# already reported (/tmp/crashlist.txt) and build the report text.
# Leaves `corecount` (archives found) and `newcore` (archives not yet
# reported) set for the reporting code that follows.
status_message = ""
newcore = 0
try:
    crashlistpath = '/tmp/crashlist.txt'
    if not os.path.isfile(crashlistpath):
        # First run: record the cores already on disk as known.  The glob is
        # quoted so that the shell hands it to find instead of expanding it
        # against the current directory.
        os.system("find /velocloud/core/ -name '*core.tgz' > /tmp/crashlist.txt")

    with open(crashlistpath, "a+") as f:
        # "a+" may open positioned at end-of-file; rewind before reading.
        f.seek(0)
        oldcrashlist = set(entry.strip() for entry in f.read().splitlines())
        corelist = glob.glob("/velocloud/core/*core.tgz")
        corecount = len(corelist)
        if corecount > 0:
            for line in corelist:
                file_modified = datetime.datetime.fromtimestamp(os.path.getmtime(line))
                if datetime.datetime.now() - file_modified > datetime.timedelta(days=42):
                    # Expired core: delete it and move on.  It no longer
                    # exists, so it must not be unpacked or reported as new.
                    os.remove(line)
                    continue
                if line in oldcrashlist:
                    continue

                newcore += 1
                status_message += '\n' + "Core:" + str(newcore) + " " + line.rsplit('/', 1)[1] + " "
                f.write(line + '\n')

                # Extract the *.txt members into /tmp; tar -v prints one
                # extracted member name per line.  communicate() drains the
                # pipe while waiting, so a long listing cannot block tar.
                crash = subprocess.Popen(
                    ["tar", "-xvf", line, "-C", "/tmp",
                     "--wildcards", "--no-anchored", "*.txt"],
                    stdout=subprocess.PIPE, universal_newlines=True)
                extracted = crash.communicate()[0]
                for line1 in extracted.splitlines():
                    if not line1:
                        continue
                    # Keep only the frame lines of "Thread 1" and trim the
                    # addresses and argument lists from each frame.
                    btcmd = "awk '/^Thread 1 /,/^----/' /tmp/" + line1 + " | egrep '^#' | sed 's/ 0x0.* in //' | sed 's/ (.*/ /'"
                    bt = subprocess.Popen(btcmd, shell=True, stdout=subprocess.PIPE,
                                          universal_newlines=True)
                    status_message += '\n' + bt.communicate()[0]
        else:
            helper.status(ok)
            status_message = "No Core file"

except Exception as e:
    # Any failure during the scan is reported to Nagios as UNKNOWN.
    traceback.print_exc()
    helper.exit(summary="Nagios check could not complete", long_output=str(e), exit_code=unknown, perfdata='')

# Turn the scan result into a Nagios status.
if newcore > 0:
    # New cores: prefix the report with the gateway identity, save it for the
    # following runs, raise both flag files and exit CRITICAL.
    vcg_data = "%s;    VCG_Build_Number:%s;     VCO:%s\n" % vco_vcg_version()
    status_message = vcg_data + str(newcore) + " New Core\n" + status_message
    with open("/tmp/coredump_message", "w") as data:
        data.write(status_message)
    for flag_cmd in ('echo -n "1" > /tmp/warning_file',
                     'echo -n "1" > /tmp/coredump_status_file'):
        os.system(flag_cmd)
    helper.status(critical)
    helper.add_summary(status_message)
    helper.exit()
    sys.exit(0)

elif corecount:
    # Only cores that were already reported: OK, and clear the status flag.
    helper.status(ok)
    status_message = str(corecount) + " old core file found in /velocloud/core"
    os.system('echo -n "0" > /tmp/coredump_status_file')

# Diag-bundle reminder: while the warning flag is "1", report WARNING until a
# diag upload is seen in the log or 10800 seconds (3 hours) have passed since
# the reminder started.
status,output_warn = commands.getstatusoutput(command1)
if output_warn == "1":
    helper.status(warning)
    status_message = "Please generate gateway diag bundle from the VCO if required"
    start_time_file = "/tmp/coredump_start_time"
    result = diag_check()
    if not result:
        now = time.time()
        start_time = None
        if os.path.isfile(start_time_file):
            with open(start_time_file) as data:
                try:
                    start_time = float(data.read())
                except ValueError:
                    # Empty or corrupt timestamp: restart the timer below
                    # instead of crashing the check.
                    start_time = None
        if start_time is None:
            os.system("touch /tmp/coredump_start_time")
            os.system("chown nagios:nagios /tmp/coredump_start_time")
            with open(start_time_file, "w") as data:
                data.write(str(now))
            start_time = now
        if now - start_time > 10800:
            result = True
    if result:
        os.system('echo -n "0" > /tmp/warning_file')
        # The timer file only exists if an earlier run started the timer;
        # removing it unconditionally raised OSError when the upload was
        # already present on the first pass.
        if os.path.isfile(start_time_file):
            os.remove(start_time_file)
        helper.status(warning)
        status_message = "Please generate the diagbundle for the last crash. if it is taken already, please ignore this message"

helper.add_summary(status_message)
helper.exit()