Whenever a service crashes on the Gateway, a core file is generated. As soon as possible after a core file is generated, retrieve a diagnostic bundle for the Gateway from the Orchestrator, so that you can download the core file and provide the associated logs to VMware Support.
The following example illustrates a Python script to check for recent core files:
#! /usr/bin/env python
"""Nagios plugin that reports new core files found in /velocloud/core.

This is a Python 2 script (it relies on the ``commands`` module) and it uses
pynag's ``PluginHelper`` to report the check result.

State is kept between runs in files under /tmp:

* ``/tmp/crashlist.txt``         - core archives that were already reported.
* ``/tmp/coredump_status_file``  - "1" while a new-core CRITICAL is being
  repeated; reset to "0" once the file is at least 300 seconds old.
* ``/tmp/coredump_message``      - text of the last new-core alert.
* ``/tmp/warning_file``          - "1" while the "generate a diag bundle"
  reminder is pending.
* ``/tmp/coredump_start_time``   - time the reminder started; the reminder is
  dropped after 10800 seconds or once a diag upload shows up in mgd.log.
"""
import subprocess
import traceback
import os
import os.path
import glob
import datetime
import time
import sys
import re
import commands
import json
from subprocess import Popen, PIPE

from pynag.Plugins import PluginHelper, ok, warning, critical, unknown

helper = PluginHelper()
helper.parse_arguments()

# Core archives older than this are deleted from /velocloud/core.
CORE_RETENTION = datetime.timedelta(hours=42 * 24)
# Seconds the stored CRITICAL keeps being repeated before the flag is reset.
CRITICAL_HOLD_SECONDS = 300
# Seconds after which the diag-bundle reminder is dropped on its own.
REMINDER_TIMEOUT_SECONDS = 10800


def diag_check():
    """Return True if /var/log/mgd.log contains a diag bundle upload entry."""
    # Match any 20xx year rather than a single decade, in both the grep
    # pre-filter and the regular expression.
    upload_re = re.compile(r"^.*\s+Uploading diag-20[0-9][0-9]-.*")
    cmd = 'grep "Uploading diag-20[0-9][0-9]" /var/log/mgd.log'
    p1 = subprocess.Popen([cmd], stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE, shell=True)
    stdout_value, _ = p1.communicate()
    return upload_re.search(stdout_value) is not None


def vco_vcg_version():
    """Return ``(gateway_name, build_number, primary_orchestrator)``.

    The name and Orchestrator come from /opt/vc/.gateway.info; the build
    number is the third whitespace-separated field of the "rev" line printed
    by ``gwd -v``.  If that command fails or prints something unexpected the
    build number is reported as "unknown" instead of raising, so the alert
    for an already-recorded core is not lost.
    """
    with open("/opt/vc/.gateway.info") as data:
        d = json.loads(data.read())
    vcg = d["gatewayInfo"]["name"]
    vco = d["configuration"]["managementPlane"]["data"]["managementPlaneProxy"]["primary"]

    build_number = "unknown"
    status, output = commands.getstatusoutput(
        "sudo /opt/vc/sbin/gwd -v 2>&1 | grep rev")
    fields = output.split()
    if status == 0 and len(fields) > 2:
        build_number = fields[2]
    return vcg, build_number, vco


def core_backtraces(core_path):
    """Extract the *.txt members of ``core_path`` into /tmp and return the
    "Thread 1" backtrace frames found in them, one block per member, each
    prefixed with a newline.
    """
    tar = subprocess.Popen(
        ["tar", "-xvf", core_path, "-C", "/tmp",
         "--wildcards", "--no-anchored", "*.txt"],
        stdout=subprocess.PIPE)
    # communicate() drains the pipe while waiting; wait() followed by a read
    # can block forever if the listing does not fit in the pipe buffer.
    listing, _ = tar.communicate()

    frames = ""
    for member in listing.splitlines():
        btcmd = ("awk '/^Thread 1 /,/^----/' /tmp/" + member +
                 " | egrep '^#' | sed 's/ 0x0.* in //' | sed 's/ (.*/ /'")
        bt = subprocess.Popen(btcmd, shell=True, stdout=subprocess.PIPE)
        frames += '\n' + bt.communicate()[0]
    return frames


status_file = "/tmp/coredump_status_file"
warning_file = "/tmp/warning_file"

# Make sure the two flag files exist.  /tmp/crashlist.txt is deliberately NOT
# pre-created here: the seeding step further down only runs while that file
# is absent, and it is what keeps pre-existing cores from being reported as
# new on the first run.
if not os.path.isfile(status_file) and not os.access(status_file, os.R_OK):
    os.system("touch /tmp/coredump_status_file")
    os.system("chown nagios:nagios /tmp/coredump_status_file")
if not os.path.isfile(warning_file) and not os.access(warning_file, os.R_OK):
    os.system("touch /tmp/warning_file")
    os.system("chown nagios:nagios /tmp/warning_file")

command = "cat /tmp/coredump_status_file"
command1 = "cat /tmp/warning_file"

# Hand root-owned state files over to the nagios user.
files = ["crashlist.txt", "warning_file", "coredump_status_file", "coredump_message"]
for item in files:
    if os.path.isfile("/tmp/" + item):
        st = os.stat("/tmp/" + item)
        if st.st_uid == 0:
            commands.getstatusoutput("sudo chown nagios:nagios /tmp/" + item)

# A new-core alert is still being held: repeat the stored message as CRITICAL.
status, output = commands.getstatusoutput(command)
if output == "1":
    os.system("chown nagios:nagios /tmp/coredump_message")
    with open("/tmp/coredump_message", "r") as data:
        status_message = data.read()
    mtime = os.path.getmtime("/tmp/coredump_status_file")
    cur_time = time.time()
    if int(cur_time) - int(mtime) >= CRITICAL_HOLD_SECONDS:
        os.system('echo -n "0" > /tmp/coredump_status_file')
    helper.status(critical)
    helper.add_summary(status_message)
    helper.exit()
    sys.exit(0)

status_message = ""
newcore = 0
try:
    crashlistpath = '/tmp/crashlist.txt'
    if not os.path.isfile(crashlistpath) and not os.access(crashlistpath, os.R_OK):
        # First run: seed the list with the cores that already exist.  The
        # pattern is quoted so the shell passes it to find unexpanded.
        os.system("find /velocloud/core/ -name '*core.tgz' > /tmp/crashlist.txt")
    with open(crashlistpath, "a+") as f:
        # The initial read position of an "a+" file is platform dependent;
        # rewind explicitly, then return to the end before appending.
        f.seek(0)
        oldcrashlist = f.read()
        f.seek(0, 2)
        corelist = glob.glob("/velocloud/core/*core.tgz")
        corecount = len(corelist)
        if corecount > 0:
            for core_path in corelist:
                file_modified = datetime.datetime.fromtimestamp(
                    os.path.getmtime(core_path))
                expired = datetime.datetime.now() - file_modified > CORE_RETENTION
                if core_path not in oldcrashlist:
                    newcore += 1
                    status_message += ('\n' + "Core:" + str(newcore) + " " +
                                       core_path.rsplit('/', 1)[1] + " ")
                    f.write(core_path + '\n')
                    status_message += core_backtraces(core_path)
                # Remove expired archives only after they were processed, so
                # the backtrace extraction above never runs on a deleted file.
                if expired:
                    os.remove(core_path)
        else:
            helper.status(ok)
            status_message = "No Core file"
except Exception as e:
    traceback.print_exc()
    helper.exit(summary="Nagios check could not complete", long_output=str(e),
                exit_code=unknown, perfdata='')

if corecount and not newcore:
    helper.status(ok)
    status_message = str(corecount) + " old core file found in /velocloud/core"
    os.system('echo -n "0" > /tmp/coredump_status_file')
elif newcore > 0:
    version_info = vco_vcg_version()
    vcg_data = "%s; VCG_Build_Number:%s; VCO:%s\n" % version_info
    status_message = vcg_data + str(newcore) + " New Core\n" + status_message
    # Persist the message first, then raise both flags, so later runs can
    # repeat the alert and show the diag-bundle reminder.
    with open("/tmp/coredump_message", "w") as data:
        data.write(status_message)
    os.system('echo -n "1" > /tmp/warning_file')
    os.system('echo -n "1" > /tmp/coredump_status_file')
    helper.status(critical)
    helper.add_summary(status_message)
    helper.exit()
    sys.exit(0)

# Diag-bundle reminder: stays active until an upload is seen in mgd.log or
# the reminder timeout has elapsed.
status, output_warn = commands.getstatusoutput(command1)
if output_warn == "1":
    helper.status(warning)
    status_message = "Please generate gateway diag bundle from the VCO if required"
    result = diag_check()
    if not result:
        if not os.path.isfile("/tmp/coredump_start_time"):
            os.system("touch /tmp/coredump_start_time")
            os.system("chown nagios:nagios /tmp/coredump_start_time")
            start_time = time.time()
            with open("/tmp/coredump_start_time", "w") as data:
                data.write(str(start_time))
        end_time = time.time()
        cmd = "cat /tmp/coredump_start_time"
        status, start_time = commands.getstatusoutput(cmd)
        total_time = end_time - float(start_time)
        if total_time > REMINDER_TIMEOUT_SECONDS:
            result = True
    if result:
        os.system('echo -n "0" > /tmp/warning_file')
        # The timestamp file only exists if an earlier run started the timer;
        # it is absent when the upload is found on the very first pass.
        if os.path.isfile("/tmp/coredump_start_time"):
            os.remove("/tmp/coredump_start_time")
        helper.status(warning)
        status_message = "Please generate the diagbundle for the last crash. if it is taken already, please ignore this message"

helper.add_summary(status_message)
helper.exit()