Skip to content

Instantly share code, notes, and snippets.

@kanaka
Created July 7, 2017 20:03
Show Gist options
  • Save kanaka/4e95f59e68a7aa07d112fa14736252fe to your computer and use it in GitHub Desktop.
Save kanaka/4e95f59e68a7aa07d112fa14736252fe to your computer and use it in GitHub Desktop.
scbootmon
global glob
import glob, re
class dstat_scboot(dstat):
    """dstat plugin that tallies scboot boot-mark lines found in a log file.

    Configuration is read from environment variables in check():
    SCv_boot_log_path (log file to follow), SCv_bootmark (marker prefix),
    scboot_inst (instance name placed in brackets after the marker) and
    population (node count used by the exit condition; negative disables it).

    The code is written to run unchanged on Python 2 and Python 3.
    NOTE(review): `dstat`, `os` and `op` are not defined in this file; they
    are presumably supplied by the dstat process that loads the plugin --
    confirm against the dstat version in use.
    """

    def __init__(self):
        self.name = 'scboot monitor'
        self.format = ('d', 6, 100)
        # The 'logins' counter is tracked below but intentionally not listed
        # here, so it is not one of the displayed columns.
        self.vars = ('count', 'kernel', 'fabric', 'preinit', 'slurm')
        self.nick = ('secs', 'kernel', 'fabric', 'initfs', 'slurm')
        self.partition = ""
        self.count = 0
        self.kernel = 0
        self.fabric = 0
        self.preinit = 0
        self.slurm = 0
        self.logins = 0
        self.finished = False
        self.init(self.vars, 1)

    def showend(self, totlist, vislist):
        """Return the base class line ending, plus a completion notice once
        the exit condition in extract() has been met."""
        ret = dstat.showend(self, totlist, vislist)
        if self.finished:
            ret = ret + "\nNode boot complete\n"
        return ret

    def getenv(self, var):
        """Return the value of environment variable `var`.

        Raises Exception if the variable is not set.
        """
        try:
            return os.environ[var]
        except KeyError:
            # Call syntax (not `raise Exception, msg`) parses on Python 2 and 3.
            raise Exception('%s env variable must be set' % var)

    def check(self):
        """Read the configuration from the environment and open the log.

        Returns True on success. Raises Exception when a required variable
        is missing or `population` is not an integer.
        """
        # Get parameters from the environment
        self.log_path = self.getenv("SCv_boot_log_path")
        self.boot_inst = self.getenv("scboot_inst")
        population = self.getenv("population")
        if population:
            try:
                self.population = int(population)
            except ValueError:
                raise Exception('population env variable must be an integer')
        else:
            # An empty value disables the exit condition.
            self.population = -1

        # An instance name without a '-' is treated as partial.
        inst_parts = self.boot_inst.split("-")
        self.bm_partial = len(inst_parts) < 2
        self.partition = inst_parts[0]

        # Raw string keeps the regex escapes literal on every Python version.
        self.bm_patt = self.getenv("SCv_bootmark") + r" \[%s\]" % self.boot_inst

        self.logfile = open(self.log_path, 'r', 1)  # Line level buffering
        # Seek back a reasonable amount from the end of file
        size = os.stat(self.log_path).st_size
        seekto = max(size - 800000, 0)
        self.logfile.seek(seekto, 0)
        return True

    def extract(self):
        """Consume new log lines, refresh the counters and publish them.

        Also queries `sinfo` for the idle node count of the partition and,
        once the exit condition holds, marks the run finished.
        """
        # Drain every line that has been appended since the last call.
        while True:
            line = self.logfile.readline()
            if not line:
                break
            if re.search(self.bm_patt, line):
                if "begin preinit" in line:
                    self.kernel += 1
                elif "end fabricd" in line:
                    self.fabric += 1
                elif "end preinit" in line:
                    self.preinit += 1
                elif "end postinit" in line:
                    self.logins += 1

        # Best effort: any failure to run or parse sinfo is reported as -1.
        # `except Exception` (not bare except) lets KeyboardInterrupt through.
        try:
            sinfo = os.popen("sinfo 2>/dev/null | grep '^%s[^-].*idle ' " \
                             "| awk '{print $4}' 2>/dev/null"
                             % self.partition)
            try:
                idle = 0
                for field in sinfo.readlines():
                    idle += int(field)
            finally:
                # Close the pipe so each call does not leak a handle.
                sinfo.close()
            self.slurm = idle
        except Exception:
            self.slurm = -1  # Indicate a failure to read

        self.count += 1
        self.val['count'] = self.count
        self.val['kernel'] = self.kernel
        self.val['fabric'] = self.fabric
        self.val['preinit'] = self.preinit
        self.val['slurm'] = self.slurm
        self.val['logins'] = self.logins

        # Exit condition
        #if self.population >= 0 and self.logins >= self.population:
        if self.population >= 0 and \
           (self.bm_partial or self.preinit >= self.population) and \
           self.slurm >= self.population:
            self.finished = True
            # NOTE(review): presumably zeroing dstat's remaining update count
            # makes the host stop after this sample -- confirm against dstat.
            op.count = 0
# vim:ts=4:sw=4
#!/usr/bin/python
import os, re, select, sys, time
def getenv(var):
    """Return the value of environment variable `var`.

    If the variable is not set, write a message to stderr and exit the
    process with status 1.
    """
    try:
        return os.environ[var]
    except KeyError:
        # sys.stderr.write works on both Python 2 and Python 3, unlike the
        # `print >> sys.stderr` statement.
        sys.stderr.write("%s not set in environment\n" % var)
        sys.exit(1)
# Get parameters from the environment
log_path = getenv("SCv_boot_log_path")
boot_inst = getenv("scboot_inst")
# Raw string keeps the regex escapes literal on every Python version.
bm_patt = getenv("SCv_bootmark") + r" \[%s\]" % boot_inst

logfile = open(log_path, 'r', 1)  # Line level buffering
logfile.seek(0, 2)  # The end of the file

keystroke = select.poll()
keystroke.register(sys.stdin)

# Terminal control sequences, named as in the original script.
term_CLEAR_LINE = "\r\33[0K"
term_ECHO_ON = "\33[12h"
term_ECHO_OFF = "\33[12l"

sys.stdout.write(term_ECHO_OFF)
sys.stdout.flush()

count = 0
try:
    while True:
        line = logfile.readline()
        if re.search(bm_patt, line):
            count += 1
            # Overwrite the current status line with the newest match,
            # dropping the trailing newline so the cursor stays on the line.
            sys.stdout.write(term_CLEAR_LINE)
            sys.stdout.write("(%d) " % count)
            sys.stdout.write(line[:-1])
            sys.stdout.flush()
            # Stop after a hard-coded eight matching lines.
            if count == 8:
                break
        # Wait up to 1 ms for activity on stdin; this also paces the loop
        # while no new log data is available.
        key = keystroke.poll(1)
        # Debug output of the poll result (same text as the original
        # `print "zip: ", key`, written so it runs on Python 2 and 3).
        sys.stdout.write("zip:  %s\n" % (key,))
finally:
    # Always finish the status line, restore the terminal mode and release
    # the log file, even if the loop is interrupted or raises.
    sys.stdout.write("\n")
    sys.stdout.write(term_ECHO_ON)
    sys.stdout.flush()
    logfile.close()
#!/bin/bash
# Print the command-line help to stderr, then exit with status 2 via die.
usage() {
    # "$0" is quoted so a script path containing spaces is not word-split.
    echo >&2 "
Usage: $(basename "$0") scboot_inst population [dstat_options]
scboot_inst scboot instance to monitor
population exit after 'population' nodes reach login
(-1 means never exit)
[dstat_options] additional options to pass to dstat
"
    die 2
}
# die [status] [message...]
# Print the message (with echo -e escape processing) to stderr and exit with
# the given status. The status defaults to 1 so that a bare `die` never
# exits successfully.
die() {
    local ret=${1:-1}
    [ $# -gt 0 ] && shift
    # "$*" keeps the message as one word: no globbing, no whitespace collapse.
    echo -e "$*" >&2
    exit "$ret"
}
# TODO: get these from cthlib
#source /opt/sicortex/config/cthlib \
# SCv_boot_log_path \
# SCv_bootmark \
# || die 2 "cthlib call failed"

# Settings exported for the dstat scboot plugin, which reads them from the
# environment. The log file name carries the current year and month.
export SCv_boot_log_path="/var/log/msp-messages-$(date +%Y%m)"
export SCv_bootmark="BOOT MARK:"

# The first two positional arguments are required; usage exits if either is
# missing or empty.
export scboot_inst="${1}"; shift
export population="${1}"; shift
[ "${scboot_inst}" ] || usage
[ "${population}" ] || usage

# "$@" forwards the remaining dstat options exactly as given; an unquoted $*
# would re-split any option containing whitespace.
dstat -M scboot "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment