Skip to content

Instantly share code, notes, and snippets.

@veprbl
Last active March 21, 2016 23:05
Show Gist options
  • Save veprbl/59f4841eb55f80a79829 to your computer and use it in GitHub Desktop.
Save veprbl/59f4841eb55f80a79829 to your computer and use it in GitHub Desktop.
Online status pages for monitoring HTCondor jobs
#!/usr/bin/python2
import time
import math
import threading
import htcondor
# Ask the collector for every schedd ad, keep only the ads whose 'Name' is one
# of the two watched hosts, and wrap each remaining ad in a Schedd handle.
WATCHED_SCHEDD_NAMES = ["rcas6006.rcf.bnl.gov", "eic0007.rcf.bnl.gov"]

coll = htcondor.Collector("condor02.rcf.bnl.gov:9664")
all_schedd_ads = coll.locateAll(htcondor.DaemonTypes.Schedd)
watched_schedd_ads = [ad for ad in all_schedd_ads if ad['Name'] in WATCHED_SCHEDD_NAMES]
schedd_list = [htcondor.Schedd(ad) for ad in watched_schedd_ads]
def worker(history_it, clusters):
    """Append each job yielded by *history_it* to its cluster's job list.

    Used as the target of the history threads started in ``get_jobs``.

    Args:
        history_it: iterable of job ads; each ad must support
            ``ad['ClusterId']``.
        clusters: dict mapping a cluster id to a list of job ads. The entry
            for every cluster id encountered must already exist; a missing
            key raises ``KeyError``.
    """
    for ad in history_it:
        clusters[ad['ClusterId']].append(ad)
def get_jobs(schedd, clusters, threads):
    """Collect the watched users' jobs from one schedd, grouped by cluster.

    Jobs returned by ``schedd.query`` are appended to ``clusters``
    immediately. If any were found, a history query restricted to those
    cluster ids is issued and its result iterator is handed to a ``worker``
    thread, which appends the historical jobs to the same lists.

    Args:
        schedd: object providing ``query(constraint, attrs)`` and
            ``history(constraint, attrs, match)``.
        clusters: dict mapping ClusterId to a list of job ads; updated in
            place (by this call and, later, by the started thread).
        threads: list to which the started history thread, if any, is
            appended.

    A ``RuntimeError`` raised by ``schedd.history`` is written to stderr and
    otherwise ignored, so the queued jobs already collected are kept.
    Errors from ``schedd.query`` are not caught.
    """
    requested_fields = ['ClusterId', 'JobStatus', 'RealExperiment']

    # NOTE(review): both addresses below read "[email protected]", which looks
    # like e-mail redaction applied when this file was copied from a web page.
    # Restore the real user names before relying on this constraint.
    jobs = schedd.query("(User == \"[email protected]\") || (User == \"[email protected]\")", requested_fields)

    # NOTE(review): the dict is keyed by ClusterId only and is shared across
    # every schedd passed in; confirm cluster ids cannot collide between them.
    new_clusters = set()
    for job in jobs:
        cluster_id = job['ClusterId']
        new_clusters.add(cluster_id)
        clusters.setdefault(cluster_id, []).append(job)

    if not new_clusters:
        return

    # Sorted so the constraint string is the same from run to run.
    cond = "(" + ")||(".join("ClusterId == %i" % cl_id for cl_id in sorted(new_clusters)) + ")"
    try:
        # NOTE(review): match=0 is passed through unchanged; confirm the
        # bindings in use treat it as "no limit" rather than "no results".
        it = schedd.history(cond, requested_fields, 0)
    except RuntimeError as e:
        import sys
        sys.stderr.write("%s\n" % repr(e))
        return  # can't connect to eic

    thread = threading.Thread(target=worker, args=(it, clusters))
    thread.start()
    threads.append(thread)
# Upper bound, in seconds, on how long the report waits for the background
# history threads before it is rendered with whatever has been collected.
HISTORY_WAIT_SECONDS = 120

# Query every watched schedd. get_jobs fills `clusters` with the queued jobs
# and appends any history thread it starts to `threads`.
clusters = {}
threads = []
for schedd in schedd_list:
    get_jobs(schedd, clusters, threads)

# Join the history threads against one shared deadline: the report is produced
# as soon as they have all finished, and never later than HISTORY_WAIT_SECONDS
# after this point even if some of them are still running.
history_deadline = time.time() + HISTORY_WAIT_SECONDS
for history_thread in threads:
    history_thread.join(max(0.0, history_deadline - time.time()))
# Numeric codes compared against each job's 'JobStatus' attribute when the
# report rows are built. The codes are consecutive integers starting at 0.
(
    JOB_STATUS_UNEXPANDED,      # 0
    JOB_STATUS_IDLE,            # 1
    JOB_STATUS_RUNNING,         # 2
    JOB_STATUS_REMOVED,         # 3
    JOB_STATUS_COMPLETED,       # 4
    JOB_STATUS_HELD,            # 5
    JOB_STATUS_SUBMISSION_ERR,  # 6
) = range(7)
# Opening part of the HTML report: document head (Bootstrap stylesheets and
# the IE8 shims) followed by the table header row. The matching closing tags
# are emitted after the per-cluster rows.
PAGE_HEADER = """\
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>htcondor status</title>
<!-- Bootstrap -->
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css" integrity="sha384-1q8mTJOASx8j1Au+a5WDVnPi2lkFfwwEAa8hDDdjZlpLegxhjVME1fgjWPGmkzs7" crossorigin="anonymous">
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap-theme.min.css" integrity="sha384-fLW2N01lMqjakBkx3l/M9EahuwpSfeNvV63J5ezn3uZzapT0u7EYsXMjQV+0En5r" crossorigin="anonymous">
<!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries -->
<!-- WARNING: Respond.js doesn't work if you view the page via file:// -->
<!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
<![endif]-->
</head>
<body>
<table class="table table-striped">
<thead>
<tr>
<th>Cluster</th>
<th>RealExperiment</th>
<th>Completed</th>
<th>Running</th>
<th>Failed</th>
<th>Total</th>
<th></th>
</tr>
</thead>
<tbody>
"""
print(PAGE_HEADER)
# HTML for one table row: the per-cluster counts plus a three-segment
# progress bar (completed / running / failed) whose widths are percentages.
ROW_TEMPLATE = """\
<tr>
<td>{cluster_id}</td>
<td>{real_experiment}</td>
<td>{completed}</td>
<td>{running}</td>
<td>{failed}</td>
<td>{total}</td>
<td style="width: 50%">
<div class="progress">
<div class="progress-bar progress-bar-success" style="width: {progress_completed}%"></div>
<div class="progress-bar" style="width: {progress_running}%;"></div>
<div class="progress-bar progress-bar-danger" style="width: {progress_failed}%"></div>
</div>
</td>
</tr>
"""

# Statuses that do NOT count as failed; every other status does.
NOT_FAILED_STATUSES = (JOB_STATUS_IDLE, JOB_STATUS_RUNNING, JOB_STATUS_COMPLETED)

# Emit one row per cluster, in ascending cluster-id order.
for cluster_id in sorted(clusters):
    # Work on a snapshot: the history worker threads are not guaranteed to
    # have finished, so the shared list may still grow while this row is
    # computed. A single copy keeps the counts and the total consistent.
    cluster_jobs = list(clusters[cluster_id])
    total = len(cluster_jobs)

    # NOTE(review): this value is interpolated into the HTML unescaped;
    # confirm RealExperiment can never contain markup characters.
    real_experiment = cluster_jobs[0]['RealExperiment']

    statuses = [job['JobStatus'] for job in cluster_jobs]
    completed = statuses.count(JOB_STATUS_COMPLETED)
    running = statuses.count(JOB_STATUS_RUNNING)
    failed = sum(1 for status in statuses if status not in NOT_FAILED_STATUSES)

    # Percentages use exact integer arithmetic. Going through floats
    # misrounds exact ratios (7.0 / 25 * 100 == 28.000000000000004, whose
    # ceiling is 29). Completed is rounded down; running and failed are
    # rounded up so that any non-zero count stays visible in the bar.
    progress_completed = (100 * completed) // total
    progress_running = -((-100 * running) // total)
    progress_failed = -((-100 * failed) // total)

    # Rounding up can push the bar past 100%; take the overshoot off the
    # first segment (in display order) that is large enough to absorb it.
    excess = progress_completed + progress_running + progress_failed - 100
    if excess > 0:
        if progress_completed > excess:
            progress_completed -= excess
        elif progress_running > excess:
            progress_running -= excess
        elif progress_failed > excess:
            progress_failed -= excess
        else:
            # Not expected to happen; raised explicitly so the check is not
            # stripped when Python runs with -O.
            raise AssertionError("progress bar overshoot could not be absorbed")

    print(ROW_TEMPLATE.format(
        cluster_id=cluster_id,
        real_experiment=real_experiment,
        completed=completed,
        running=running,
        failed=failed,
        total=total,
        progress_completed=progress_completed,
        progress_running=progress_running,
        progress_failed=progress_failed))
# Closing part of the HTML report: ends the table body and the table, loads
# jQuery and the Bootstrap script, and closes the document.
PAGE_FOOTER = """\
</tbody>
</table>
<!-- jQuery (necessary for Bootstrap's JavaScript plugins) -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.3/jquery.min.js"></script>
<!-- Include all compiled plugins (below), or include individual files as needed -->
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js" integrity="sha384-0mSbJDEHialfmuBBQP6A4Qrprq5OVfW37PRR3j5ELqxss1yVqOtnepnHVP9aJ7xS" crossorigin="anonymous"></script>
</body>
</html>
"""
print(PAGE_FOOTER)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment