Last active
July 8, 2016 19:25
-
-
Save giovtorres/2ce295be802554904665 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
""" | |
show-cluster-util

This script uses the Python SLURM bindings (pyslurm) to report cluster
utilization. The figures account for nodes that are allocated exclusively
or are fully allocated.
""" | |
import sys

import hostlist
import pyslurm
__author__ = "Giovanni Torres" | |
def human_readable(num, suffix="B"):
    """Convert a byte count to a human readable string.

    IN:  (number) num    - value to format, in units of `suffix`
         (str)    suffix - unit name appended after the scale prefix
    OUT: (str) value scaled by powers of 1024 with one decimal place,
         e.g. "1.5 KB"

    Zero is reported in giga-units ("0.0 GB" with the default suffix).
    Values of 1024**6 or more are reported in exa-units ('E') instead of
    falling off the end of the unit table and returning None.
    """
    if num == 0:
        # Honor the caller's suffix; with the default this is still "0.0 GB".
        return "0.0 G%s" % suffix

    for unit in ('', 'K', 'M', 'G', 'T', 'P'):
        if abs(num) < 1024.0:
            return "%.1f %s%s" % (num, unit, suffix)
        num /= 1024.0

    # Six divisions by 1024 have been applied, so num is now in exa-units.
    return "%.1f %s%s" % (num, 'E', suffix)
def get_part_info(excluded_parts=("interactive", "quick", "maint")):
    """Return per-partition memory defaults and a node-to-partition map.

    IN:  (iterable of str) excluded_parts - partition names whose nodes are
         left out of the node-to-partition map. The default keeps the
         partition names this script has always skipped.
    OUT: (tuple) (defmempercpu, all_nodes) where
         defmempercpu maps every partition name to its "def_mem_per_cpu"
         value (excluded partitions included), and
         all_nodes maps each node name to a partition it belongs to.

    If a node appears in several non-excluded partitions, the partition
    visited last wins; iteration order is whatever pyslurm's dict yields.
    """
    all_parts = pyslurm.partition().get()

    defmempercpu = {}
    all_nodes = {}

    for part, partinfo in all_parts.items():
        defmempercpu[part] = partinfo["def_mem_per_cpu"]

        # Skip the host list expansion entirely for partitions whose nodes
        # would be discarded anyway.
        if part in excluded_parts:
            continue

        for node in hostlist.expand_hostlist(partinfo["nodes"]):
            all_nodes[node] = part

    return (defmempercpu, all_nodes)
def get_util(nodes):
    """ Tally node, CPU and memory usage across all nodes.

    IN: (dict) node name -> node info dict; the script passes the result
        of pyslurm.node().get()
    OUT: (dict) counters named "total_<nodes|cpus|memory>_<category>"

    Per-node classification, based on substrings of the upper-cased state:
      * DOWN / DRAIN    -> whole node counted as down.
      * ALLOCATED, or "shared" == 0 -> node counted as allocated; CPUs and
        memory not in use are counted as unallocatable.
      * MIXED           -> leftovers are unallocatable when memory is
        over-allocated or when less than twice the partition's
        def_mem_per_cpu remains free; otherwise they are idle.
      * IDLE            -> whole node counted as idle.
      * anything else   -> the node name is printed and the node is left
        out of every category (the consistency asserts below then fail).
    Memory allocated beyond a node's real memory is capped at the real
    memory and the excess is tracked as "overalloc".
    """
    totals = dict.fromkeys((
        "total_cpus_alloc",
        "total_cpus_idle",
        "total_cpus_down",
        "total_cpus_unalloc",
        "total_cpus_config",
        "total_memory_alloc",
        "total_memory_idle",
        "total_memory_down",
        "total_memory_unalloc",
        "total_memory_overalloc",
        "total_memory_config",
        "total_nodes_mixed",
        "total_nodes_alloc",
        "total_nodes_idle",
        "total_nodes_down",
        "total_nodes_config",
    ), 0)

    # Partition data is only consulted for MIXED nodes, to decide whether
    # the memory still free is enough to be usable.
    mem_per_cpu_by_part, part_by_node = get_part_info()

    for name, info in nodes.items():
        state = info.get("state").upper()
        used_cpus = info.get("alloc_cpus")
        node_cpus = info.get("cpus")
        used_mem = info.get("alloc_mem")
        node_mem = info.get("real_memory")
        shared = info.get("shared")

        totals["total_nodes_config"] += 1
        totals["total_cpus_config"] += node_cpus
        totals["total_memory_config"] += node_mem

        if "DOWN" in state or "DRAIN" in state:
            totals["total_nodes_down"] += 1
            totals["total_cpus_down"] += node_cpus
            totals["total_memory_down"] += node_mem
            continue

        # Every node that is not down contributes its allocated CPUs.
        totals["total_cpus_alloc"] += used_cpus

        if "ALLOCATED" in state or shared == 0:
            totals["total_nodes_alloc"] += 1
            totals["total_cpus_unalloc"] += node_cpus - used_cpus
            if used_mem > node_mem:
                # Over-allocated: cap at real memory, nothing is left over.
                totals["total_memory_alloc"] += node_mem
                totals["total_memory_overalloc"] += used_mem - node_mem
            else:
                totals["total_memory_alloc"] += used_mem
                totals["total_memory_unalloc"] += node_mem - used_mem
        elif "MIXED" in state:
            totals["total_nodes_mixed"] += 1
            if used_mem > node_mem:
                totals["total_cpus_unalloc"] += node_cpus - used_cpus
                totals["total_memory_alloc"] += node_mem
                totals["total_memory_overalloc"] += used_mem - node_mem
            else:
                free_mem = node_mem - used_mem
                if free_mem < 2 * mem_per_cpu_by_part[part_by_node[name]]:
                    # Too little memory left: leftovers are unallocatable.
                    cpu_key = "total_cpus_unalloc"
                    mem_key = "total_memory_unalloc"
                else:
                    cpu_key = "total_cpus_idle"
                    mem_key = "total_memory_idle"
                totals[cpu_key] += node_cpus - used_cpus
                totals["total_memory_alloc"] += used_mem
                totals[mem_key] += free_mem
        elif "IDLE" in state:
            totals["total_nodes_idle"] += 1
            totals["total_cpus_idle"] += node_cpus
            totals["total_memory_idle"] += node_mem
        else:
            # Unrecognized state: report the node name.
            print(name)

    # Consistency checks: every configured node, CPU and unit of memory
    # must have landed in exactly one category.
    categorized_nodes = (totals["total_nodes_alloc"] +
                         totals["total_nodes_mixed"] +
                         totals["total_nodes_idle"] +
                         totals["total_nodes_down"])
    assert categorized_nodes == totals["total_nodes_config"]

    categorized_cpus = (totals["total_cpus_alloc"] +
                        totals["total_cpus_idle"] +
                        totals["total_cpus_unalloc"] +
                        totals["total_cpus_down"])
    assert categorized_cpus == totals["total_cpus_config"]

    categorized_memory = (totals["total_memory_alloc"] +
                          totals["total_memory_idle"] +
                          totals["total_memory_unalloc"] +
                          totals["total_memory_down"])
    assert categorized_memory == totals["total_memory_config"]

    return totals
def _percent(part, whole):
    """Return `part` as a whole-number percentage of `whole`.

    Uses explicit floor division so the result is the same integer on
    Python 2 and Python 3. Returns "n/a" when `whole` is zero, because the
    percentage is undefined (e.g. every node is down).
    """
    if not whole:
        return "n/a"
    return part * 100 // whole


def display_metrics(metrics):
    """ Print cluster utilization.

    IN: (dict) dictionary of all node, cpu and memory states, as returned
        by get_util()

    "Eligible" figures are the configured totals minus the down totals.
    Memory counters are multiplied by 1024 * 1024 before formatting, i.e.
    they are treated as MiB -- presumably the unit pyslurm reports; verify
    against the pyslurm version in use.
    Percentages are printed as "n/a" when there is nothing eligible.
    """
    mib = 1024 * 1024

    eligible_nodes = metrics["total_nodes_config"] - metrics["total_nodes_down"]
    eligible_cpus = metrics["total_cpus_config"] - metrics["total_cpus_down"]
    eligible_memory = (metrics["total_memory_config"] -
                       metrics["total_memory_down"])

    # Single-argument print(...) calls behave identically under the
    # Python 2 print statement and the Python 3 print function.
    print("")
    print("Total Allocated Nodes : {0:>8}".format(
        metrics["total_nodes_alloc"]))
    print("Total Mixed Nodes : {0:>8}".format(
        metrics["total_nodes_mixed"]))
    print("Total Idle Nodes : {0:>8}".format(
        metrics["total_nodes_idle"]))
    print("Total Down/Offline Nodes : {0:>8}".format(
        metrics["total_nodes_down"]))
    print("Total Eligible Nodes : {0:>8}".format(
        eligible_nodes))
    print("Total Configured Nodes : {0:>8}".format(
        metrics["total_nodes_config"]))
    print("")
    print("Total Allocated CPUs : {0:>8}".format(
        metrics["total_cpus_alloc"]))
    print("Total Idle CPUs : {0:>8}".format(
        metrics["total_cpus_idle"]))
    print("Total Down CPUs : {0:>8}".format(
        metrics["total_cpus_down"]))
    print("Total Unallocatable CPUs : {0:>8}".format(
        metrics["total_cpus_unalloc"]))
    print("Total Eligible CPUs : {0:>8}".format(
        eligible_cpus))
    print("Total Configured CPUs : {0:>8}".format(
        metrics["total_cpus_config"]))
    print("Cluster CPU % Unallocatable : {0:>7}%".format(
        _percent(metrics["total_cpus_unalloc"], eligible_cpus)))
    print("Cluster CPU % (Alloc + Unalloc) : {0:>7}%".format(
        _percent(metrics["total_cpus_alloc"] + metrics["total_cpus_unalloc"],
                 eligible_cpus)))
    print("")
    print("Total Allocated Memory : {0:>8}".format(
        human_readable(metrics["total_memory_alloc"] * mib)))
    print("Total Idle Memory : {0:>8}".format(
        human_readable(metrics["total_memory_idle"] * mib)))
    print("Total Down Memory : {0:>8}".format(
        human_readable(metrics["total_memory_down"] * mib)))
    print("Total Unallocatable Memory : {0:>8}".format(
        human_readable(metrics["total_memory_unalloc"] * mib)))
    print("Total Overallocated Memory : {0:>8}".format(
        human_readable(metrics["total_memory_overalloc"] * mib)))
    print("Total Eligible Memory : {0:>8}".format(
        human_readable(eligible_memory * mib)))
    print("Total Configured Memory : {0:>8}".format(
        human_readable(metrics["total_memory_config"] * mib)))
    print("Cluster Memory % Unallocatable : {0:>7}%".format(
        _percent(metrics["total_memory_unalloc"], eligible_memory)))
    print("Cluster Memory % (Alloc + Unalloc): {0:>7}%".format(
        _percent(metrics["total_memory_alloc"] +
                 metrics["total_memory_unalloc"], eligible_memory)))
    print("")
if __name__ == "__main__":
    # Script entry point: query SLURM for all nodes, then print the report.
    try:
        # Make sure pyslurm works or else exit here
        pyslurmnode = pyslurm.node()
        # Get all node info
        nodes = pyslurmnode.get()
    except ValueError as e:
        print('Query failed - %s' % (e,))
        # Relies on the module-level `import sys`; without it this failure
        # path raised NameError instead of exiting with status 1.
        sys.exit(1)

    metrics = get_util(nodes)
    display_metrics(metrics)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment