Skip to content

Instantly share code, notes, and snippets.

@rmcgibbo
Last active October 26, 2021 15:04
Show Gist options
  • Save rmcgibbo/b846e52f4bec17a8597f to your computer and use it in GitHub Desktop.
Save rmcgibbo/b846e52f4bec17a8597f to your computer and use it in GitHub Desktop.
Summarize free slots on SLURM queues [script].
#!/usr/bin/python
from __future__ import print_function, division
import os
import re
import sys
import itertools
from pprint import pprint
import subprocess
from distutils.spawn import find_executable
from collections import defaultdict
from collections import namedtuple
# Location of the `scontrol` executable as reported by find_executable.
# NOTE(review): presumably None when the binary is not on the search path --
# confirm against the distutils documentation.
SCONTROL_BIN = find_executable('scontrol')
# Grouping key for nodes: the partition label plus allocated and total CPU
# counts. Nodes that share all three values are counted together.
status = namedtuple('status', ['partition', 'np_alloc', 'np_total'])
def main():
    """Print a table of SLURM node groups that still have free CPU slots.

    Node groups come from ``collect()``; each group is a ``status`` key
    (partition, allocated CPUs, total CPUs) mapped to the number of nodes
    sharing it. Fully allocated groups are omitted, and the remaining ones
    are listed from least to most utilized.
    """
    count = collect()
    header = ['Number of nodes', 'Partition', 'Utilization', 'Free slots']
    lines = [header, ['-' * len(e) for e in header]]

    # Drop fully allocated groups *before* sorting so that a node reporting
    # zero total CPUs (and zero allocated) never reaches the division below.
    candidates = [k for k in count if k.np_total != k.np_alloc]

    def utilization(k):
        # Guard against a zero CPU total; such an entry sorts last.
        if not k.np_total:
            return float('inf')
        return k.np_alloc / k.np_total

    for k in sorted(candidates, key=utilization):
        lines.append([count[k], k.partition,
                      '%s/%s' % (k.np_alloc, k.np_total),
                      k.np_total - k.np_alloc])

    print('Summary of SLURM nodes with free slots\n')
    print(format_table(lines))
def collect():
    """Count nodes grouped by (partition, allocated CPUs, total CPUs).

    Returns
    -------
    tally : defaultdict
        Maps each ``status`` tuple to the number of nodes that share it.
        Nodes whose host name has no entry in the node-to-partition mapping
        are left out.
    """
    tally = defaultdict(int)
    node_records = scontrol_show('node')
    partition_of = nodes_to_partition()
    for record in node_records:
        try:
            label = partition_of[record['NodeHostName']]
        except KeyError:
            # Either the record has no host name or no partition lists it.
            continue
        allocated = int(record['CPUAlloc'])
        total = int(record['CPUTot'])
        tally[status(label, allocated, total)] += 1
    return tally
def nodes_to_partition():
    """Map each node name to the partition(s) that contain it.

    Returns
    -------
    n2p : dict
        Keys are node names obtained by expanding each partition's ``Nodes``
        field; values are the names of all partitions listing that node,
        joined with commas in the order ``scontrol`` reported them.
    """
    n2p = defaultdict(list)
    for partition in scontrol_show('partition'):
        these_nodes = set()
        # Split the node list on commas that sit *outside* brackets only.
        # A plain split(',') would tear "sh-1-[1-5,11-12]" into the fragments
        # "sh-1-[1-5" and "11-12]". The trailing `\[` alternative keeps a
        # stray, unclosed bracket from swallowing the rest of the list.
        groups = re.findall(r'(?:[^,\[]|\[[^\]]*\]|\[)+', partition['Nodes'])
        for group in groups:
            these_nodes.update(expand_bracket(group))
        for node in these_nodes:
            n2p[node].append(partition['PartitionName'])
    return dict((node, ','.join(names)) for node, names in n2p.items())
def scontrol_show(entity):
    """Wrapper around the `scontrol show` SLURM utility

    Parameters
    ----------
    entity : {partition, job, node}
        The type of entity to query scontrol for

    Returns
    -------
    vals : list of dicts
        Each element in the list is a dict containing the information about
        one of the requested entities on the system.

    Raises
    ------
    RuntimeError
        If the scontrol executable is unavailable, or if the command exits
        with a non-zero status or writes anything to stderr.
    """
    # SCONTROL_BIN may be None when the executable was not found; test that
    # first so the user sees the intended message and not a TypeError.
    if not SCONTROL_BIN or not os.path.exists(SCONTROL_BIN):
        raise RuntimeError('This script is for SLURM systems only')

    # universal_newlines=True makes the pipes return text on both Python 2
    # and Python 3, so the string handling below works on either.
    comm = subprocess.Popen([SCONTROL_BIN, 'show', entity],
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                            universal_newlines=True)
    stdout, stderr = comm.communicate()
    if comm.returncode != 0 or stderr.strip():
        raise RuntimeError('`scontrol show %s` failed (exit status %s): %s'
                           % (entity, comm.returncode, stderr.strip()))
    return _parse_scontrol_records(stdout)


def _parse_scontrol_records(text):
    """Turn `scontrol show` output into one dict per blank-line-separated record."""
    records = []
    entry = {}
    for line in text.splitlines():
        if not line.strip():
            # A blank line closes the current record; runs of blank lines do
            # not produce empty records.
            if entry:
                records.append(entry)
                entry = {}
            continue
        for item in line.split():
            # Split on the first '=' only so values that themselves contain
            # '=' are kept whole; tokens without '=' are skipped.
            key, sep, val = item.partition('=')
            if sep:
                entry[key] = val
    # Keep the last record even when the output lacks a trailing blank line.
    if entry:
        records.append(entry)
    return records
def expand_bracket(s):
    """Expand SLURM's bracket notation

    The bracket may hold any number of comma-separated items, each either a
    single index (``7``) or an inclusive range (``1-5``). When the first
    number of an item starts with ``0``, the generated numbers are
    zero-padded to that number's width. Text after the closing bracket is
    kept, and any further bracket groups in it are expanded as well. A
    string without a valid bracket group is returned unchanged as a
    one-element list.

    Example
    -------
    >>> expand_bracket("sh-1-[1-5]")
    ['sh-1-1', 'sh-1-2', 'sh-1-3', 'sh-1-4', 'sh-1-5']
    >>> expand_bracket('sh-1-[1-5,11-12]')
    ['sh-1-1', 'sh-1-2', 'sh-1-3', 'sh-1-4', 'sh-1-5', 'sh-1-11', 'sh-1-12']
    >>> expand_bracket('n[01-02,7]-ib')
    ['n01-ib', 'n02-ib', 'n7-ib']
    """
    # Capture the whole bracket body as one group and split it afterwards;
    # a repeated capture group would only retain its final repetition.
    m = re.match(r'(.*?)\[(\d+(?:-\d+)?(?:,\d+(?:-\d+)?)*)\]', s)
    if not m:
        return [s]
    prefix, spec = m.groups()
    # Whatever follows the bracket may contain more groups; expand it first.
    tails = expand_bracket(s[m.end():])

    expanded = []
    for item in spec.split(','):
        first, _, last = item.partition('-')
        last = last or first  # a lone index is a range of length one
        width = len(first) if first.startswith('0') else 0
        for j in range(int(first), int(last) + 1):
            number = str(j).zfill(width)
            for tail in tails:
                expanded.append('%s%s%s' % (prefix, number, tail))
    return expanded
def format_table(rows):
    """Render rows as left-aligned, space-separated text columns.

    Every column is as wide as its longest cell (via ``str``) plus two
    characters of padding, and columns are separated by one extra space.
    Returns the rendered rows joined with newlines.
    """
    widths = []
    for column in zip(*rows):
        longest = max(len(str(cell)) for cell in column)
        widths.append(longest + 2)
    template = ' '.join('%-{0}s'.format(w) for w in widths)
    return '\n'.join(template % tuple(row) for row in rows)
# Run the summary only when this file is executed as a script.
if __name__ == '__main__':
    main()
$ free-slots
Summary of SLURM nodes with free slots
Number of nodes Partition Utilization Free slots
--------------- --------- ----------- ----------
4 gpu 0/16 16
2 normal 0/16 16
2 dev 0/16 16
1 gpu 12/16 4
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment