Skip to content

Instantly share code, notes, and snippets.

@ehermes
Created June 29, 2015 17:16
Show Gist options
  • Select an option

  • Save ehermes/2775e7c78019c1c1d539 to your computer and use it in GitHub Desktop.

Select an option

Save ehermes/2775e7c78019c1c1d539 to your computer and use it in GitHub Desktop.
#!/home/ehermes/local/bin/python
from __future__ import print_function
import os
import sys
import shutil
import argparse
import subprocess
# Site-specific locations of the executables this script relies on.
VASP_BINARY = '/home/ehermes/local/bin/vasp'
SBATCH = '/usr/bin/sbatch'

# SLURM directives that open every generated job script. Filled in with
# str.format(); further optional "#SBATCH" lines are appended right after it.
HEADER_TEMPLATE = """#!/home/ehermes/local/bin/python
#SBATCH --job-name={name}
#SBATCH --ntasks-per-node={ppn}
#SBATCH --ntasks={nproc}
#SBATCH --nodes={nodes}
#SBATCH --partition={queue}
#SBATCH --time={time}
#SBATCH --error={wd}/{name}.%J.err
#SBATCH --output={wd}/{name}.%J.out
#SBATCH --exclusive
#SBATCH --hint=nomultithread
"""

# Imports and environment setup of the generated job script. This text is
# emitted verbatim (it is never passed through str.format()), so the "{}"
# placeholders in it are evaluated when the job runs, not at submission time.
PREAMBLE = """
import os
import subprocess
import time
jobid = os.environ['SLURM_JOB_ID']
user = os.environ['USER']
tmpdir = '/scratch/local/{}/{}'.format(user, jobid)
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['VASP_PP_PATH'] = '/home/ehermes/local/apps/vasp'
"""

# Scratch-directory setup, job launch and periodic backup loop of the generated
# job script. Filled in with str.format(); doubled braces ("{{}}") are escapes
# that survive as single braces for the job script's own format() calls.
LOCAL_TEMPLATE = """
nodelist = []
slurm_nodelist = os.environ['SLURM_NODELIST'][4:]
if slurm_nodelist.startswith('['):
    slurm_nodelist = slurm_nodelist[1: -1]
slurm_nodelist = slurm_nodelist.split(',')
for node in slurm_nodelist:
    subnode = node.split('-')
    for i in range(int(subnode[0]), int(subnode[-1]) + 1):
        nodelist.append(str(i).zfill(3))
for node in nodelist:
    mkdir = subprocess.check_call(
        ['ssh', 'aci-{{}}'.format(node),
         'mkdir -p {{}}'.format(tmpdir)]
    )
def copy_from():
    for node in nodelist:
        copy_from = subprocess.check_call(
            ['ssh', 'aci-{{}}'.format(node),
             'rsync -raz {{1}}/* {{0}}/'.format('{wd}', tmpdir)]
        )
os.environ['TMP_WD'] = '{wd}'
os.environ['TMP_TMPDIR'] = tmpdir
os.environ['TMP_VASP'] = '{vasp}'
os.environ['TMP_NODELIST'] = ' '.join(nodelist)
os.environ['VASP_SCRIPT'] = '/home/ehermes/local/apps/vasp/vasp_script.py'
python = '/home/ehermes/local/bin/python'
with open('{name}.out', 'w') as out:
    jobproc = subprocess.Popen([python, '{name}.py'], stdout=out)
    lastbackup = time.time()
    while jobproc.poll() is None:
        time.sleep(1)
        now = time.time()
        if now - lastbackup >= {backupfreq}:
            copy_from()
            lastbackup = now
    else:
        print("Job done!")
"""


def build_parser():
    """Return the argparse parser describing the command-line interface.

    Numeric options are declared with ``type=int`` so that a malformed value
    is reported as a normal usage error instead of a traceback, and so that
    the backup frequency, which is pasted into the generated job script as
    code, can only ever be an integer.
    """
    parser = argparse.ArgumentParser(
        description="Submit python script using ASE to the ACI compute cluster",
    )
    parser.add_argument(
        '-np', '--nproc',
        help='number of processors',
        type=int,
        default=0,
    )
    parser.add_argument(
        '-n', '--nodes',
        help='number of nodes',
        type=int,
        default=0,
    )
    parser.add_argument(
        '-ppn',
        help='number of processors per node',
        type=int,
        default=0,
    )
    parser.add_argument(
        '-q', '--queue',
        help='job queue to submit script to',
        default='',
    )
    parser.add_argument(
        '-t', '--time',
        help='time limit for job',
        default='',
    )
    parser.add_argument(
        '-exclude',
        help='nodes for job to avoid',
        default='',
    )
    parser.add_argument(
        '-nolocal',
        help='run in place (on network storage) instead of on local scratch',
        dest='local',
        action='store_false',
    )
    parser.add_argument(
        '-depends',
        help='job dependencies in SLURM syntax',
        default='',
    )
    parser.add_argument(
        '-target',
        help='specify nodes for job to run on',
        default='',
    )
    parser.add_argument(
        '-freq', '--backup-frequency',
        help='how often to copy files from local scratch to network storage',
        type=int,
        default=1800,
    )
    parser.add_argument(
        '-g', '--gamma',
        help='use gamma-point only version of VASP',
        action='store_true',
    )
    parser.add_argument(
        '-nc', '--noncollinear',
        help='use non-collinear version of VASP',
        action='store_true',
    )
    parser.add_argument(
        'script',
        help='name of python script to execute',
    )
    return parser


def select_vasp(gamma, noncollinear):
    """Return the path of the VASP executable matching the requested flavour.

    Raises:
        ValueError: if both ``gamma`` and ``noncollinear`` are requested.
    """
    if gamma and noncollinear:
        raise ValueError("Noncollinear calculations cannot use the "
                         "gamma-point version of vasp!")
    if gamma:
        return VASP_BINARY + '.gamma'
    if noncollinear:
        return VASP_BINARY + '.noncollinear'
    return VASP_BINARY


def default_queue(ppn):
    """Return the partition to use when the user did not name one.

    Up to 16 processors per node selects 'univ', 17 to 20 selects 'univ2'.
    ``ppn`` is the value as given on the command line, so 0 (unspecified)
    selects 'univ'.

    Raises:
        ValueError: if ``ppn`` exceeds 20.
    """
    if ppn <= 16:
        return 'univ'
    if ppn <= 20:
        return 'univ2'
    raise ValueError("PPN must be no greater than 20! "
                     "Current value is {}".format(ppn))


def max_ppn_for(queue):
    """Return the processors-per-node limit applied to ``queue``.

    'univ2' is capped at 20; every other partition name is capped at 16.
    """
    return 20 if queue == 'univ2' else 16


def resolve_layout(ppn, nproc, nodes, max_ppn):
    """Complete the (ppn, nproc, nodes) triple from whichever parts were given.

    A value of 0 means "not specified". Missing values are derived so that
    ``ppn * nodes == nproc`` holds on return. When only ``nproc`` (optionally
    with ``nodes``) is given and it does not divide evenly across the nodes,
    ``nproc`` is rounded up and a warning is written to stderr.

    Args:
        ppn: processors per node, or 0.
        nproc: total number of processors, or 0.
        nodes: number of nodes, or 0.
        max_ppn: largest processors-per-node value the partition accepts.

    Returns:
        Tuple ``(ppn, nproc, nodes)`` of positive ints.

    Raises:
        ValueError: if any count is negative, if the given values are mutually
            inconsistent, or if the resulting ``ppn`` exceeds ``max_ppn``.
    """
    if min(ppn, nproc, nodes) < 0:
        raise ValueError("Processor and node counts must not be negative!")

    if ppn == 0 and nproc == 0:
        if nodes == 0:
            # Nothing requested: a single serial task.
            ppn = nproc = nodes = 1
        else:
            # Only a node count: fill every node completely.
            ppn = max_ppn
            nproc = nodes * ppn
    elif ppn == 0:
        # A processor count without a per-node figure.
        if nodes == 0:
            # Fewest nodes that can hold nproc tasks (ceiling division).
            nodes = -(-nproc // max_ppn)
        # Spread the tasks evenly, rounding up so every node gets the same.
        ppn = -(-nproc // nodes)
        if ppn * nodes != nproc:
            nproc = ppn * nodes
            print("WARNING: Rounding number of processors to {}".format(nproc),
                  file=sys.stderr)
    elif nproc == 0:
        # A per-node figure, optionally with a node count (default: one node).
        if nodes == 0:
            nodes = 1
        nproc = ppn * nodes
    elif nodes == 0:
        if nproc % ppn != 0:
            raise ValueError(
                "Processors per node ({0}) is not a divisor of number of "
                "processes ({1})!".format(ppn, nproc))
        nodes = nproc // ppn
    elif ppn * nodes != nproc:
        # Raised explicitly rather than asserted: asserts vanish under -O.
        raise ValueError(
            "Number of processors ({0}) is not consistent with the number of "
            "nodes ({1}) and processors per node ({2})!".format(
                nproc, nodes, ppn))

    if ppn > max_ppn:
        raise ValueError(
            "Too many processors per node! Current value is {0}, maximum "
            "allowed is {1}".format(ppn, max_ppn))
    return ppn, nproc, nodes


def default_time(queue):
    """Return the wall-time limit used when the user did not give one.

    Partitions whose name starts with 'univ' get seven days, 'pre' gets one.

    Raises:
        ValueError: for any other partition name.
    """
    if queue.startswith('univ'):
        return '7-00:00:00'
    if queue == 'pre':
        return '1-00:00:00'
    raise ValueError("Unrecognized queue {}!".format(queue))


def build_script(name, wd, vasp, queue, time, ppn, nproc, nodes,
                 exclude='', depends='', target='', local=True,
                 backupfreq=1800):
    """Return the text of the job script handed to sbatch.

    Args:
        name: job name; ``<name>.py`` is the user script the job executes.
        wd: submission directory, used for the SLURM log files and as the
            destination of the periodic rsync backups.
        vasp: path of the VASP executable, exported to the job as TMP_VASP.
        queue: SLURM partition.
        time: SLURM wall-time limit.
        ppn, nproc, nodes: task layout (tasks per node, tasks, nodes).
        exclude: node list for ``--exclude``; omitted when empty.
        depends: dependency specification for ``--dependency``; omitted when
            empty.
        target: node list for ``--nodelist``; omitted when empty.
        local: whether to append the scratch-staging and job-launch section.
        backupfreq: seconds between backups from scratch to ``wd``.
    """
    parts = [HEADER_TEMPLATE.format(name=name, ppn=ppn, nproc=nproc,
                                    nodes=nodes, queue=queue, time=time,
                                    wd=wd)]
    if exclude:
        parts.append("#SBATCH --exclude={}\n".format(exclude))
    if depends:
        # sbatch spells this option --dependency; "--depends" is rejected.
        parts.append("#SBATCH --dependency={}\n".format(depends))
    if target:
        # sbatch has no --target; requesting specific hosts is --nodelist.
        parts.append("#SBATCH --nodelist={}\n".format(target))

    parts.append(PREAMBLE)

    # NOTE(review): everything that launches the user script lives in
    # LOCAL_TEMPLATE, so with -nolocal the generated job only sets environment
    # variables and exits. That is what the original script did as well;
    # confirm what -nolocal is meant to run before changing it.
    if local:
        parts.append(LOCAL_TEMPLATE.format(wd=wd, vasp=vasp, name=name,
                                           backupfreq=backupfreq))
    return ''.join(parts)


def submit(script):
    """Write ``script`` to a temporary file in the cwd and pass it to sbatch.

    The file is named after the current process id and is removed afterwards,
    including when sbatch exits with an error.

    Raises:
        subprocess.CalledProcessError: if sbatch returns a non-zero status.
    """
    scriptname = 'run.{}.py'.format(os.getpid())
    with open(scriptname, 'w') as f:
        f.write(script)
    try:
        subprocess.check_call([SBATCH, scriptname])
    finally:
        os.remove(scriptname)


def main(argv=None):
    """Parse the command line, generate the job script and submit it.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)

    vasp = select_vasp(args.gamma, args.noncollinear)

    # The job is named after the script without its '.py' suffix.
    name = args.script
    if name.endswith('.py'):
        name = name[:-3]
    wd = os.getcwd()

    # The default partition is chosen from ppn as typed by the user, before
    # the layout is completed below.
    queue = args.queue or default_queue(args.ppn)
    ppn, nproc, nodes = resolve_layout(args.ppn, args.nproc, args.nodes,
                                       max_ppn_for(queue))
    walltime = args.time or default_time(queue)

    script = build_script(
        name=name, wd=wd, vasp=vasp, queue=queue, time=walltime,
        ppn=ppn, nproc=nproc, nodes=nodes,
        exclude=args.exclude, depends=args.depends, target=args.target,
        local=args.local, backupfreq=args.backup_frequency,
    )
    submit(script)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment