Created
June 29, 2015 17:16
-
-
Save ehermes/2775e7c78019c1c1d539 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/home/ehermes/local/bin/python | |
| from __future__ import print_function | |
| import os | |
| import sys | |
| import shutil | |
| import argparse | |
| import subprocess | |
# Command-line interface: cluster sizing options, queue/time selection,
# VASP build variants, and the user script to submit.
parser = argparse.ArgumentParser(
    description="Submit python script using ASE to the ACI compute cluster",
)
parser.add_argument('-np', '--nproc', help='number of processors', default=0)
parser.add_argument('-n', '--nodes', help='number of nodes', default=0)
parser.add_argument('-ppn', help='number of processors per node', default=0)
parser.add_argument('-q', '--queue', help='job queue to submit script to',
                    default='')
parser.add_argument('-t', '--time', help='time limit for job', default='')
parser.add_argument('-exclude', help='nodes for job to avoid', default='')
# Passing -nolocal stores False into args.local, i.e. run on network
# storage; the default (True) stages work on node-local scratch.
parser.add_argument('-nolocal', dest='local',
                    help='run in place (on network storage) instead of on local scratch',
                    default=True, const=False, action='store_const')
parser.add_argument('-depends', help='job dependencies in SLURM syntax',
                    default='')
parser.add_argument('-target', help='specify nodes for job to run on',
                    default='')
parser.add_argument('-freq', '--backup-frequency',
                    help='how often to copy files from local scratch to network storage',
                    default=1800)
parser.add_argument('-g', '--gamma', default=False, const=True,
                    action='store_const',
                    help='use gamma-point only version of VASP')
parser.add_argument('-nc', '--noncollinear', default=False, const=True,
                    action='store_const',
                    help='use non-collinear version of VASP')
parser.add_argument('script', help='name of python script to execute')
args = parser.parse_args()
# Unpack parsed arguments into plain locals.  The numeric options are
# declared without type=int, so they arrive as strings when given on
# the command line; coerce explicitly.
nproc = int(args.nproc)
nodes = int(args.nodes)
ppn = int(args.ppn)
queue = args.queue
time = args.time
exclude = args.exclude
local = args.local
depends = args.depends
target = args.target
backupfreq = args.backup_frequency
gamma = args.gamma
noncollinear = args.noncollinear
name = args.script

# The two special VASP builds are mutually exclusive; reject the
# combination before selecting a binary.  (Fixed: the original used the
# Python 2-only statement form `raise ValueError, "..."`, which is a
# SyntaxError on Python 3; the call form works on both.)
if gamma and noncollinear:
    raise ValueError("Noncollinear calculations cannot use the gamma-point version of vasp!")
vasp = '/home/ehermes/local/bin/vasp'
if gamma:
    vasp += '.gamma'
elif noncollinear:
    vasp += '.noncollinear'

# The job name is the script filename without its .py extension.
if name.endswith('.py'):
    name = name[:-3]
wd = os.getcwd()

# Default queue selection when -q was not given: jobs that fit 16
# cores/node go to 'univ'; 17-20 cores/node needs the 'univ2' hardware.
if not queue:
    if ppn <= 16:
        queue = 'univ'
    elif 16 < ppn <= 20:
        queue = 'univ2'
    else:
        # Fixed: call-style raise (original used Python 2-only syntax),
        # and the message now matches the actual bound (<= 20 is allowed).
        raise ValueError("PPN must be at most 20! Current value is {}".format(ppn))

# Hardware limit on cores per node for the chosen partition.
if queue == 'univ2':
    max_ppn = 20
else:
    max_ppn = 16
# Resolve the three interdependent sizing parameters (ppn, nproc, nodes)
# from whichever subset the user supplied.  Integer division uses //
# so the arithmetic is correct on both Python 2 and 3 (the original
# relied on Python 2's truncating /).
if ppn == 0 and nproc == 0 and nodes == 0:
    # Nothing given: a single-core, single-node job.
    ppn = 1
    nproc = 1
    nodes = 1
elif ppn == 0 and nproc == 0 and nodes != 0:
    # Only nodes given: fill each node completely.
    ppn = max_ppn
    nproc = nodes * ppn
elif ppn == 0 and nproc != 0 and nodes == 0:
    # Only nproc given: pack full nodes; otherwise round nproc up to a
    # multiple of the node count and warn.
    if nproc % max_ppn == 0:
        ppn = max_ppn
        nodes = nproc // ppn
    else:
        nodes = nproc // max_ppn + 1
        if nproc % nodes != 0:
            nproc = nproc + nodes - nproc % nodes
            print("WARNING: Rounding number of processors to {}".format(nproc),
                  file=sys.stderr)
        ppn = nproc // nodes
elif ppn == 0 and nproc != 0 and nodes != 0:
    # nproc and nodes given: ppn = ceil(nproc / nodes).
    # (Fixed: the original computed nodes / nproc -- operands swapped --
    # which yields a nonsensical processors-per-node value.)
    ppn = nproc // nodes
    if nproc % nodes != 0:
        ppn += 1
        nproc = ppn * nodes
        print("WARNING: Rounding number of processors to {}".format(nproc),
              file=sys.stderr)
elif ppn != 0 and nproc == 0 and nodes == 0:
    # Only ppn given: a single node.
    nproc = ppn
    nodes = 1
elif ppn != 0 and nproc == 0 and nodes != 0:
    nproc = ppn * nodes
elif ppn != 0 and nproc != 0 and nodes == 0:
    if nproc % ppn != 0:
        raise ValueError("Processors per node ({0}) is not a divisor of number of processes ({1})!".format(ppn, nproc))
    nodes = nproc // ppn
elif ppn != 0 and nproc != 0 and nodes != 0:
    # All three given: they must agree.  Raise ValueError like the
    # sibling branches (the original used assert, which is stripped
    # under python -O and raises the wrong exception type).
    if ppn * nodes != nproc:
        raise ValueError("Number of processors ({0}) is not consistent with the number of nodes ({1}) and processors per node ({2})!".format(nproc, nodes, ppn))
if ppn > max_ppn:
    raise ValueError("Too many processors per node! Current value is {0}, maximum allowed is {1}".format(ppn, max_ppn))
# Default walltime per partition when the user did not pass -t.
if not time:
    if queue.startswith('univ'):
        time = '7-00:00:00'
    elif queue == 'pre':
        time = '1-00:00:00'
    else:
        # Fixed: call-style raise (original used Python 2-only syntax).
        raise ValueError("Unrecognized queue {}!".format(queue))
# Build the #SBATCH preamble of the generated submission script.
script = """#!/home/ehermes/local/bin/python
#SBATCH --job-name={name}
#SBATCH --ntasks-per-node={ppn}
#SBATCH --ntasks={nproc}
#SBATCH --nodes={nodes}
#SBATCH --partition={queue}
#SBATCH --time={time}
#SBATCH --error={wd}/{name}.%J.err
#SBATCH --output={wd}/{name}.%J.out
#SBATCH --exclusive
#SBATCH --hint=nomultithread
""".format(name=name, ppn=ppn, nproc=nproc, nodes=nodes, queue=queue,
           time=time, wd=wd)
if exclude:
    script += "#SBATCH --exclude={}\n".format(exclude)
if depends:
    # Fixed: sbatch's dependency option is --dependency, not --depends.
    script += "#SBATCH --dependency={}\n".format(depends)
if target:
    # Fixed: sbatch pins a job to specific nodes with --nodelist;
    # there is no --target option.
    script += "#SBATCH --nodelist={}\n".format(target)
# Imports and stuff go here
# Prologue of the generated job script: read the SLURM environment and
# configure OMP / VASP pseudopotential paths.  Everything inside the
# triple-quoted strings below is emitted verbatim into the submitted
# script; {name}-style fields are filled by .format, and {{}} escapes
# survive as literal {} for the generated script's own .format calls.
script += """
import os
import subprocess
import time
jobid = os.environ['SLURM_JOB_ID']
user = os.environ['USER']
tmpdir = '/scratch/local/{}/{}'.format(user, jobid)
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['VASP_PP_PATH'] = '/home/ehermes/local/apps/vasp'
"""
# Local-scratch mode: the generated script expands SLURM_NODELIST into
# node numbers, creates a scratch dir on every node over ssh, runs the
# user script, and rsyncs scratch back to the working dir every
# {backupfreq} seconds.  NOTE(review): when -nolocal is given, nothing
# below is appended, so the generated script never launches the user
# script at all -- verify that the nolocal path is actually functional.
# NOTE(review): the generated script uses a Python 2 print statement
# and a while/else, so the target interpreter is presumably Python 2.
if local:
    script += """
nodelist = []
slurm_nodelist = os.environ['SLURM_NODELIST'][4:]
if slurm_nodelist.startswith('['):
    slurm_nodelist = slurm_nodelist[1: -1]
slurm_nodelist = slurm_nodelist.split(',')
for node in slurm_nodelist:
    subnode = node.split('-')
    for i in range(int(subnode[0]), int(subnode[-1]) + 1):
        nodelist.append(str(i).zfill(3))
for node in nodelist:
    mkdir = subprocess.check_call(
        ['ssh', 'aci-{{}}'.format(node),
         'mkdir -p {{}}'.format(tmpdir)]
    )
def copy_from():
    for node in nodelist:
        copy_from = subprocess.check_call(
            ['ssh', 'aci-{{}}'.format(node),
             'rsync -raz {{1}}/* {{0}}/'.format('{wd}', tmpdir)]
        )
os.environ['TMP_WD'] = '{wd}'
os.environ['TMP_TMPDIR'] = tmpdir
os.environ['TMP_VASP'] = '{vasp}'
os.environ['TMP_NODELIST'] = ' '.join(nodelist)
os.environ['VASP_SCRIPT'] = '/home/ehermes/local/apps/vasp/vasp_script.py'
python = '/home/ehermes/local/bin/python'
with open('{name}.out', 'w') as out:
    jobproc = subprocess.Popen([python, '{name}.py'], stdout=out)
lastbackup = time.time()
while jobproc.poll() is None:
    time.sleep(1)
    now = time.time()
    if now - lastbackup >= {backupfreq}:
        copy_from()
        lastbackup = now
else:
    print "Job done!"
""".format(wd=wd, vasp=vasp, name=name, backupfreq=backupfreq)
# Write the generated submission script to a per-process temp file,
# hand it to sbatch, and clean up.  The try/finally guarantees the
# temp file is removed even when sbatch exits nonzero (the original
# leaked it, because check_call raises before os.remove runs).
scriptname = 'run.{}.py'.format(os.getpid())
with open(scriptname, 'w') as f:
    f.write(script)
try:
    subprocess.check_call(['/usr/bin/sbatch', scriptname])
finally:
    os.remove(scriptname)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment