Skip to content

Instantly share code, notes, and snippets.

@MikeDacre
Created June 9, 2017 02:19
Show Gist options
  • Save MikeDacre/85b00221fa8aac0c6656ca7f54e08c6d to your computer and use it in GitHub Desktop.
Save MikeDacre/85b00221fa8aac0c6656ca7f54e08c6d to your computer and use it in GitHub Desktop.
Run PASCAL: Allows easy running of https://www2.unil.ch/cbg/index.php?title=Pascal from any path
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Rewrite of the shell script for easy running from anywhere with cluster support.
Place this in the root of pascal and link to it from your path
"""
from __future__ import print_function
import os as _os
import sys as _sys
import subprocess as _sub
import argparse as _argparse
import multiprocessing as _mp
VERBOSE = False # Show debug output
###############################################################################
# Main Execution Functions #
###############################################################################
def run_pascal(sample_file, **kwargs):
"""Run the pascal Java command."""
cmnd = (
'export DYLD_LIBRARY_PATH={path}/lib:$LD_LIBRARY_PATH; \n'
'export LD_LIBRARY_PATH="{path}/lib:{path}/lib/openBLASlib/lib:'
'{path}/lib/fortranlibs:$LD_LIBRARY_PATH"; \n'
'export OPENBLAS_NUM_THREADS=1; \n'
'java -ea -Xmx{mem} -jar {path}/jars/pascalDeployed.jar {args}'
)
path, java64 = get_runtime_info()
mem = '32g' if java64 else '2g'
strtdir = _os.path.abspath(_os.curdir)
outdir = kwargs['outdir'] if 'outdir' in kwargs else '.'
outdir = _os.path.abspath(outdir)
argstr = get_pascal_arg_string(sample_file, **kwargs)
if not _os.path.isdir(outdir):
_os.mkdir(outdir)
try:
_os.chdir(path)
if VERBOSE:
print(cmnd.format(path=path, mem=mem, args=argstr))
print('Running Pascal on {}'.format(sample_file))
out, err, code = _run(cmnd.format(path=path, mem=mem, args=argstr))
finally:
_os.chdir(strtdir)
return out, err, code
###############################################################################
# Helper Functions #
###############################################################################
def get_pascal_arg_string(sample_file, **kwargs):
"""Take all keyword arguments and return an argument string for pascal."""
allowed_args = ['chr', 'up', 'down', 'maxsnp', 'outsuffix', 'custom',
'mergeddistance', 'mafcutoff', 'genesetfile', 'customdir']
argstr = '--pval={} '.format(_os.path.abspath(sample_file))
# Handle outdir parsing
if 'outdir' in kwargs:
# outdir = _os.path.basename(kwargs.pop('outdir'))
outdir = _os.path.abspath(kwargs.pop('outdir'))
argstr += '--outdir={} '.format(outdir)
# Gene Scoring Method
if 'genescoring' in kwargs:
scoremethod = kwargs.pop('genescoring')
else:
scoremethod = 'sum'
scoremethod = scoremethod.lower()
if scoremethod not in ['max', 'sum']:
raise ValueError('Invalid genescoring method: {}. Choose max or sum'
.format(scoremethod))
argstr += '--maxvegas ' if scoremethod == 'max' else '--analyticvegas '
# Calculate pathway scores
if 'runpathway' in kwargs:
v = kwargs.pop('runpathway')
m = kwargs.pop('mergedistance') if 'mergedistance' in kwargs else None
if v:
if not m:
m = 1
argstr += '--runpathway --mergedistance={}'.format(m)
# Handle simple arguments
for k, v in kwargs.items():
if not v:
continue
if k in allowed_args:
argstr += '--{}={} '.format(k, v)
else:
_sys.stderr.write('Ignoring unrecognized argument --{}\n'
.format(k))
return argstr
def get_runtime_info():
"""Return a information sufficient to run pascal.
Returns
-------
path : string
absolute path to pascal's root
java64 : bool
True if 64 bit java
"""
path = _os.path.dirname(_os.path.realpath(__file__))
if 'jars' not in _os.listdir(path):
path = _os.path.dirname(path)
if 'jars' not in _os.listdir(path):
raise OSError('Cannot find PATH to Pascal, make sure this script' +
' is in the root or bin directory of the Pascal' +
' package.')
assert _os.path.isdir(path)
assert _os.path.isdir(_os.path.join(path, 'lib'))
assert _os.path.isdir(_os.path.join(path, 'jars'))
_, err, code = _run('java -version')
if int(code) != 0 or not err:
raise OSError('Java does not appear to be installed as ' +
'`java --version` fails')
if '64-Bit' in err:
java64 = True
else:
java64 = False
_sys.stderr.write(
"java 32-Bit called instead of 64-Bit. You might run into "
"memory problems.\nConsider installing the 64-Bit java VM.\n"
)
return path, java64
def _run(command, raise_on_error=False):
"""Run a command with subprocess the way it should be.
Parameters
----------
command : str
A command to execute, piping is fine.
raise_on_error : bool
Raise a subprocess.CalledProcessError on exit_code != 0
Returns
-------
stdout : str
stderr : str
exit_code : int
"""
pp = _sub.Popen(command, shell=True, universal_newlines=True,
stdout=_sub.PIPE, stderr=_sub.PIPE)
out, err = pp.communicate()
code = pp.returncode
if raise_on_error and code != 0:
raise _sub.CalledProcessError(
returncode=code, cmd=command, output=out, stderr=err
)
return out, err, code
###############################################################################
# Command Line Parsing #
###############################################################################
def argument_parser():
"""Parse all arguments."""
path, _ = get_runtime_info()
parser = _argparse.ArgumentParser(
description=__doc__,
formatter_class=_argparse.RawDescriptionHelpFormatter
)
parser.add_argument('-v', '--verbose', action='store_true',
help='Show debug output')
# Universal arguments
shared = _argparse.ArgumentParser(add_help=False)
shared.add_argument('-o', '--outdir',
help='Directory to output files to')
shared.add_argument('--outsuffix',
help=(
'Adds an additional string to the output file '
'names produced.'
))
shared.add_argument('--chr', type=int, help='limit to this chromosome')
shared.add_argument('--up', type=int, default=50000,
help=(
'Gives the number of base-pairs upstream of the '
'transcription start site that are still counted '
'as belonging to the gene region. The default is '
"50’000."
))
shared.add_argument('--down', type=int, default=50000,
help=(
'Gives the number of base-pairs downstream of '
'the gene-body that are still counted as '
'belonging to the gene region. The default is '
"50’000."
))
shared.add_argument('--maxsnp', type=int, default=3000,
help=(
'Sets the aximum number of SNPs per gene. If a '
'gene has more SNPs in its region it will not '
'calculate the score. If the option is set to -1, '
'all genes will be computed. The default is 3000.'
))
shared.add_argument('--genescoring', type=str, choices={'sum', 'max'},
default='sum', help=(
'Chooses the genescoring method. The default is '
'sum. This option should be supplied with either'
'max or sum.'
))
shared.add_argument('--runpathway', action='store_true',
help=(
'Chooses whether Pascal should be calculate '
'pathway scores.'
))
shared.add_argument('--mergedistance', type=int, default=1,
help=(
'Gives the genomic distance in mega-bases that '
'the program uses to fuse nearby genes during the '
'pathway analysis. Only has an effect if '
'runpathway is on. The default is 1.'
))
shared.add_argument('--mafcutoff', type=float, default=0.05,
help=(
'SNPs with maf below that value in the european '
'sample of 1KG will be ignored. The default is '
'0.05'
))
shared.add_argument('--genesetfile',
default = _os.path.join(
path, 'resources/genesets/msigdb/' +
'msigBIOCARTA_KEGG_REACTOME.gmt'
),
help=(
'Gives the file name to a gmt-file where the gene '
'sets are defined. The default is '
'resources/genesets/msigdb/'
'msigBIOCARTA_KEGG_REACTOME.gmt'
))
cref = shared.add_argument_group(
'custom reference',
description=(
'Pascal allows to use custom reference populations instead of the '
'1KG-EUR sam- ple (For an example about formatting and parameter '
'setting, check the bash script examples/exampleRunFromTped.sh). '
'To make use of this option, you have to provide your genotype '
'information in gnu-zipped, space-separated and 1-2-coded '
'tped-files split by chromosome. the tped format has been '
'popularized by the plink-tool for GWAS analysis.'
)
)
cref.add_argument('--customdir',
help=(
'Gives the path to where the gnu-zipped tped-files '
'are stored.'
))
cref.add_argument('--custom',
help=(
'Gives the prefix of a gnuzipped tped-files. The '
'total filename per chromosome is '
'<prefix>.chr<chrNr>.tped.gz.'
))
modes = parser.add_subparsers(title='mode', dest='mode')
# Run simply with only one sample
single = modes.add_parser(
'single', parents=[shared],
help="Run pascal directly on a single dataset"
)
single.add_argument('snp_file', help='File of rsid\\tpval to analyze')
return parser
###############################################################################
# Run Directly #
###############################################################################
def main(argv=None):
"""Run as a script."""
if not argv:
argv = _sys.argv[1:]
parser = argument_parser()
args = parser.parse_args(argv)
if not args.mode:
parser.print_help()
_sys.exit(1)
if args.verbose:
global VERBOSE
VERBOSE = True
# Convert to dictionary
opts = vars(args)
if VERBOSE:
print(opts)
outdir = args.outdir
outdir = _os.path.abspath(outdir if outdir.strip() else _os.curdir)
mode = opts.pop('mode')
if mode == 'single':
sample_file = opts.pop('snp_file')
name = _os.path.join(
outdir,
_os.path.splitext(_os.path.basename(sample_file))[0]
)
out, err, code = run_pascal(sample_file, **opts)
with open(name + '.out', 'w') as fout:
fout.write(out)
with open(name + '.err', 'w') as fout:
fout.write(err)
_sys.exit(code)
if __name__ == '__main__' and '__file__' in globals():
_sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment