Last active
December 16, 2015 10:49
-
-
Save brentp/5422829 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /bin/bash | |
# Align paired bisulfite-converted DNA reads to a genome. | |
# This assumes that reads1.fastq are all from the converted strand | |
# (i.e. they have C->T conversions) and reads2.fastq are all from the | |
# reverse-complement (i.e. they have G->A conversions). | |
# "GNU parallel" needs to be installed. | |
[ $# -eq 5 ] || { | |
cat <<EOF | |
Typical usage: | |
lastdb -w 2 -u bisulfite_f.seed my_f mygenome.fa | |
lastdb -w 2 -u bisulfite_r.seed my_r mygenome.fa | |
$(basename $0) my_f my_r reads1.fastq reads2.fastq readgroup_id > results.maf | |
EOF | |
exit 2 | |
} | |
# Try to get the LAST programs into the PATH, if they aren't already: | |
PATH=$PATH:$(dirname $0)/../src:$(dirname $0)/../scripts | |
tmp=${TMPDIR-/tmp}/$$ | |
trap 'rm -f $tmp.*' EXIT | |
cat > $tmp.fmat << 'EOF' | |
A C G T | |
A 6 -18 -18 -18 | |
C -18 6 -18 3 | |
G -18 -18 6 -18 | |
T -18 -18 -18 3 | |
EOF | |
cat > $tmp.rmat << 'EOF' | |
A C G T | |
A 3 -18 -18 -18 | |
C -18 6 -18 -18 | |
G 3 -18 6 -18 | |
T -18 -18 -18 6 | |
EOF | |
cat > $tmp.script << 'EOF' | |
t=$1.$$ | |
lastal -p $1.fmat -s1 -Q1 -e120 -i1 "$2" "$4" > $t.t1f | |
lastal -p $1.rmat -s0 -Q1 -e120 -i1 "$3" "$4" > $t.t1r | |
last-merge-batches.py $t.t1f $t.t1r > $t.t1 | |
rm $t.t1f $t.t1r | |
lastal -p $1.fmat -s0 -Q1 -e120 -i1 "$2" "$5" > $t.t2f | |
lastal -p $1.rmat -s1 -Q1 -e120 -i1 "$3" "$5" > $t.t2r | |
last-merge-batches.py $t.t2f $t.t2r > $t.t2 | |
rm $t.t2f $t.t2r | |
last-pair-probs.py -m0.1 $t.t1 $t.t2 | | |
perl -F'(\s+)' -ane '$F[12] =~ y/ta/CG/ if /^s/ and $s++ % 2; print @F' | |
rm $t.t1 $t.t2 | |
EOF | |
# use less as it does zless too | |
# Convert C to t, and all other letters to uppercase: | |
perl -pe 'y/Cca-z/ttA-Z/ if $. % 4 == 2' <(less $3) | split -l900000 -a5 - $tmp.1 | |
# Convert G to a, and all other letters to uppercase: | |
perl -pe 'y/Gga-z/aaA-Z/ if $. % 4 == 2' <(less $4) | split -l900000 -a5 - $tmp.2 | |
parallel -j 12 --gnu --xapply sh $tmp.script $tmp "$1" "$2" ::: $tmp.1* ::: $tmp.2* > $tmp.merge.maf | |
rg=$5 | |
maf-convert.py sam -d tmp.merge.maf -r "ID:$rg SM:$rg" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import os | |
import os.path as op | |
import subprocess | |
def last_meth(last_ex_dir, fqs, reference, prefix, cleanup=False): | |
prefix = prefix.rstrip("/") + "/" | |
try: | |
os.makedirs(prefix) | |
except OSError: | |
pass | |
cmd_base = 'lastal -p %(last_ex_dir)s/bisulfite_%(dir)s.mat -s%(i)d' | |
cmd_base += ' %(prefix)s%(dir)sIndex -j1 -Q1 -d120 %(fq)s > %(maf)s' | |
def sh(cmd): | |
print >>sys.stderr, "[cmd]", cmd | |
return subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stderr, | |
shell=True, executable=os.environ.get('SHELL')) | |
if not op.exists(prefix + "fIndex.prj"): | |
p1 = sh('lastdb -u %(last_ex_dir)s/bisulfite_f.seed \ | |
%(prefix)sfIndex %(reference)s' % locals()) | |
p2 = sh('lastdb -u %(last_ex_dir)s/bisulfite_r.seed \ | |
%(prefix)srIndex %(reference)s' % locals()) | |
p1.wait() and p2.wait() | |
cmds, merge_mafs = [], [] | |
for i, fq in enumerate(fqs): | |
base = op.splitext(op.basename(fq))[0] | |
if fq.endswith(".gz"): | |
fq = "<(zless %s)" % fq | |
base = op.splitext(base)[0] | |
mafs = [] | |
for i, dir in zip((1, 0), 'fr'): | |
maf = "%(prefix)s/%(base)s-%(dir)s-%(i)d.maf" % locals() | |
mafs.append(maf) | |
cmds.append(cmd_base % locals()) | |
merge_maf = "%(prefix)s/%(base)s.maf" % locals() | |
cmds.append('last-merge-batches.py %s > %s' \ | |
% (" ".join(mafs), merge_maf)) | |
merge_mafs.append(merge_maf) | |
if len(fqs) > 1: | |
mm = " ".join(merge_mafs) | |
shared = "".join(l for l, r in zip(merge_mafs[0], merge_mafs[1]) if l == r) | |
shared = shared.rstrip("_-. ").replace("_.", ".").replace("-.", ".") | |
cmds.append("last-pair-probs.py -m 0.1 -s1000 %s > %s" % (mm, shared)) | |
merge_mafs = [shared] | |
cmds.append('maf-convert.py sam -d %s | samtools view -bS - | samtools sort - %s' \ | |
% (merge_mafs[0], merge_mafs[0][:-4])) | |
with open(prefix + "run.sh", "w") as fh: | |
print >>fh, "\n".join(cmds) | |
procs = [sh(cmd) for cmd in cmds[:2]] | |
if len(fqs) > 1: | |
procs.extend([sh(cmd) for cmd in cmds[3:5]]) | |
for p in procs: p.wait() | |
procs = [sh(cmds[2])] | |
if len(fqs) > 1: | |
procs.append(sh(cmds[5])) | |
for p in procs: p.wait() | |
for cmd in cmds[(6 if len(fqs) > 1 else 3):]: | |
sh(cmd).wait() | |
return merge_mafs[0][:-4] + ".bam" | |
if __name__ == "__main__": | |
import argparse | |
p = argparse.ArgumentParser(__doc__) | |
p.add_argument("--last-ex-dir") | |
p.add_argument("--reference", help="reference fasta") | |
p.add_argument("--prefix", help="output prefix", default='last-meth') | |
p.add_argument("fastqs", nargs="+") | |
a = p.parse_args() | |
last_meth(a.last_ex_dir, a.fastqs, a.reference, a.prefix) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment