Skip to content

Instantly share code, notes, and snippets.

@brentp
Last active December 16, 2015 10:49
Show Gist options
  • Save brentp/5422829 to your computer and use it in GitHub Desktop.
Save brentp/5422829 to your computer and use it in GitHub Desktop.
#! /bin/bash
# Align paired bisulfite-converted DNA reads to a genome.
# This assumes that reads1.fastq are all from the converted strand
# (i.e. they have C->T conversions) and reads2.fastq are all from the
# reverse-complement (i.e. they have G->A conversions).
# "GNU parallel" needs to be installed.
[ $# -eq 5 ] || {
cat <<EOF
Typical usage:
lastdb -w 2 -u bisulfite_f.seed my_f mygenome.fa
lastdb -w 2 -u bisulfite_r.seed my_r mygenome.fa
$(basename $0) my_f my_r reads1.fastq reads2.fastq readgroup_id > results.maf
EOF
exit 2
}
# Try to get the LAST programs into the PATH, if they aren't already:
PATH=$PATH:$(dirname $0)/../src:$(dirname $0)/../scripts
tmp=${TMPDIR-/tmp}/$$
trap 'rm -f $tmp.*' EXIT
cat > $tmp.fmat << 'EOF'
A C G T
A 6 -18 -18 -18
C -18 6 -18 3
G -18 -18 6 -18
T -18 -18 -18 3
EOF
cat > $tmp.rmat << 'EOF'
A C G T
A 3 -18 -18 -18
C -18 6 -18 -18
G 3 -18 6 -18
T -18 -18 -18 6
EOF
cat > $tmp.script << 'EOF'
t=$1.$$
lastal -p $1.fmat -s1 -Q1 -e120 -i1 "$2" "$4" > $t.t1f
lastal -p $1.rmat -s0 -Q1 -e120 -i1 "$3" "$4" > $t.t1r
last-merge-batches.py $t.t1f $t.t1r > $t.t1
rm $t.t1f $t.t1r
lastal -p $1.fmat -s0 -Q1 -e120 -i1 "$2" "$5" > $t.t2f
lastal -p $1.rmat -s1 -Q1 -e120 -i1 "$3" "$5" > $t.t2r
last-merge-batches.py $t.t2f $t.t2r > $t.t2
rm $t.t2f $t.t2r
last-pair-probs.py -m0.1 $t.t1 $t.t2 |
perl -F'(\s+)' -ane '$F[12] =~ y/ta/CG/ if /^s/ and $s++ % 2; print @F'
rm $t.t1 $t.t2
EOF
# use less as it does zless too
# Convert C to t, and all other letters to uppercase:
perl -pe 'y/Cca-z/ttA-Z/ if $. % 4 == 2' <(less $3) | split -l900000 -a5 - $tmp.1
# Convert G to a, and all other letters to uppercase:
perl -pe 'y/Gga-z/aaA-Z/ if $. % 4 == 2' <(less $4) | split -l900000 -a5 - $tmp.2
parallel -j 12 --gnu --xapply sh $tmp.script $tmp "$1" "$2" ::: $tmp.1* ::: $tmp.2* > $tmp.merge.maf
rg=$5
maf-convert.py sam -d tmp.merge.maf -r "ID:$rg SM:$rg"
import sys
import os
import os.path as op
import subprocess
def last_meth(last_ex_dir, fqs, reference, prefix, cleanup=False):
prefix = prefix.rstrip("/") + "/"
try:
os.makedirs(prefix)
except OSError:
pass
cmd_base = 'lastal -p %(last_ex_dir)s/bisulfite_%(dir)s.mat -s%(i)d'
cmd_base += ' %(prefix)s%(dir)sIndex -j1 -Q1 -d120 %(fq)s > %(maf)s'
def sh(cmd):
print >>sys.stderr, "[cmd]", cmd
return subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stderr,
shell=True, executable=os.environ.get('SHELL'))
if not op.exists(prefix + "fIndex.prj"):
p1 = sh('lastdb -u %(last_ex_dir)s/bisulfite_f.seed \
%(prefix)sfIndex %(reference)s' % locals())
p2 = sh('lastdb -u %(last_ex_dir)s/bisulfite_r.seed \
%(prefix)srIndex %(reference)s' % locals())
p1.wait() and p2.wait()
cmds, merge_mafs = [], []
for i, fq in enumerate(fqs):
base = op.splitext(op.basename(fq))[0]
if fq.endswith(".gz"):
fq = "<(zless %s)" % fq
base = op.splitext(base)[0]
mafs = []
for i, dir in zip((1, 0), 'fr'):
maf = "%(prefix)s/%(base)s-%(dir)s-%(i)d.maf" % locals()
mafs.append(maf)
cmds.append(cmd_base % locals())
merge_maf = "%(prefix)s/%(base)s.maf" % locals()
cmds.append('last-merge-batches.py %s > %s' \
% (" ".join(mafs), merge_maf))
merge_mafs.append(merge_maf)
if len(fqs) > 1:
mm = " ".join(merge_mafs)
shared = "".join(l for l, r in zip(merge_mafs[0], merge_mafs[1]) if l == r)
shared = shared.rstrip("_-. ").replace("_.", ".").replace("-.", ".")
cmds.append("last-pair-probs.py -m 0.1 -s1000 %s > %s" % (mm, shared))
merge_mafs = [shared]
cmds.append('maf-convert.py sam -d %s | samtools view -bS - | samtools sort - %s' \
% (merge_mafs[0], merge_mafs[0][:-4]))
with open(prefix + "run.sh", "w") as fh:
print >>fh, "\n".join(cmds)
procs = [sh(cmd) for cmd in cmds[:2]]
if len(fqs) > 1:
procs.extend([sh(cmd) for cmd in cmds[3:5]])
for p in procs: p.wait()
procs = [sh(cmds[2])]
if len(fqs) > 1:
procs.append(sh(cmds[5]))
for p in procs: p.wait()
for cmd in cmds[(6 if len(fqs) > 1 else 3):]:
sh(cmd).wait()
return merge_mafs[0][:-4] + ".bam"
if __name__ == "__main__":
import argparse
p = argparse.ArgumentParser(__doc__)
p.add_argument("--last-ex-dir")
p.add_argument("--reference", help="reference fasta")
p.add_argument("--prefix", help="output prefix", default='last-meth')
p.add_argument("fastqs", nargs="+")
a = p.parse_args()
last_meth(a.last_ex_dir, a.fastqs, a.reference, a.prefix)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment