brentp · December 16, 2015 10:49
diff --git a/last-bisulfite-paired.sh b/last-bisulfite-paired.sh
 #! /bin/bash

 # Align paired bisulfite-converted DNA reads to a genome.

 # This assumes that reads1.fastq are all from the converted strand
 # (i.e. they have C->T conversions) and reads2.fastq are all from the
 # reverse-complement (i.e. they have G->A conversions).

 # "GNU parallel" needs to be installed.

 [ $# -eq 5 ] || {
    cat <<EOF
 Typical usage:

  lastdb -w 2 -u bisulfite_f.seed my_f mygenome.fa
  lastdb -w 2 -u bisulfite_r.seed my_r mygenome.fa

  $(basename $0) my_f my_r reads1.fastq reads2.fastq readgroup_id > results.maf

 EOF
    exit 2
 }

 # Try to get the LAST programs into the PATH, if they aren't already:
 PATH=$PATH:$(dirname $0)/../src:$(dirname $0)/../scripts

 tmp=${TMPDIR-/tmp}/$$
 trap 'rm -f $tmp.*' EXIT

 cat > $tmp.fmat << 'EOF'
    A   C   G   T
 A   6 -18 -18 -18
 C -18   6 -18   3
 G -18 -18   6 -18
 T -18 -18 -18   3
 EOF

 cat > $tmp.rmat << 'EOF'
    A   C   G   T
 A   3 -18 -18 -18
 C -18   6 -18 -18
 G   3 -18   6 -18
 T -18 -18 -18   6
 EOF

 cat > $tmp.script << 'EOF'
 t=$1.$$

 lastal -p $1.fmat -s1 -Q1 -e120 -i1 "$2" "$4" > $t.t1f
 lastal -p $1.rmat -s0 -Q1 -e120 -i1 "$3" "$4" > $t.t1r
 last-merge-batches.py $t.t1f $t.t1r > $t.t1
 rm $t.t1f $t.t1r

 lastal -p $1.fmat -s0 -Q1 -e120 -i1 "$2" "$5" > $t.t2f
 lastal -p $1.rmat -s1 -Q1 -e120 -i1 "$3" "$5" > $t.t2r
 last-merge-batches.py $t.t2f $t.t2r > $t.t2
 rm $t.t2f $t.t2r

 last-pair-probs.py -m0.1 $t.t1 $t.t2 |
 perl -F'(\s+)' -ane '$F[12] =~ y/ta/CG/ if /^s/ and $s++ % 2; print @F'
 rm $t.t1 $t.t2
 EOF

 # use less as it does zless too
 # Convert C to t, and all other letters to uppercase:
 perl -pe 'y/Cca-z/ttA-Z/ if $. % 4 == 2' <(less $3) | split -l900000 -a5 - $tmp.1

 # Convert G to a, and all other letters to uppercase:
 perl -pe 'y/Gga-z/aaA-Z/ if $. % 4 == 2' <(less $4) | split -l900000 -a5 - $tmp.2

 parallel -j 12 --gnu --xapply sh $tmp.script $tmp "$1" "$2" ::: $tmp.1* ::: $tmp.2* > $tmp.merge.maf

 rg=$5
 maf-convert.py sam -d tmp.merge.maf -r "ID:$rg SM:$rg"
diff --git a/last_meth.py b/last_meth.py
 import sys
 import os
 import os.path as op
 import subprocess

 def last_meth(last_ex_dir, fqs, reference, prefix, cleanup=False):
    prefix = prefix.rstrip("/") + "/"
    try:
        os.makedirs(prefix)
    except OSError:
        pass

    cmd_base = 'lastal -p %(last_ex_dir)s/bisulfite_%(dir)s.mat -s%(i)d'
    cmd_base += ' %(prefix)s%(dir)sIndex -j1 -Q1 -d120 %(fq)s > %(maf)s'

    def sh(cmd):
        print >>sys.stderr, "[cmd]", cmd
        return subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stderr,
                          shell=True, executable=os.environ.get('SHELL'))

    if not op.exists(prefix + "fIndex.prj"):
        p1 = sh('lastdb -u %(last_ex_dir)s/bisulfite_f.seed \
                %(prefix)sfIndex %(reference)s' % locals())
        p2 = sh('lastdb -u %(last_ex_dir)s/bisulfite_r.seed \
                %(prefix)srIndex %(reference)s' % locals())
        p1.wait() and p2.wait()

    cmds, merge_mafs = [], []
    for i, fq in enumerate(fqs):
        base = op.splitext(op.basename(fq))[0]
        if fq.endswith(".gz"):
            fq = "<(zless %s)" % fq
            base = op.splitext(base)[0]

        mafs = []
        for i, dir in zip((1, 0), 'fr'):
            maf = "%(prefix)s/%(base)s-%(dir)s-%(i)d.maf" % locals()
            mafs.append(maf)
            cmds.append(cmd_base % locals())
        merge_maf = "%(prefix)s/%(base)s.maf" % locals()
        cmds.append('last-merge-batches.py %s > %s' \
                % (" ".join(mafs), merge_maf))
        merge_mafs.append(merge_maf)

    if len(fqs) > 1:
        mm = " ".join(merge_mafs)
        shared = "".join(l for l, r in zip(merge_mafs[0], merge_mafs[1]) if l == r)
        shared = shared.rstrip("_-. ").replace("_.", ".").replace("-.", ".")
        cmds.append("last-pair-probs.py -m 0.1 -s1000 %s > %s" % (mm, shared))
        merge_mafs = [shared]

    cmds.append('maf-convert.py sam -d %s | samtools view -bS - | samtools sort - %s' \
                    % (merge_mafs[0], merge_mafs[0][:-4]))

    with open(prefix + "run.sh", "w") as fh:
        print >>fh, "\n".join(cmds)


    procs = [sh(cmd) for cmd in cmds[:2]]
    if len(fqs) > 1:
        procs.extend([sh(cmd) for cmd in cmds[3:5]])
    for p in procs: p.wait()

    procs = [sh(cmds[2])]
    if len(fqs) > 1:
        procs.append(sh(cmds[5]))

    for p in procs: p.wait()

    for cmd in cmds[(6 if len(fqs) > 1 else 3):]:
        sh(cmd).wait()

    return merge_mafs[0][:-4] + ".bam"

 if __name__ == "__main__":

    import argparse

    p = argparse.ArgumentParser(__doc__)
    p.add_argument("--last-ex-dir")
    p.add_argument("--reference", help="reference fasta")
    p.add_argument("--prefix", help="output prefix", default='last-meth')
    p.add_argument("fastqs", nargs="+")

    a = p.parse_args()

    last_meth(a.last_ex_dir, a.fastqs, a.reference, a.prefix)
	#! /bin/bash

	# Align paired bisulfite-converted DNA reads to a genome.

	# This assumes that reads1.fastq are all from the converted strand
	# (i.e. they have C->T conversions) and reads2.fastq are all from the
	# reverse-complement (i.e. they have G->A conversions).

	# "GNU parallel" needs to be installed.

	[ $# -eq 5 ] \|\| {
	cat <<EOF
	Typical usage:

	lastdb -w 2 -u bisulfite_f.seed my_f mygenome.fa
	lastdb -w 2 -u bisulfite_r.seed my_r mygenome.fa

	$(basename $0) my_f my_r reads1.fastq reads2.fastq readgroup_id > results.maf

	EOF
	exit 2
	}

	# Try to get the LAST programs into the PATH, if they aren't already:
	PATH=$PATH:$(dirname $0)/../src:$(dirname $0)/../scripts

	tmp=${TMPDIR-/tmp}/$$
	trap 'rm -f $tmp.*' EXIT

	cat > $tmp.fmat << 'EOF'
	A C G T
	A 6 -18 -18 -18
	C -18 6 -18 3
	G -18 -18 6 -18
	T -18 -18 -18 3
	EOF

	cat > $tmp.rmat << 'EOF'
	A C G T
	A 3 -18 -18 -18
	C -18 6 -18 -18
	G 3 -18 6 -18
	T -18 -18 -18 6
	EOF

	cat > $tmp.script << 'EOF'
	t=$1.$$

	lastal -p $1.fmat -s1 -Q1 -e120 -i1 "$2" "$4" > $t.t1f
	lastal -p $1.rmat -s0 -Q1 -e120 -i1 "$3" "$4" > $t.t1r
	last-merge-batches.py $t.t1f $t.t1r > $t.t1
	rm $t.t1f $t.t1r

	lastal -p $1.fmat -s0 -Q1 -e120 -i1 "$2" "$5" > $t.t2f
	lastal -p $1.rmat -s1 -Q1 -e120 -i1 "$3" "$5" > $t.t2r
	last-merge-batches.py $t.t2f $t.t2r > $t.t2
	rm $t.t2f $t.t2r

	last-pair-probs.py -m0.1 $t.t1 $t.t2 \|
	perl -F'(\s+)' -ane '$F[12] =~ y/ta/CG/ if /^s/ and $s++ % 2; print @F'
	rm $t.t1 $t.t2
	EOF

	# use less as it does zless too
	# Convert C to t, and all other letters to uppercase:
	perl -pe 'y/Cca-z/ttA-Z/ if $. % 4 == 2' <(less $3) \| split -l900000 -a5 - $tmp.1

	# Convert G to a, and all other letters to uppercase:
	perl -pe 'y/Gga-z/aaA-Z/ if $. % 4 == 2' <(less $4) \| split -l900000 -a5 - $tmp.2

	parallel -j 12 --gnu --xapply sh $tmp.script $tmp "$1" "$2" ::: $tmp.1* ::: $tmp.2* > $tmp.merge.maf

	rg=$5
	maf-convert.py sam -d tmp.merge.maf -r "ID:$rg SM:$rg"
	import sys
	import os
	import os.path as op
	import subprocess

	def last_meth(last_ex_dir, fqs, reference, prefix, cleanup=False):
	prefix = prefix.rstrip("/") + "/"
	try:
	os.makedirs(prefix)
	except OSError:
	pass

	cmd_base = 'lastal -p %(last_ex_dir)s/bisulfite_%(dir)s.mat -s%(i)d'
	cmd_base += ' %(prefix)s%(dir)sIndex -j1 -Q1 -d120 %(fq)s > %(maf)s'

	def sh(cmd):
	print >>sys.stderr, "[cmd]", cmd
	return subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stderr,
	shell=True, executable=os.environ.get('SHELL'))

	if not op.exists(prefix + "fIndex.prj"):
	p1 = sh('lastdb -u %(last_ex_dir)s/bisulfite_f.seed \
	%(prefix)sfIndex %(reference)s' % locals())
	p2 = sh('lastdb -u %(last_ex_dir)s/bisulfite_r.seed \
	%(prefix)srIndex %(reference)s' % locals())
	p1.wait() and p2.wait()

	cmds, merge_mafs = [], []
	for i, fq in enumerate(fqs):
	base = op.splitext(op.basename(fq))[0]
	if fq.endswith(".gz"):
	fq = "<(zless %s)" % fq
	base = op.splitext(base)[0]

	mafs = []
	for i, dir in zip((1, 0), 'fr'):
	maf = "%(prefix)s/%(base)s-%(dir)s-%(i)d.maf" % locals()
	mafs.append(maf)
	cmds.append(cmd_base % locals())
	merge_maf = "%(prefix)s/%(base)s.maf" % locals()
	cmds.append('last-merge-batches.py %s > %s' \
	% (" ".join(mafs), merge_maf))
	merge_mafs.append(merge_maf)

	if len(fqs) > 1:
	mm = " ".join(merge_mafs)
	shared = "".join(l for l, r in zip(merge_mafs[0], merge_mafs[1]) if l == r)
	shared = shared.rstrip("_-. ").replace("_.", ".").replace("-.", ".")
	cmds.append("last-pair-probs.py -m 0.1 -s1000 %s > %s" % (mm, shared))
	merge_mafs = [shared]

	cmds.append('maf-convert.py sam -d %s \| samtools view -bS - \| samtools sort - %s' \
	% (merge_mafs[0], merge_mafs[0][:-4]))

	with open(prefix + "run.sh", "w") as fh:
	print >>fh, "\n".join(cmds)


	procs = [sh(cmd) for cmd in cmds[:2]]
	if len(fqs) > 1:
	procs.extend([sh(cmd) for cmd in cmds[3:5]])
	for p in procs: p.wait()

	procs = [sh(cmds[2])]
	if len(fqs) > 1:
	procs.append(sh(cmds[5]))

	for p in procs: p.wait()

	for cmd in cmds[(6 if len(fqs) > 1 else 3):]:
	sh(cmd).wait()

	return merge_mafs[0][:-4] + ".bam"

	if __name__ == "__main__":

	import argparse

	p = argparse.ArgumentParser(__doc__)
	p.add_argument("--last-ex-dir")
	p.add_argument("--reference", help="reference fasta")
	p.add_argument("--prefix", help="output prefix", default='last-meth')
	p.add_argument("fastqs", nargs="+")

	a = p.parse_args()

	last_meth(a.last_ex_dir, a.fastqs, a.reference, a.prefix)