brwnj · August 15, 2016 21:09
diff --git a/joinreads.py b/joinreads.py
 #!/usr/bin/env python
 # encoding: utf-8
 """
 Join R1 and R2, or print unpaired reads of R1 or R2

 This script assumes there is whitespace separating the readname from the pair number:

                                       > <
 @HW-ST997:140:D0UA5ACXX:8:1101:1497:2138 1:N:0:ATCACG
 @HW-ST997:140:D0UA5ACXX:8:1101:1497:2138 2:N:0:ATCACG
                                       > <
 """
 import sys
 import bz2
 import gzip
 import string
 import os.path as op
 from itertools import islice
 from toolshed import nopen

 __author__ = "Joe Brown"
 __author_email__ = "[email protected]"

 COMPLEMENT = string.maketrans('ACGTNSRYMKWHBVD','TGCANSRYMKWHBVD')

 def read_fastq(fq):
    r"""fastq parser that returns name, seq, and qual."""
    while True:
        values = list(islice(fq, 4))
        if len(values) == 4:
            id1, seq, id2, qual = values
        elif len(values) == 0:
            raise StopIteration
        else:
            raise EOFError("unexpected end of file")
        assert id1.startswith('@'),\
                ">> Fastq out of sync at read:\n%s\n" % id1
        assert id2.startswith('+'),\
                ">> Fastq out of sync at read:\n%s\n" % id1
        assert len(seq) == len(qual),\
                ">> Sequence and Quality are not the same length \
                for read:\n%s\n" % id1
        yield id1[1:-1], seq[:-1], qual[:-1]

 def reversecomplement(seq):
    """return reverse complement of seq."""
    return seq.translate(COMPLEMENT)[::-1]

 def fastqtodict(fastq):
    """returns dict of read name to sequence"""
    fdict = {}
    with nopen(fastq) as fq:
        for name, seq, qual in read_fastq(fq):
            # explicitly state space to facilitate future changes
            fdict[name.split(" ")[0]] = seq
    return fdict

 def main(args):
    # set dictionary based on mode
    if args.mode == "R2":
        fastqdict = fastqtodict(args.R1)
        fastq = args.R2
    else:
        fastqdict = fastqtodict(args.R2)
        fastq = args.R1
    
    with nopen(fastq) as fq:
        for name, seq, qual in read_fastq(fq):
            try:
                # explicitly state space to facilitate future changes
                name = name.split(" ")[0]
                cseq = fastqdict.get(name)
                if args.reverse_complement:
                    cseq = reversecomplement(cseq)
                print "@%s\n%s%s" % (name, seq, cseq)
            except KeyError:
                # without pairs
                if not args.mode == "paired":
                    print "@%s\n%s" % (name, seq)

 if __name__ == "__main__":
    import argparse
    p = argparse.ArgumentParser(description=__doc__,
                    formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument('R1', help="fastq of R1")
    p.add_argument('R2', help="fastq of R2")
    p.add_argument("--mode", "-m", choices=["paired", "R1", "R2"], 
                    default="paired", help="prints <mode> to stdout where \
                        R1 is equivalent to reads without a pair in R2 [%(default)s]")
    p.add_argument("--reverse_complement", "-r", action="store_false", 
                    default=True, help="reverse complement R2 when joining [%(default)s]")
    args = p.parse_args()
    main(args)
	#!/usr/bin/env python
	# encoding: utf-8
	"""
	Join R1 and R2, or print unpaired reads of R1 or R2

	This script assumes there is whitespace separating the readname from the pair number:

	> <
	@HW-ST997:140:D0UA5ACXX:8:1101:1497:2138 1:N:0:ATCACG
	@HW-ST997:140:D0UA5ACXX:8:1101:1497:2138 2:N:0:ATCACG
	> <
	"""
	import sys
	import bz2
	import gzip
	import string
	import os.path as op
	from itertools import islice
	from toolshed import nopen

	__author__ = "Joe Brown"
	__author_email__ = "[email protected]"

	COMPLEMENT = string.maketrans('ACGTNSRYMKWHBVD','TGCANSRYMKWHBVD')

	def read_fastq(fq):
	r"""fastq parser that returns name, seq, and qual."""
	while True:
	values = list(islice(fq, 4))
	if len(values) == 4:
	id1, seq, id2, qual = values
	elif len(values) == 0:
	raise StopIteration
	else:
	raise EOFError("unexpected end of file")
	assert id1.startswith('@'),\
	">> Fastq out of sync at read:\n%s\n" % id1
	assert id2.startswith('+'),\
	">> Fastq out of sync at read:\n%s\n" % id1
	assert len(seq) == len(qual),\
	">> Sequence and Quality are not the same length \
	for read:\n%s\n" % id1
	yield id1[1:-1], seq[:-1], qual[:-1]

	def reversecomplement(seq):
	"""return reverse complement of seq."""
	return seq.translate(COMPLEMENT)[::-1]

	def fastqtodict(fastq):
	"""returns dict of read name to sequence"""
	fdict = {}
	with nopen(fastq) as fq:
	for name, seq, qual in read_fastq(fq):
	# explicitly state space to facilitate future changes
	fdict[name.split(" ")[0]] = seq
	return fdict

	def main(args):
	# set dictionary based on mode
	if args.mode == "R2":
	fastqdict = fastqtodict(args.R1)
	fastq = args.R2
	else:
	fastqdict = fastqtodict(args.R2)
	fastq = args.R1

	with nopen(fastq) as fq:
	for name, seq, qual in read_fastq(fq):
	try:
	# explicitly state space to facilitate future changes
	name = name.split(" ")[0]
	cseq = fastqdict.get(name)
	if args.reverse_complement:
	cseq = reversecomplement(cseq)
	print "@%s\n%s%s" % (name, seq, cseq)
	except KeyError:
	# without pairs
	if not args.mode == "paired":
	print "@%s\n%s" % (name, seq)

	if __name__ == "__main__":
	import argparse
	p = argparse.ArgumentParser(description=__doc__,
	formatter_class=argparse.RawDescriptionHelpFormatter)
	p.add_argument('R1', help="fastq of R1")
	p.add_argument('R2', help="fastq of R2")
	p.add_argument("--mode", "-m", choices=["paired", "R1", "R2"],
	default="paired", help="prints <mode> to stdout where \
	R1 is equivalent to reads without a pair in R2 [%(default)s]")
	p.add_argument("--reverse_complement", "-r", action="store_false",
	default=True, help="reverse complement R2 when joining [%(default)s]")
	args = p.parse_args()
	main(args)
No results found