Last active
August 15, 2016 21:09
-
-
Save brwnj/3861828 to your computer and use it in GitHub Desktop.
join paired-end or print unique single-end reads
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# encoding: utf-8 | |
""" | |
Join R1 and R2, or print unpaired reads of R1 or R2 | |
This script assumes there is whitespace separating the readname from the pair number: | |
> < | |
@HW-ST997:140:D0UA5ACXX:8:1101:1497:2138 1:N:0:ATCACG | |
@HW-ST997:140:D0UA5ACXX:8:1101:1497:2138 2:N:0:ATCACG | |
> < | |
""" | |
import sys | |
import bz2 | |
import gzip | |
import string | |
import os.path as op | |
from itertools import islice | |
from toolshed import nopen | |
__author__ = "Joe Brown" | |
__author_email__ = "[email protected]" | |
COMPLEMENT = string.maketrans('ACGTNSRYMKWHBVD','TGCANSRYMKWHBVD') | |
def read_fastq(fq): | |
r"""fastq parser that returns name, seq, and qual.""" | |
while True: | |
values = list(islice(fq, 4)) | |
if len(values) == 4: | |
id1, seq, id2, qual = values | |
elif len(values) == 0: | |
raise StopIteration | |
else: | |
raise EOFError("unexpected end of file") | |
assert id1.startswith('@'),\ | |
">> Fastq out of sync at read:\n%s\n" % id1 | |
assert id2.startswith('+'),\ | |
">> Fastq out of sync at read:\n%s\n" % id1 | |
assert len(seq) == len(qual),\ | |
">> Sequence and Quality are not the same length \ | |
for read:\n%s\n" % id1 | |
yield id1[1:-1], seq[:-1], qual[:-1] | |
def reversecomplement(seq): | |
"""return reverse complement of seq.""" | |
return seq.translate(COMPLEMENT)[::-1] | |
def fastqtodict(fastq): | |
"""returns dict of read name to sequence""" | |
fdict = {} | |
with nopen(fastq) as fq: | |
for name, seq, qual in read_fastq(fq): | |
# explicitly state space to facilitate future changes | |
fdict[name.split(" ")[0]] = seq | |
return fdict | |
def main(args): | |
# set dictionary based on mode | |
if args.mode == "R2": | |
fastqdict = fastqtodict(args.R1) | |
fastq = args.R2 | |
else: | |
fastqdict = fastqtodict(args.R2) | |
fastq = args.R1 | |
with nopen(fastq) as fq: | |
for name, seq, qual in read_fastq(fq): | |
try: | |
# explicitly state space to facilitate future changes | |
name = name.split(" ")[0] | |
cseq = fastqdict.get(name) | |
if args.reverse_complement: | |
cseq = reversecomplement(cseq) | |
print "@%s\n%s%s" % (name, seq, cseq) | |
except KeyError: | |
# without pairs | |
if not args.mode == "paired": | |
print "@%s\n%s" % (name, seq) | |
if __name__ == "__main__": | |
import argparse | |
p = argparse.ArgumentParser(description=__doc__, | |
formatter_class=argparse.RawDescriptionHelpFormatter) | |
p.add_argument('R1', help="fastq of R1") | |
p.add_argument('R2', help="fastq of R2") | |
p.add_argument("--mode", "-m", choices=["paired", "R1", "R2"], | |
default="paired", help="prints <mode> to stdout where \ | |
R1 is equivalent to reads without a pair in R2 [%(default)s]") | |
p.add_argument("--reverse_complement", "-r", action="store_false", | |
default=True, help="reverse complement R2 when joining [%(default)s]") | |
args = p.parse_args() | |
main(args) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment