Skip to content

Instantly share code, notes, and snippets.

@mgalardini
Created December 1, 2017 11:55
Show Gist options
  • Save mgalardini/4bc16dc0101aaacce675d719579cee66 to your computer and use it in GitHub Desktop.
Save mgalardini/4bc16dc0101aaacce675d719579cee66 to your computer and use it in GitHub Desktop.
Find exact matches in fastq files
#!/usr/bin/env python
def get_options():
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('reference',
help='Reference fasta file')
parser.add_argument('reads',
nargs='*',
help='Reads FASTQ file')
parser.add_argument('--quality',
type=int,
default=30,
help='Minimum Phred score for matches [Default: 30]')
parser.add_argument('--split',
default='_L001',
help='String used to split the first file name [Default: "_L001"]')
return parser.parse_args()
if __name__ == "__main__":
options = get_options()
import os
import gzip
from Bio import SeqIO
hap = {x.id: x for x in SeqIO.parse(options.reference, 'fasta')}
c = {}
i = 0
for f in options.reads:
for s in SeqIO.parse(gzip.open(f), 'fastq'):
i += 1
hits = set()
for k,v in hap.items():
c[k] = c.get(k, 0)
f = s.seq.find(v.seq)
r = s.reverse_complement().seq.find(v.seq)
if f != -1:
ss = s[f:f+len(v)]
elif r != -1:
ss = s.reverse_complement()[r:r+len(v)]
else:
continue
if min(ss.letter_annotations['phred_quality']) < options.quality:
continue
hits.add(k)
if len(hits) == 1:
c[hits.pop()] += 1
for k in sorted(c):
print('%s\t%s\t%d\t%.5f' % (os.path.split(options.reads[0])[-1].split(options.split)[0],
k, c[k], float(c[k])/i))
print('%s\tdiscarded\t%d\t%.3f' % (os.path.split(options.reads[0])[-1].split(options.split)[0],
i - sum(c.values()), float(i - sum(c.values()))/i))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment