Skip to content

Instantly share code, notes, and snippets.

@brentp
Created October 22, 2010 16:01
Show Gist options
  • Save brentp/640806 to your computer and use it in GitHub Desktop.
Save brentp/640806 to your computer and use it in GitHub Desktop.
filter a fastq to unique reads
"""
Reads a FastQ file from stdin and writes a file with unique records to sdout
usage:
%s < in.fastq > out.unique.fastq
see: http://hackmap.blogspot.com/2010/10/bloom-filter-ing-repeated-reads.html
"""
from bloomfaster import Elf
import collections
import sys
__doc__ %= sys.argv[0]
if len(sys.argv) > 1:
print sys.argv
print __doc__
sys.exit()
records = sum(1 for _ in sys.stdin) / 4
print >>sys.stderr, records, "records in file"
# say 1 out of 1000 is false positive.
bloom = Elf(records, error_rate=1e-3)
sys.stdin.seek(0)
readline = sys.stdin.readline
checks = []
header = readline().rstrip()
while header:
seq = readline().rstrip()
readline(); readline()
if seq in bloom:
checks.append(seq)
bloom.add(seq)
header = readline().rstrip()
# now checks contains anything that could be a duplicate according to
# the bloomfilter. for some, they were false positives.
# for actual duplicated, just choose the first, but can also sort by quality.
sys.stdin.seek(0)
checks = frozenset(checks)
print >>sys.stderr, "checking %s potential duplicates in a python set" \
% len(checks)
putative_false_positives = collections.defaultdict(int)
while True:
header = readline().rstrip()
if not header: break
seq = readline().rstrip()
plus = readline().rstrip()
qual = readline().rstrip()
# it appeared only once, so just print it.
if not seq in checks:
print "\n".join((header, seq, plus, qual))
continue
# it appeared in the bloom-filter > 1x, so track and make sure not
# to print any others with same sequence.
putative_false_positives[seq] += 1
if putative_false_positives[seq] > 1:
continue
print "\n".join((header, seq, plus, qual))
false_positives = sum(1 for count in putative_false_positives.values() \
if count == 1)
print >>sys.stderr, false_positives, "false-positive duplicates in the bloom filter"
$ time python fastq-unique.py < /usr/local/src/methylcode/data/dme/en.wt.1.fastq > unique.fastq
36754782 records in file
checking 2525155 potential duplicates in a python set
4699 false-positive duplicates in the bloom filter
real 11m31.077s
user 11m1.937s
sys 0m23.093s
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment