Skip to content

Instantly share code, notes, and snippets.

@audy
Created September 8, 2010 16:25
Show Gist options
  • Select an option

  • Save audy/570368 to your computer and use it in GitHub Desktop.

Select an option

Save audy/570368 to your computer and use it in GitHub Desktop.
DESCRIPTION = '''
Duplicate filtering for FASTQ/FASTA data.
If there are at least N duplicates in X basepairs, discard the read (in both pairs).
usage python filter.py <sequences> <N> <X>
'''
import sys
from itertools import izip
from collections import defaultdict
import string
_complement = string.maketrans('GATCRYgatcry','CTAGYRctagyr')
def duplicate_filter(*args, **kwargs):
''' removes sequences that have duplicates
from paired-end illumina data '''
hashes = defaultdict(int)
count = 0
county = 0
filename = kwargs['filename']
with open(filename) as handle:
for record in Fasta(handle, filetype='fasta'):
county += 1
if record.seq in hashes:
# do not print
count += 1
continue
else:
print record
hashes[record.seq] = 1
print >> sys.stderr, 'removed %s out of %s records' % (count, county)
class Fasta:
''' iterates through a fastq file, returning dnaobj objects '''
def __init__(self, *args, **kwargs):
try:
self.filetype = kwargs['filetype']
except KeyError:
raise Exception, 'use: Fasta(handle, filetype=\'fastq or fastq\')'
self.handle = args[0]
self.county = 0
def __iter__(self):
if self.filetype == 'fastq':
counter = 0
rec = { 0: '', 1: '', 2: '', 3: '' }
for line in self.handle:
if line.strip() == '': continue
if counter < 3:
rec[counter] = line.strip()
counter += 1
elif counter == 3:
rec[counter] = line.strip()
counter = 0
yield Dna(rec[0], rec[1], rec[3])
elif self.filetype == 'fasta':
header, sequence = '', []
for line in self.handle:
if line[0] == '>':
if sequence: yield Dna(header, '\n'.join(sequence))
header = line
sequence = []
else:
sequence.append(line.strip())
yield Dna(header, '\n'.join(sequence))
class Dna:
''' An object representing either a FASTA or FASTQ record '''
def __init__(self, header, sequence, quality = False):
self.header = header[1:-1]
self.seq = sequence
self.qual = quality
if quality:
self.type = 'fastq'
else:
self.type = 'fasta'
if self.type == 'fastq' and len(self.seq) != len(self.qual):
raise IOError, \
'Seq length and qual length do not agree: %s' % (self.header)
def __str__(self):
''' returns a FASTA/Q formatted string '''
if not self.qual:
return ('>%s\n%s') % \
(self.header, self.seq)
else:
return('@%s\n%s\n+%s\n%s') % \
(self.header, self.seq, self.header, self.qual)
def __len__(self):
return len(''.join(self.seq))
def __repr__(self):
return '<dnaobj.%s instance: %s>' % (self.type, self.header)
@property
def complement(self):
''' returns complement of sequence '''
return self.seq.translate(_complement)
@property
def revcomp(self):
''' returns reverse complement of sequence '''
return self.complement[::-1]
@property
def rqual(self):
''' returns reverse quality'''
return self.qual[::-1]
if __name__ == '__main__':
from sys import argv
filename = argv[1]
duplicate_filter(filename=filename)
#!/usr/bin/env python
# Counts total and unique sequences in an unwrapped FASTA file.
# Austin G. Davis-Richardson
import sys
hashes = []
with open(sys.argv[1]) as handle:
for line in handle:
if line.startswith('>'): continue
else:
hashes.append(hash(line.strip()))
print 'total: %s' % len(hashes)
print 'unique: %s' % len(set(hashes))
@audy

audy commented Sep 8, 2010

Copy link
Copy Markdown
Author

#10% of RDP (rdp.cme.msu.edu) is redundant?

$ python rd.py rdp_arch_bac_isolatesonly_198573.fa 
total: 198573
unique: 178477

Then do: 1 - 178477/198574 = 10.12% (!!!)

A more elaborate script yields a different answer:
$ python duplicate_filter.py rdp_arch_bac_isolatesonly_198573.fa > /dev/null
removed 20096 out of 198573 records

~5%

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment