Last active
December 15, 2015 22:18
-
-
Save BenLangmead/5331666 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def neighbors1mm(kmer, alpha): | |
""" Generate all neighbors at Hamming distance 1 from kmer """ | |
for j in xrange(len(kmer)-1, -1, -1): | |
for c in alpha: | |
if c == kmer[j]: continue | |
yield kmer[:j] + c + kmer[j+1:] | |
def correct1mm(read, k, kmerhist, alpha, thresh): | |
""" Return an error-corrected version of read. k = k-mer length | |
for k-mer count profile. kmerhist maps distinct kmers to their | |
counts. alpha is the alphabet. thresh is threshold above | |
which k-mers are considered "frequent". """ | |
# Iterate over k-mers in read | |
for i in xrange(0, len(read)-(k-1)): | |
kmer = read[i:i+k] | |
# If k-mer is infrequent... | |
if kmerhist.get(kmer, 0) <= thresh: | |
# Look for a frequent neighbor | |
for newkmer in neighbors1mm(kmer, alpha): | |
if kmerhist.get(newkmer, 0) > thresh: | |
# Found a frequent neighbor; replace old kmer | |
# with neighbor | |
read = read[:i] + newkmer + read[i+k:] | |
break | |
# Return possibly-corrected read | |
return read | |
def kmerHist(reads, k): | |
""" Return map from distinct k-mer to # times it occurs """ | |
kmerhist = {} | |
for read in reads: | |
for kmer in [ read[i:i+k] for i in xrange(0, len(read)-(k-1)) ]: | |
kmerhist[kmer] = kmerhist.get(kmer, 0) + 1 | |
return kmerhist | |
def correctAll1mm(reads, k, alpha, thresh=1): | |
""" Return list of error-corrected reads """ | |
khist = kmerHist(reads, k) | |
return [ correct1mm(read, k, khist, alpha, thresh) for read in reads ] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment