Last active
December 16, 2015 00:59
-
-
Save BenLangmead/5351690 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import sys | |
import string | |
import numpy | |
import math | |
def fastaKmerParser(fns, k): | |
""" Parse all (overlapping) k-mers of length k from all the FASTA | |
filenames provided. """ | |
for fn in fns: | |
with open(fn, 'r') as fh: | |
kmer = [] | |
off = 0 | |
for ln in fh: | |
if ln[0] == '>': | |
kmer = [] | |
off = 0 | |
else: | |
for c in ln: | |
if c.isalpha(): | |
if len(kmer) == k: | |
kmer.pop(0) | |
kmer.append(c.upper()) | |
off += 1 | |
if len(kmer) == k: | |
yield (''.join(kmer), off - k) | |
def fastaKmerParserIslands(fns, k, isles): | |
""" Yield kmer, boolean pairs where boolean indicates whether the | |
kmer lies entirely within an island or not """ | |
curIsland = 0 | |
for kmer, off in fastaKmerParser(fns, k): | |
while curIsland < len(isles) and off >= isles[curIsland][1]: | |
curIsland += 1 | |
if curIsland < len(isles) and off >= isles[curIsland][0]: | |
if off + k <= isles[curIsland][1]: | |
yield kmer, "island" | |
else: | |
yield kmer, "non-island" | |
def parseIslands(fn): | |
""" Parse a file with island annotations, per the output from Hao's | |
model-based approach: http://rafalab.jhsph.edu/CGI/ """ | |
isles = [] | |
with open(fn, 'r') as fh: | |
for ln in fh: | |
chr, st, en, _ = string.split(ln, '\t', 3) | |
st, en = int(st), int(en) | |
isles.append((st-1, en)) | |
return isles | |
def islandDinucs(fns, ifn, stopAfter=None): | |
""" Compile dinucleotide count tables for dinucleotides inside or | |
outside CpG islands """ | |
hist = {} | |
n = 0 | |
isles = parseIslands(ifn) | |
# fastaKmerParserIslands parses all dinucleotides from the FASTA | |
# file; returns the k-mer along with a label stating whether kmer | |
# came from inside or outside an island | |
for kmer, lab in fastaKmerParserIslands(fns, 2, isles): | |
if lab not in hist: hist[lab] = {} | |
hist[lab][kmer] = hist[lab].get(kmer, 0) + 1 | |
n += 1 | |
if stopAfter is not None and n >= stopAfter: | |
break # stop early; helpful for testing | |
return hist | |
def islandTransitionTables(fns, ifn, stopAfter=None): | |
""" Given FASTA filenames and island annotation filename, obtain | |
the island / non-island dinucleotide frequency tables. Turn | |
them into transition probabilities for the island & non-island | |
markov chains. Also, make log ratio table. """ | |
hist = islandDinucs(fns, ifn, stopAfter) | |
iTab = numpy.zeros(shape=(4, 4), dtype=float) | |
nTab = numpy.zeros(shape=(4, 4), dtype=float) | |
for i in xrange(0, 4): | |
srcc = "ACGT"[i] | |
itot, ntot = 0, 0 | |
for j in xrange(0, 4): | |
dstc = "ACGT"[j] | |
itot += hist["island"].get(srcc + dstc, 0) | |
ntot += hist["non-island"].get(srcc + dstc, 0) | |
assert itot > 0 | |
assert ntot > 0 | |
for j in xrange(0, 4): | |
dstc = "ACGT"[j] | |
icnt = hist["island"].get(srcc + dstc, 0) | |
ncnt = hist["non-island"].get(srcc + dstc, 0) | |
iTab[i, j] = icnt / float(itot) | |
nTab[i, j] = ncnt / float(ntot) | |
return iTab, nTab | |
def classify(seq, lrTab): | |
""" Classify seq using the already-computed table of dinucleotide | |
log ratios """ | |
bits = 0 | |
nucmap = { 'A':0, 'C':1, 'G':2, 'T':3 } | |
for dinuc in [ seq[i:i+2] for i in xrange(0, len(seq)-1) ]: | |
i, j = nucmap[dinuc[0]], nucmap[dinuc[1]] | |
bits += lrTab[i, j] | |
return bits |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment