Created
March 9, 2011 07:18
-
-
Save dwinter/861818 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class ORFFinder: | |
"""Find the longest ORF in a given sequence | |
"seq" is a string, if "start" is not provided any codon can be the start of | |
and ORF. If muliple ORFs have the longest length the first one encountered | |
is printed | |
""" | |
def __init__(self, seq, start=[], stop=["TAG", "TAA", "TGA"]): | |
self.seq = seq | |
self.start = start | |
self.stop = stop | |
self.result = ("+",0,0,0,0) | |
self.winner = 0 | |
def _reverse_comp(self): | |
swap = {"A":"T", "T":"A", "C":"G", "G":"C"} | |
return "".join(swap[b] for b in self.seq) | |
def _print_current(self): | |
print "frame %s%s position %s:%s (%s nucleotides)" % self.result | |
def codons(self, frame): | |
""" A generator that yields DNA in one codon blocks | |
"frame" counts for 0. This function yelids a tuple (triplet, index) with | |
index relative to the original DNA sequence | |
""" | |
start = frame | |
while start + 3 <= len(self.seq): | |
yield (self.seq[start:start+3], start) | |
start += 3 | |
def run_one(self, frame_number, direction): | |
""" Search in one reading frame """ | |
codon_gen = self.codons(frame_number) | |
while True: | |
try: | |
c , index = codon_gen.next() | |
except StopIteration: | |
break | |
# Lots of conditions here: checks if we care about looking for start | |
# codon then that codon is not a stop | |
if c in self.start or not self.start and c not in self.stop: | |
orf_start = index + 1 # we'll return the result as 1-indexed | |
end = False | |
while True: | |
try: | |
c, index = codon_gen.next() | |
except StopIteration: | |
end = True | |
if c in self.stop: | |
end = True | |
if end: | |
orf_end = index + 3 # because index is realitve to start of codon | |
L = (orf_end - orf_start) + 1 | |
if L > self.winner: | |
self.winner = L | |
self.result = (direction, frame_number+1, orf_start, orf_end, L) | |
break | |
def run(self): | |
direction = "+" | |
for frame in range(3): | |
self.run_one(frame, direction) | |
direction = "-" | |
self.seq = self._reverse_comp() | |
for frame in range(3): | |
self.run_one(frame, direction) | |
self._print_current() | |
def little_test(): | |
test = ORFFinder("ACGTACGTACGTACGT").run() | |
def bigger_test(): | |
from Bio import SeqIO | |
a = SeqIO.parse("test.fasta", "fasta") | |
for seq in a: | |
print seq.id | |
ORFFinder(seq.seq.tostring()).run() | |
if __name__ == "__main__": | |
print "\nrunning original test sequence..." | |
little_test() | |
print "\nrunning Brad's test file..." | |
bigger_test() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment