-
-
Save chapmanb/862240 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class ORFFinder: | |
"""Find the longest ORF in a given sequence | |
"seq" is a string, if "start" is not provided any codon can be the start of | |
and ORF. If muliple ORFs have the longest length the first one encountered | |
is printed | |
""" | |
def __init__(self, seq, start=[], stop=["TAG", "TAA", "TGA"]): | |
self.seq = seq | |
self.start = start | |
self.stop = stop | |
self.result = ("+",0,0,0,0) | |
self.winner = 0 | |
def _reverse_comp(self): | |
swap = {"A":"T", "T":"A", "C":"G", "G":"C"} | |
return "".join(swap[b] for b in self.seq) | |
def _print_current(self): | |
print "frame %s%s position %s:%s (%s nucleotides)" % self.result | |
def codons(self, frame): | |
""" A generator that yields DNA in one codon blocks | |
"frame" counts for 0. This function yelids a tuple (triplet, index) with | |
index relative to the original DNA sequence | |
""" | |
start = frame | |
while start + 3 <= len(self.seq): | |
yield (self.seq[start:start+3], start) | |
start += 3 | |
def run_one(self, frame_number, direction): | |
""" Search in one reading frame """ | |
orf_start = None | |
for c, index in self.codons(frame_number): | |
if (c not in self.stop and (c in self.start or not self.start) | |
and orf_start is None): | |
orf_start = index + 1 # we'll return the result as 1-indexed | |
elif c in self.stop and orf_start: | |
self._update_winner(orf_start, index, direction, frame_number) | |
orf_start = None | |
if orf_start: | |
self._update_winner(orf_start, index, direction, frame_number) | |
def _update_winner(self, orf_start, index, direction, frame_number): | |
orf_end = index + 3 # because index is realitve to start of codons | |
L = (orf_end - orf_start) + 1 | |
if L > self.winner: | |
self.winner = L | |
self.result = (direction, frame_number+1, orf_start, orf_end, L) | |
def run(self): | |
direction = "+" | |
for frame in range(3): | |
self.run_one(frame, direction) | |
direction = "-" | |
self.seq = self._reverse_comp() | |
for frame in range(3): | |
self.run_one(frame, direction) | |
self._print_current() | |
def little_test(): | |
test = ORFFinder("ACGTACGTACGTACGT").run() | |
def bigger_test(): | |
from Bio import SeqIO | |
a = SeqIO.parse("test-in.fa", "fasta") | |
for seq in a: | |
print seq.id | |
ORFFinder(seq.seq.tostring()).run() | |
if __name__ == "__main__": | |
print "\nrunning original test sequence..." | |
little_test() | |
print "\nrunning Brad's test file..." | |
bigger_test() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment