Skip to content

Instantly share code, notes, and snippets.

@galvanic
Created February 18, 2014 20:34
Show Gist options
  • Select an option

  • Save galvanic/9079474 to your computer and use it in GitHub Desktop.

Select an option

Save galvanic/9079474 to your computer and use it in GitHub Desktop.
Translate a nucleic acid sequence into all 6 open reading frames
from itertools import chain, islice
"""
Improvements:
Measure how fast/good this script is. Is it space efficient ?
No point having fancy programming if it's not faster (fancy bit being the sliding window)
Maybe take into account the case of letters (if upper or lower case).
I feel like I could remove some of the repetition.
"""
with open("codons.txt", "rU") as f:
file = f.readlines()
file = [line.strip("\n") for line in file]
TRANSLATE = dict( zip( file[::2], file[1::2] ) )
def window(seq, n=3):
"Returns a sliding window (of width n) over data from the iterable"
" s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ... "
it = iter(seq)
result = tuple(islice(it, n))
if len(result) == n:
yield result
for elem in it:
result = result[1:] + (elem,)
yield result
def translate3ORFs(sequence):
"""
Input a sequence of type string.
Outputs a list of the 3 ORFs as strings.
"""
ORF1, ORF2, ORF3 = "", "", ""
# translate the sequence, with the obvious ORF, starting at 1st position
for index, codon in enumerate(window(sequence, 3)):
codon = "".join(codon)
if TRANSLATE.has_key(codon):
aa = TRANSLATE[codon]
else:
aa = "-"
if index % 3 == 0:
ORF1 += aa
elif index % 3 == 1:
ORF2 += aa
elif index % 3 == 2:
ORF3 += aa
else:
pass
return [ORF1, ORF2, ORF3]
def getSequence(filename, verbose=False):
with open("%s.fasta" % filename, "rU") as f:
file = f.readlines()
seqid = file[0].strip(">")
file = [line.rstrip() for line in file[1:]]
NAseq = list(chain(*file))
NAseq = "".join(NAseq)
if verbose:
print NAseq, "\n"
return seqid, NAseq
def getOppositeStrand(sequence):
# is this the most efficient way ?!
sequence.replace("a", "x")
sequence.replace("t", "a")
sequence.replace("x", "t")
sequence.replace("c", "x")
sequence.replace("g", "c")
sequence.replace("x", "g")
return sequence
def main(filename):
sequence = getSequence(filename)[1]
ORF123s = translate3ORFs(sequence)
ORF456s = translate3ORFs(getOppositeStrand(sequence))
ORFs = chain(ORF123s, ORF456s)
return "\n".join(ORFs)
if __name__ == '__main__':
print main("sequence")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment