galvanic · February 18, 2014 20:34
diff --git a/translateNA.py b/translateNA.py
 from itertools import chain, islice

 """
 Improvements:
 Measure how fast/good this script is. Is it space efficient ?
 No point having fancy programming if it's not faster (fancy bit being the sliding window)
 Maybe take into account the case of letters (if upper or lower case).
 I feel like I could remove some of the repetition.
 """

 with open("codons.txt", "rU") as f:
    file = f.readlines()
 file = [line.strip("\n") for line in file]
 TRANSLATE = dict( zip( file[::2], file[1::2] ) )


 def window(seq, n=3):
    "Returns a sliding window (of width n) over data from the iterable"
    "   s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...                   "
    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result    
    for elem in it:
        result = result[1:] + (elem,)
        yield result


 def translate3ORFs(sequence):
    """
    Input a sequence of type string.
    Outputs a list of the 3 ORFs as strings.
    """

    ORF1, ORF2, ORF3 = "", "", ""
    # translate the sequence, with the obvious ORF, starting at 1st position
    for index, codon in enumerate(window(sequence, 3)):

        codon = "".join(codon)
        if TRANSLATE.has_key(codon):
            aa = TRANSLATE[codon]
        else:
            aa = "-"

        if index % 3 == 0:
            ORF1 += aa
        elif index % 3 == 1:
            ORF2 += aa
        elif index % 3 == 2:
            ORF3 += aa
        else:
            pass

    return [ORF1, ORF2, ORF3]


 def getSequence(filename, verbose=False):

    with open("%s.fasta" % filename, "rU") as f:
        file = f.readlines()

    seqid = file[0].strip(">")
    file = [line.rstrip() for line in file[1:]]
    NAseq = list(chain(*file))
    NAseq = "".join(NAseq)

    if verbose:
        print NAseq, "\n"

    return seqid, NAseq


 def getOppositeStrand(sequence):
    # is this the most efficient way ?!

    sequence.replace("a", "x")
    sequence.replace("t", "a")
    sequence.replace("x", "t")

    sequence.replace("c", "x")
    sequence.replace("g", "c")
    sequence.replace("x", "g")

    return sequence


 def main(filename):

    sequence = getSequence(filename)[1]
    ORF123s = translate3ORFs(sequence)
    ORF456s = translate3ORFs(getOppositeStrand(sequence))
    ORFs = chain(ORF123s, ORF456s)

    return "\n".join(ORFs)


 if __name__ == '__main__':
    print main("sequence")
	from itertools import chain, islice

	"""
	Improvements:
	Measure how fast/good this script is. Is it space efficient ?
	No point having fancy programming if it's not faster (fancy bit being the sliding window)
	Maybe take into account the case of letters (if upper or lower case).
	I feel like I could remove some of the repetition.
	"""

	with open("codons.txt", "rU") as f:
	file = f.readlines()
	file = [line.strip("\n") for line in file]
	TRANSLATE = dict( zip( file[::2], file[1::2] ) )


	def window(seq, n=3):
	"Returns a sliding window (of width n) over data from the iterable"
	" s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ... "
	it = iter(seq)
	result = tuple(islice(it, n))
	if len(result) == n:
	yield result
	for elem in it:
	result = result[1:] + (elem,)
	yield result


	def translate3ORFs(sequence):
	"""
	Input a sequence of type string.
	Outputs a list of the 3 ORFs as strings.
	"""

	ORF1, ORF2, ORF3 = "", "", ""
	# translate the sequence, with the obvious ORF, starting at 1st position
	for index, codon in enumerate(window(sequence, 3)):

	codon = "".join(codon)
	if TRANSLATE.has_key(codon):
	aa = TRANSLATE[codon]
	else:
	aa = "-"

	if index % 3 == 0:
	ORF1 += aa
	elif index % 3 == 1:
	ORF2 += aa
	elif index % 3 == 2:
	ORF3 += aa
	else:
	pass

	return [ORF1, ORF2, ORF3]


	def getSequence(filename, verbose=False):

	with open("%s.fasta" % filename, "rU") as f:
	file = f.readlines()

	seqid = file[0].strip(">")
	file = [line.rstrip() for line in file[1:]]
	NAseq = list(chain(*file))
	NAseq = "".join(NAseq)

	if verbose:
	print NAseq, "\n"

	return seqid, NAseq


	def getOppositeStrand(sequence):
	# is this the most efficient way ?!

	sequence.replace("a", "x")
	sequence.replace("t", "a")
	sequence.replace("x", "t")

	sequence.replace("c", "x")
	sequence.replace("g", "c")
	sequence.replace("x", "g")

	return sequence


	def main(filename):

	sequence = getSequence(filename)[1]
	ORF123s = translate3ORFs(sequence)
	ORF456s = translate3ORFs(getOppositeStrand(sequence))
	ORFs = chain(ORF123s, ORF456s)

	return "\n".join(ORFs)


	if __name__ == '__main__':
	print main("sequence")
No results found