Created
August 10, 2016 09:22
-
-
Save diallobakary4/2d1ff8d4dd502e36ba80abb6b160fd06 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# Protein Translation Problem: Translate an RNA string into an amino acid string. | |
# Input: An RNA string Pattern and the array GeneticCode. | |
# Output: The translation of Pattern into an amino acid string Peptide. | |
from collections import defaultdict | |
import itertools | |
# RNA codon -> one-letter amino-acid code.
# The three codons UAA, UAG and UGA map to a single space character.
genetic_code = {
    "ACC": "T", "GCA": "A", "AAG": "K", "AAA": "K",
    "GUU": "V", "AAC": "N", "AGG": "R", "UGG": "W",
    "GUC": "V", "AGC": "S", "ACA": "T", "AGA": "R",
    "AAU": "N", "ACU": "T", "GUG": "V", "CAC": "H",
    "ACG": "T", "AGU": "S", "CCA": "P", "CAA": "Q",
    "CCC": "P", "UGU": "C", "GGU": "G", "UCU": "S",
    "GCG": "A", "CGA": "R", "CAG": "Q", "CGC": "R",
    "UAU": "Y", "CGG": "R", "UCG": "S", "CCU": "P",
    "GGG": "G", "GGA": "G", "GGC": "G", "CCG": "P",
    "UCC": "S", "UAC": "Y", "CGU": "R", "GAA": "E",
    "AUA": "I", "AUC": "I", "CUU": "L", "UCA": "S",
    "AUG": "M", "UGA": " ", "CUG": "L", "GAG": "E",
    "AUU": "I", "CAU": "H", "CUA": "L", "UAA": " ",
    "GCC": "A", "UUU": "F", "GAC": "D", "GUA": "V",
    "UGC": "C", "GCU": "A", "UAG": " ", "CUC": "L",
    "UUG": "L", "UUA": "L", "GAU": "D", "UUC": "F",
}
# Input: any sequence string (RNA for example)
# Output: a list of its constitutive codons (triplets by default)
def seq_to_codon(seq, n=3):
    """Split ``seq`` into consecutive, non-overlapping chunks of ``n`` symbols.

    Args:
        seq: The sequence to split (any sliceable sequence, e.g. a string).
        n: Chunk length; defaults to 3, i.e. codons.

    Returns:
        A list of slices of ``seq`` in their original order.  When
        ``len(seq)`` is not a multiple of ``n`` the last chunk is shorter
        than ``n``; an empty ``seq`` yields an empty list.
    """
    return [seq[start:start + n] for start in range(0, len(seq), n)]
RNA_seq = "AUGGGAAUGACUGUCCUCAAUCGUAUCCCCGGGCUAACGUCUACGAUGGUAAAGCCCAAUGCAGUCCCAAUUCUCUCCAACAGAGGUACGAAGACUCCAAUUUAUAUUAAUCAGUUACGAAGCAGCCGGCAGUUAUCCUGCAUGUGUGAUUAUCACUGGGUUGCCUAUGGGAAGCUAUGUGUUAUCAUGACCCGCCCAGAUAGCUCCAUGUCUUCCGUAGCGGGAGAAGCUUGGAUAGCUGCAGUCGGGCCGUUGGGCCAGCGCGGAACGGAUUCUGUACUAAGAUGCACUGUAGGAACUUCACGGCCUGUCACUCUCUGGCAUGUCGGGACACAUACACAUAAUGCAAAUCUUCAAACGAAUGUUGGAGCGGUUCGAGAUCUACAAUUGCUGUUAACGAUUCGAUUGCGUUUUGGCGGCACAGGAUAUGCCCUGGGCGCGUGUUCCUCGCCGACUCGUUUAACUAAAAGUGAACCUGGCCUUGUGGUGUCGCAUGCCCUCGUUUCGAGAUCUGGCGGGAUGCGCACACUGGAAGCACGCAUGACUCUGCCUAGAAAUGCCGAUAGUUGCUUUAUCCCCCACAAAAUAGCGUGCCCGUCGCCGAGCGCGCUAGAAACAGUCACCGAAACGUGGCGCACCCCCAAAGCUAGGACCAAGAGCGCCCGAGAGAAUGUUCACACCAUCGUAGGCGAACAAAUUUACCACGAUAACGGUGCACAGCGUUGGAUCUCUCCGUCUCAGGCAACCAACCAACAAUUGGCUGGAUGGCAGGGUAACGGAGGGUGUACCUAUACAUUAGUGUGCAUUUACUGUUUACGAAGUGGGAGUGGCUUGGGCGAACUCAACCAAGUAGAGGCGAGCUUCUCGUCGCUGCCUCACAACGCAAUGGAGGUAGCGUUCAACGAAUGUGGCAAUGACCGGAGUAGCGAAUGCGCCAAUGUGAGACCACCGCAAUGGUCUGGGUUGAUGCACAGAGUGUCGCUUGGUCUGGGGGAUUAUAUAAUGACAGAACAGCAAUCGGGGGUUGAUACAAAUUUUCUUCCAGGGAACGACGGUGUUAAGAAUAUAGAGAGAAUAAAGACCAGGGACAGUGUGGUUCUCGGGAGGCCUACGUCGACCCCGAGGUAUCAUCGACGCAGACCAAAAGUCCACUACAUAGAUCGCAUACCCAUUGGAGUUCAUGAGCGCGCGGUUCGACGCAUGGCAGCUAGCACGCUUGUAUGCUCAAACCCUAUGGUUUAUCGGCUCGGAAUGAGGGGAAUCGCGAAAUUGAGAUUGGUGCGAAUGUGGAUUGACUUUGGAUACGGUCCGCGCGCACGCAUAAUCUUGGGACCACCUCUAGGCCUCAUCAGCAUCUCGUCCGGCGAUGUGCUUAUCCUCAAGCGAUCGCAAGCAUGUGGGUCCCUCUGUUAUGCGAUUUUGAGCGCACCAAUAGGGACCUGGUAUGAGUUGUUUUUUCCUACCCUAAUCGAGGAACUAAAAGACACGCACACGUUGAACCCAGAGACACGUAGCCUCUUAAGCAGGGGACCGGGGGAGGCUUUCAUUUCAUGCCGCUCCUUGGCGGUACGUCGAAUAGGUUACUGCACUGGCACCACCCCAACUCCACGGGUUAACUACCGUGAGACGACCCAUAAACAUUAUACUACUGUGCGCUGGAUCAACCGGACGGUAUGUCGUCCAGACCAAGACUUCGGCGCUUACGCCAACAGCGCGGCAUUUGUUGCUAGGAAAUCAGCCCUAAUGCGGCCGCGGACUUCAGUAGUGCCACAUUAUCCCAAUUAUUCUGCACGUCAGACAGUGGAGACUUUUGGCUUGUCCGGGCGGACACGUCUGAGCCUAAAGACGCAGUACCGCACAAACUUUCCGGCCGGUCCACUAUUAGUUAUUAUCGCGGAAACGUGUCAAAAUGGGGUACGGGGGAAUUGGUUGAUGGGACAGCCAGACCAAGCCACCACUCUAGAACAGGGCUACGAAGAUCCCAGCAGG
AUAACAUAUGCGUUACAUCACGAAGUCAUAUCAGAAGACAGAGAGACCACCAAUUGUGGGGAAAAACUCAUGGGACAUAGCCUAGCUUUACAAGCUCACCAUCUACAUCCCCUAGUAAUUCUGCCUGUCCGGUUUACUGUCGAUCAUGAUUUGCAUAGACUGCACUCAACCACCUACAAUGAUCCAGAAAUCCUACGCCAGACCCCUCUACUAGGAAUGCUGUUCCGGUGCCCUGGUCUCAUCAGCUACGCGUUACCACUGGGGAUGAUCACAGCUAUCCCCUAUUAUGUCUGCCAGCAGUGGGGAGGAAACGGGAUUUUUGUCUCAAGAAUCUCAUCUGACGACGGUACCGAGUUUAGGGGUCCUCAUGAUUUCUUAGGCAGAGGCAAGUUCCCUAGUCCUCGCGGUCUGUUAAUCACUUUGUACGUCGUAGGUGGCGGGAACUUAGUAUGUAUCCGGUUGCGCGGGCCUCAGGAUAUAAUUAGCAAGGAUCCACGACUCUUGGCUGUAAGAACGUCUUCAGGUGGCUUGUCAGCAAUCACACUGCCUGCGUGCGCAUCGUUACUGCGGCCAACCGUAGUUAUUGGUGGCUACAAUACAUCGUACGAUCCCGCUCCCAAGGUUACAGGCCGGGGCUACAUACGGAUAAGUGCACGUCGCGAAGUGGGGCUGAUUCGUAUCCUUCGUGAGUUGUCUGAGACGGGGACACGUGCCAAUCACGGAAAAGAGACUUUCCAUCUGAUGCUCUUCAUGCAAUUCAAGGGGGGCCGUCAUCACGGAGGGUUCGAGGUCUUUGUAGAUACCCACACUCUUGAACGUCCCGCCGCAUGUAUCUACUUUAUGCAGAGUCGGCGCCCUCCUCUCAGACUAUCGGUCCCCUGCCCAAUCUUCCACCAUCGAUAUAUGGGCAUAAUUUGUGAGGAACCCCGGAAGGUGUGGCGUUCUAUUUUGUAUCCUCUCUGGCUAUUCUUCACGGCGUUAACAACCGUGGGAGUCGACAACAUAAAUCGAAUGAGAGGAGAGAACGUCAGGGACCGUCAUUCCGGGUCUUGGCAAAACCUAAUACGUUUCAGUAUGUCUAAUUCAAGCCCCUCUUCUAAGAAAGUACCUUCAGAGACAAUCCUUUGUCCGAGAAAUAUUGUUGGCUGCUACGCAUGCGUGCCUUCCAGGUUUCAGGGACUUUGGGGUCAUCACCGAGUUUUUGCACGGUAUAGCGCUGGACCCAUACCUAUGUCCACUUCAGUUAAGCAAAACUUGGGUAUGAGGUUGGGUUGGCAAGGCUGGCCAGGCCCUAAGCAACUAUUUCUUAAGAUAUACGGUGAAUCUUGCUGGGAUGCUGACUUACCACUCGGUGCGCAUUCCGCUUCGUGGCACAUCACAGAACGAGCACACAAUGAUAGGGUCAGUGCACCAGCGGGUGCUUUUCCCGUGGUGUCAAUUGGUUGGCUAGAAAAGUGUUAUAUCGCGUGGGUCGGCAUAUCGCGGUAUGCUGUCCAUGGCAAAAGGAGGAUGACUAGACGCCCCACGCCAGGUCCAGGACGGGGUUUUUACCUCUACAAUAAUGAGCGCGAGACCGCGCUGUCUAUCCGCUUCUCGCCAACGACACCUACGUUAGAACAGUUUCCCAGGUACUGUUGCGAAGGGGGGAGAAGCUCUCAGAUGGCGAGCUUGUCUAAGAGGGAAUGUUGCUCCACGCUUCCCGGAGGGACCCUGCUUAGAGGAGUGCCAAAACUUCUACGCAAUCCAGUCGAGUGCCUCAGUGCAUGUGUAAGGACAGAAAUACUAACUAAGACAUAUCGCUAUAGCGCUGGUUACGUGAUUAUCCCCAGCCGGGAACUUGAUCCAAGCUAUGGCAACGUCGGUCCGGUGAGGAACUCAUGCGGUAACUGGCAGCGCUCUUACGGGAGAGUAGGCGCCAUACACGAAAGCUUUCCCAAGCAUGACACAGAAAGCGUACGGGGGGCCCCUGAUUGUCGUCGUGUGUGCCUCCGACGCUACCU
UGCUGGCGUCGUAGAGGUGUGUAAUAGUAACGGCGAGAUCUAUCAGAACCAACAAAACGACUGGAUUCCCGGGUCAUUCAUCGUACGACUACCGACUACCCGAGUAACCUUGAGUGUAGACCGCGCGGUUUGUGCUACGUGGUCGAAUCCAGUUCCGCAAACUGCAUUUGAGCAGGGCAUUGAACACCAUAGAAUAUGGUCGGAGAGGUGGCCGUGGAUAACGAUGCGGAUUUGUCUUUCCGGUACGGUGUACUUAAGCCUCAGUGCGCUUAAACAACAAAGCGUACACAUCAGGAAACACCACAGGUAUACGAAAGGAAACCCCUAUGCCAAAUGCUCAUGUAUUCGGCGCUCUCGUGCCCGAUAUGGGAAGCCCUUAAUUGGCGUUUCGCAUUUAAGAGAAACGCGUCAGAUGUUGAGGUCCGCAAUAGAGCUAUUUGCACUACGGGGUGCUGGUAGAAAGGAUCGAUCCGACGGUGUGCAGUUGCGCCAGGCUAAAGUCGACGAGGAGUUGGCAAUACUACUAGCAGACAGAAAUACUAGUCUGGACGGCGCGCUAUUGACCGUGCCUAUCCCUAUUCGCCCCCUUAACUUAGGUGGCACAUACAGCGACUUUUGGCGUGGACGUAAUUACAUCUGGCCUAGAACCGAUGAGUCCGCAAUGACAUUCAUCGUCAACCCCUCACCGUUCACGAACAGUAAGAGAAAGCCGUACCGGGGUGUUGUGGCGAAAAAUCAAUUACCCAAUGACUUCACUGCUUUUGGAGCCUCGCGAGACAAUGUUGGGGUUGCGCUCCUACGGCCAGUGACAGGUAAUCUUUUCUUUGUCCAUUCUCUUAAUUGGUCAGCAACUGAGCAAGAGAGAAACCAACUCUCCCAUCGUCCGAACGAUCAGGAUAAUACGGAGCUUUUCAUGCGCACUAGCCGAGCAACGUUUUGCCUCCGGGACGGUCGGAGGAUCCCCCACCAUUGUAAUGUGGCUACACCAGUCCUUAGAACCAUCGGAUCAAACGAUGGUAGAUACGCUCCGCCCUAUUCUAGGAGAAGGGGAAUUCUGCGGGCACGCAAAUAUUCUGUUGGGGCCACCCGCGAACUUUGUGAAAGCGCUUGCUCUCGUGUGAACGAGCGUACGUAUAUUAUAUUUGCUACUUGCCUCCGUAAGCUGGUAUGGCAAGGACUUUCCAAGGGUAUUUUAUCAUCCACUCCGAGGCUGGCCGUUCUUCUCCGCCUGAAUAAACUACUCUAUUUAGAACCCAAUGAGCAUUCUGGAAGGAUGCUACCGCGAGCCCUAGAAGCUUACGUGGAGGCCGUAGCGGCUUCCUUAUCGCAAAAACCCACAAGACCCCCGAAUCAGCGUCCUUCAAAUAUCCCGAUCUGCGAUUCCAUCCACCCGAAGUUUAGUGUAAAGACAGUUAACCACGCGUUGAGGAGCGUGGGUAUGGUACUGUCCAUACUAAAUGGGGGCGGCGCAAGCGCGGCUGCAUGUCAGAACCUAACAACGCCUGUUACGCCCCGGGGUAAGAUUCGUUUCCUUCGGAAAUUCUGGACGUCGGUACGUAGACUCCCCACUAAUACCAAGUGGAGUCUUACUGAUGGGUUAGGAAAUUCAGUAUUAAGGCGGAAUUUAUUGAUUGGGCGUGUAUACAUACACACCAUCGAAAGCCCAUCCCGGUAUAUACUCUUGAACAGAAGGAAGCCUGAGCGGACUCCUACCAUCUGUGGUUAUGUCGCUGUGCUAUGCUGUAGCUAUUAUUUCGUCGUUGUCGUUUGCAAAGAUGUAACUGGUGGUUCACAUGUGGGCCACAUGGGUCAGGGGGAUCACGAUAAAGCGCGUGGCGAAACGCGGUUGCCUUCCAAACUGCGAAGGGGACGAAAUGUAUAUCCGCCUUCUGAUAACAUUGAUAUCCGGCAUCAUUGUGAUGGACUGAACGCGCGAGACGAGAAUGACAGCCUACGAUUCUCUGGCUUAGAUCUUUCUCCCCUGAUCCAAGAUAUCCGAA
AGGGCGCCGGUGGGUUGUACGUAACGCAUACUUUCUAUAGGCGUUGUACAGGGGUAACUGCCUGGGGCGGUUACCGUGCAAGGGAGUCUAUCAGAAGGAAUCGGAGUAAUACAGGCACUUUUGUACUACUGAUACAGUCACGAAUCCGCUUUAAGCGCUCUCUGAGGUUAGAAUGCUCCACCCGAGGAAGACGCAUCAUUGUACGCUACAAACCAACCGAUUUCGAUCAUUUGCUUGCCCUGCUACGAGGUGCAGAGUCUGGCCGGCUUCGUCUAUGUCAAGCCCCGCUCGGUUAUCUGUGGACGGUGUGUAACAAGUCUUUAGUGAGAAUCUUCCCGUUACCGCUCACACGUAAAGCAUCGAGCAGGAAUAAUAGAUUAAGACUGAUUCCUAACGAACAGCUAACCGGCGCCAUCAGCGGCCCAUCCAGGGAGGAUAUUCGCGAUGGAACCCCCGGGUCUUCCGGCGUCGUAGAAUGCGAGCCGGCCAGGUCCCAGAAUGGUUGGUGCAGAUUCCGAUUUUUCCAACGAGUACGCGCUAUCCCUGUAAGUGGCCCAAGGACUCGAAAGAGCGUCCCCCUGACUAAAAGCGGAGUCAGUUUACAGGGACCGGGUGAGAACGCUCGUGGGUACAAACCCUUAUUACGCACUCCAUCCAUGAUGUCUCUGAUCACGGUGGUCGGCUUGAUGCACUGCCACUCGGUUGAGGAUGGAACCUACAGGGGAGUUGUCAAUUGUGUAGGAUCGGCAUGUAAGUCCCUAAUAAGGCAAGUGCCACUAGGGUCCGCUGACGCUCGGGCCCCCAGCUGGGCGGGCCAAGAAAUUUGUAUUACUGGGUUCCCUUUUCCGCUAUCGUCGAGAUACAGGCGAUCGGGGUGUCGUUCAGAAUGGAUCGAACGCGAAAGGUUCAAAUCGCUCAUGCAUGUCCGUCGGCUGGUAUUUACUAGGGCGUAUAGACCUCCAAACACGGACUGCAUGUCUCGAGCAUUGCAAACUGGCGUAAGGUACCUAGCACAUGGUUACAAGAUCCGGCCACCCGUCCACCAUACCACGAACGGCCGUCUAGUUUCUCGACGACGGUAUAGCGGCUGUAUAUCAACCUUGAAGGAUGGAUUAAGACGAGUGACGAGCAUUGACUGUCGAGGUCAUCAAGGGCCGGUGGACCGGAGACGGGAGUGUCAGGCCUGCCUCCAGGCAUUAAGAGUUAGGUUACCGAAGCACGUCAAGCUCGCUUUAUCACCUCCAUCUAUGCGGCCACCUAGAGGUUCUUUCAUCCGACGGUGGCGAACAAGCUACCGGGAGGUCAUCCAGAUGAGUCAUUUACUUCGAACGUUGAUUUGGCACGGCGAAUUAAUAUCGUUAACGCAAAGCGCGGUACGGGCAAUCAUAUACAACGCUGCCCGAGAUACCCACGUCGCCCUCGCGACUACCAUGGGCAGGCGGUGUCGCUUCAAGCACUCACACAUUUUAUACCGCAAGGGUUGCGCCGGAAAUGCCUCGGGACCGAGCAACUCACCCGAACAACAGGGGUUACUCAGACCGCACAGGAAACUGAGCAAGAGAAAUCAUUUUGUUAGUAUAUCACUAGGGGUUCAGAAUCGACGGCGUCGUACCAGUGGCCCCUUAUGCACGCAGCUGGCACAUCGGCUUCUCGCGUUUAGGCGUCCGAGUUUUGGACAGGCAAGCAACCAACUGAGAACACGAAAGGGGCGAUGGCUCCUACCGCCACCAGCCGUUCAAGCGUCCUGGAAGUCCUGGUUUCUUCAAGGAUGUUAUGGGACGCACGUUAGUCGCUUAGUAUGUGGCCCUGCUCUCCGUCUUCCUGGGCUAACAAAACUGCAGAUAUUAAUUCACAUUAGAGUCGGCCGUGUUGCAGUCUUAGGUAGGCUUGCUGCCCCCCCGGUGAUAAACACAUCGGGCUGGGGGGCAACAAUAGCUAUCUUGCUUUGUGCAUGCGAUUCGCACACAGUCCUUACUAGGGGGCUGUCAGGUACACUAUUA
CACCUUACGAUCACAAAUGGACGUCUCUCAUCGGGAUACAAAAACCGGGAUUGUAGUAAGGGCACCUUGGUCGAAAACCCAGAGAGCACUAAAACUCAGAGUCGCACUCAAGUCGUUACGGAGCUUCGCGGCGCGAGCACAAUUGUUGCGGGUACGGGUCGCUCGAUAUCUGCACCUGCCCGAACAAGCAUAAAGUUGCCUGAUGGGCGCCGCGUCGAUUACAAAUACGAUUGGGUUCUUUUGUAUACCCGCGUGUUAUGGUUGGGAGGUUUUAAUUGGCCUGCACCUACACCUUGCGCCCACGCUACCAUUAGCGCCCGACGCAUCCCUCACCAUGAACCACUAUCCCAGGCUCUGAGCACGGCAGAGAAUGAAUCAACAACCUCGCAGUCUUACGGGCGCAAACAAGUUCCACCGCUCCGCACCUCUGUGGCUGCGGUACUGAUUAUAUACUCGUUUUGGGCGUCAUCACAACACACUCGUUCCAACUUCCGGACAUUGACGCGCAUGGUCGUGGCUGUUGCCAGAAUGCCUCCUACGGUCGCCCGGCAACGAGACGGGGAACUGUUGGCCGGUCCAACUUCUCGUCCCCUACGCGGGUUGUGGCAGGUGCUCCACCACACCGGGUCCUACAGCUGGCUUCAAGUGUUGCGCGACUUGAGGCGUUUUUGCACUCUCUCGGUACCUCGCCCUACCGCCGCAGUGACCUCGGCGUACGAUUUGCUACAUCCCGUGACGCCCUCUCUACCGACGCCGUUUGUCAAUAUUCUUCCGGACACCAGCAUACGGUGCUGA" | |
# Input: an RNA sequence
# Output: the protein sequence it translates to
def codon_to_protein(RNA_seq):
    """Translate ``RNA_seq`` codon by codon using ``genetic_code``.

    The sequence is cut into triplets with ``seq_to_codon`` and each triplet
    is looked up in ``genetic_code``; the looked-up values are concatenated
    in order.  A chunk that is not a key of ``genetic_code`` (for example an
    incomplete trailing codon) raises ``KeyError``.
    """
    return "".join(genetic_code[triplet] for triplet in seq_to_codon(RNA_seq))
#Function testing | |
#print codon_to_protein(RNA_seq) | |
# Input: a protein sequence in one-letter abbreviation
# Output: for each residue, the list of RNA codons that encode it
def protein_to_DNA(Prot_seq):
    """Return the candidate codons for every residue of ``Prot_seq``.

    Args:
        Prot_seq: Iterable of one-letter amino-acid codes.

    Returns:
        A list with one entry per residue, in order; each entry is the list
        of ``genetic_code`` keys whose value is that residue.  A residue that
        never appears as a value in ``genetic_code`` gets an empty list.
        Repeated residues share the same list object.
    """
    # Invert the codon table: {'C': ['UGU', 'UGC'], ...}
    # ``items()`` (rather than ``iteritems()``) works on Python 2 and 3 alike.
    aa_to_codons = defaultdict(list)
    for codon, amino_acid in genetic_code.items():
        aa_to_codons[amino_acid].append(codon)
    return [aa_to_codons[residue] for residue in Prot_seq]
# Given: an amino-acid sequence (a protein)
# Return: all possible RNA sequences encoding that protein (list of strings)
def all_Prot_RNA(protein):
    """Enumerate every RNA string that could encode ``protein``.

    Takes the per-residue codon lists from ``protein_to_DNA`` and joins each
    element of their Cartesian product into one string.
    """
    codon_choices = protein_to_DNA(protein)
    possible_RNAs = []
    for combination in itertools.product(*codon_choices):
        possible_RNAs.append(''.join(combination))
    return possible_RNAs
# Given: A DNA string s.
# Return: The reverse complement of s.
def DNArc(DNA):
    """Return the reverse complement of ``DNA``.

    'A' and 'T' are complements of each other, as are 'C' and 'G'.  The
    reverse complement is formed by reversing the string and complementing
    every symbol (e.g. the reverse complement of "GTCA" is "TGAC").

    Symbols other than the upper-case letters A, T, G and C are skipped, so
    the result can be shorter than the input.
    """
    complement = {"A": "T", "T": "A", "G": "C", "C": "G"}
    # Walk the input backwards and join once instead of growing a string
    # symbol by symbol.
    return "".join(complement[base] for base in reversed(DNA) if base in complement)
# Given: A DNA string t.
# Return: The transcribed RNA string of t.
def DNA_to_RNA(DNAstring):
    """Transcribe a coding-strand DNA string into RNA.

    An RNA string is formed from the alphabet 'A', 'C', 'G' and 'U'; the
    transcript is obtained by replacing every 'T' of ``DNAstring`` with 'U'.
    All other characters are copied unchanged.
    """
    return DNAstring.replace("T", "U")
# Given: An RNA string t.
# Return: The corresponding (reverse-transcribed) DNA string of t.
def RNA_to_DNA(RNAstring):
    """Convert an RNA string back to DNA by replacing every 'U' with 'T'.

    All other characters are copied unchanged.
    """
    return RNAstring.replace("U", "T")
# Peptide Encoding Problem: Find substrings of a genome encoding a given amino acid sequence. | |
# Input: A DNA string Text, an amino acid string Peptide, and the array GeneticCode. | |
# Output: All substrings of Text encoding Peptide (if any such substrings exist). | |
DNA ="GCGGTTTTGCGCATTAATACCGGTCCGCGCCCGAGCGAGAAACGGGAGGGCAGTTCAACGGGACTTTTGTGGTTCCTAATTATTTGAGATAAGTAATCTTCCTGCAGTTCGTGCATGGCGTCCCAGAACGGTTTAATCAGTGATTGTAAACTCATGTTTACTACTGTGCAAAACTAGGACCATCTCTATCATTGAAGCACGGTCGAAGCGGCCACATCTGTGGTATCGCGAGCCAGAGCCCTAGGCGCGAAGTTTTGCATTGCACCGCGGTCTCCTAGGAGGTCCAATGGGCTGTGGCAGGCCAAGATGCGTACACGTATCATGCGTCAAAAGTGCCTCATTAGGGTAGAATGGTTAACGGCTTACTTCGCCCAGGAAAGGCTCCGTTAGGCCCGTGCCTATCAGTACCTCGTAAGCGTGTTCCCCGAAGCTTTACGGCAATTTGGCCCTGGGAAGAGACTGTCGTGTCCACGACGATCCCTCGAATTACATTGTTATGCGCCGTAAGGTTCCGAATTTACAACGGCGGTGACGCTGCAATTAGCCAAATCAGTAGGGGGTGGTGAGATGCGGATCCGTTTACCCGCCAGGGCTGGGCCGCTTGGCATCATGTGAAATTGCATTGCGTCACCAACCGATTTTGGATTAATGCTTGATACCAATGGTTAGAGTCGCAGTTTGACACATCGAATGTCACGGGAGACTACCGTTGCTTTATTCGAAGGATGAACCTTGGTGAATACGTGCTCCGAGGCATGACAACTAAAAGGCGATAACGTCATGCACCTGGGCCGACTATTGGCTACGGACTCACAGACTTTTTAAGTCACGGGAGTCCGGGTGTACGGGAGGAATCGTTTATACTCCGTACATCAGGCTTGATTGCCGCGCCAGGCCATCACCAAAGTAAGGCCGCCTAATCTTCCCCGACCTAAATCTGTAGGCGATGCGATGCAATCCGATTGTGGAGTAACGCGTTCGCCCAGGCCTAAATCCGTGGGTGATGCGATGCAAGCCGCGACGCCCTTCAGTCTGCACTAGACACACGTGTGAGCCAAAAGTAGGGCGGTTTAAGAGGCAAAATATGGCCACACCGCCACCGTTCATTTAGGAACGCTGTCACTAATCGCTTCTGGAGTGTTTCTCCATCCGATCTCCGTGTTAGTTGATGAAAGGTAACCTGTTATTCAATATAACGTTGTCAGCTTCAACGGGGAGCAAATTGCTAGCTTGTCATTGTGCCTCAATGTATAAGTGTCCGACCAGACTGGAGCATACCACCGGCGGCTTATACCCACTACCTTATCCTCTTCACTGGTACATGTAACTATGGTAGCACGGGAGTAATCAGCGCTCCATACAGTGTCGCTCCGCTGATATTATAAAGCTGTAGGAGAAACTTAAACGACACAACTTAGATAACACCTCTCGGGCAATGAATGGCGGCCAAATACCAGAAGCAGCTCCACTCGCTGACCCAACCAAATAATATTGTACTCTGGTCGAGCTCTTGTCCTGTTTGAAAACGATGGGCGGGCGCGTTTCTGTAGTATATACATTGGTTGGCGGCTGTATTGCATTGCGTCCCCTACGCTTTTGGGGCTGTTCTAATACTTCCCGTCTGTTCAGTTGAACGGAGGTAAATTTTCCAGATTAGGCAACTGCCCAGGACCGCGCGCTCGCATCAGGAAGGACAGCATCCGGCCTCCGCAACCAAAGAGTGTTGGCGACGCTATGCAACATACGGTGCTTCGACGGGAGGATCACCGTTTGGCACTGCCGAGCCAGAGAATAGTAGCGGCCGGCCAAGAGTACGCATACACTACCGGATGTGTGGTCTGATGTAAAGGTTGGGCTGAAAGAAACATCAGAGGTTTCCTATCGTTCAAGCAAATTCATATCAGAATGTGTCCTGAGGAGAGCGGGCAGGCGACTTATGACAACATGTGATCGCTAGGCTGACTCCTCGAGGGCGATAACGTGTGGCTTCTAAG
CCAGATTGACTCGAGAGAGGGCAAGCACTAAACGAAGTGGGTGCAGATGTGACTGCATGGAACTAAAATGGAACTAATAGGATTAGTGGTGCTGAGTTTTCCCAGGTGTACCTATTGAGGCAGGATTCCCTGTGTCTGATCCGTCTTGCCATGGTCAGGCCGAGTCCATTGTGTCCTAAACTCTTGTTTACTAGTTCAGGCAATGCTATCTGCCGAATGGTGAGGTCCGCCGCCTTGCCATGTGGGTGTACGCGATTACCCGCTCGGTTACCGCAGTCGACATGCAAGTCGCGTCTTGCGATACAATATCGCGCCGATGAAAAAGACCCAATATTTTCCCCGCACAACTTCCATATAGGATAGAGCAGCTAGTTGATTTAGAGTATGAACCGAAATCTGTCGGAGACGCAATGCAGCAAATGCCAATCCTCTGAGAAATTCACAGTACCCCGGTTCCCGAAGAGTGTAGGCGATGCAATGCAGAAAGAAAAGTTCCGACCACGGTCGGTTAAATGCATATTCGGCGGGTCGACATTTGAGCCCTGTTACGATTTCAAGCTCTGGACGCGGCCGTTAGGCAGACGGAGGTACGAAGCCAGCCTTGCACCTGCTCCATCCTGTTTGGTGGCAGGGTGCTGAAGTTAACATATAAGGGGGTCAATCGTGCGTCTACAGGTCGACCGAGTCAGCGGATCTTCAGGCCACTACTTGATGGGTGAGCTCTTCATTGTAGGCGCTTCTACTTGCAAGGGTGCGTATCGGTATACGGCTGCATCGCGTCACCAACGCTTTTGGGTTGGATACCCGCTGGCGCGCATGAAAGGATCCCCTTCATGTCCAATGTCAAAGGGACATTTCCACATATAATCTTGATTGTGACACGCATCCAAAAATAAAAGAGACGTCACTTCCGAAACCCAATACCTGACAAAATCTATACTTCAAGTTGACAATCTGACTAGAAATTATTGGGAAGAAAACCCGTTATTATGGAGGGTCCGTGGCGAATGTATGGAAATTTATTAATCAGCAATACCAGTGGCTTGAGCGACTGAACATCTTTGACCCGTCACCGATTCGCCGGAACGCCGATATTCGAAGGGGTTAGGTTGGATGCCTTCCCTCCTACTGACCCCAGCCGGCAGCGCAATGCAGAGTGTGCCCTGAAGTAACTAGTATTGAGGGATTGTATACCCTATCGCGACAAGAAGACAGAGAGCGAATATAAGCCAAGGCTTGAACACCCCTGGCCGCTCAGATGCTCGATCGTTTTTGAAACGTTGCATCGCGTCTCCAACACTCTTAGGTAACGCATGTAAGCGATGAGATCGTCGCACCCATCTATGATTCGCTTCACGCGCGCATAAGGCTCGGTAGCTTTTCCGCATATCAGGGTTCGGTCTTGTCACTGTTTATCAAGGTCCATTCTATCGTCGATTTACAAAGTCGGTACCCGGCTTGTCGCGGATAACACGGCACCTAGGTGGACGACCCGGCTCACATGGTCACGCCTAGAGGCGCAAGATCGAGTGTAACTGTGTTTGGTTGTGGGTATAAGACACATGTGCCCGAGCAGACGACTTTCTTTCCGAAGTCGGTTGGAGACGCCATGCAGTGTTAGTAGAAACAGGGATACTTAGGGTCGCCACCCAGCGTCCTGAGAGTCCACACCTGTCCATGGATAATTAGTACTAACTTTTATACGTTGACTAATCGAAGCTTGACAAAGTTTCTGAATCGCCTTGTGACCGCGTTTGCGGGAGTTACTACGGATGAACTAGGATTGCGTAGACGTGTTCCTGTGTACCCTGCATAGCGTCGCCAACTGACTTGGGAACAGTTAAGTTCCCCCATCTGGTTAGAAATTTTTGGATGGAAATAATAGCTTACAGAAGAACTCCCATAGCACACTATCTTAATGACCAGGCCAAGTCCAAATCGATAGACATAACGTCAGACAGTCATGTTGGGGTAATATTTGGCACGTTGATTTTATCAAG
TTACAGGTTCCCTCTGGCTCTCCTGTAGATACTTAACACAGTCACCCTTACATAGATTCTTAAGATGCTTTAGTCAAAACTAAAGTGTATGTGAATTGGCGTCAAGCGTGTATTGGGAGAGGTTACCTTCTTGCCCTATGGGATGTTTGTTGCATACTGAACTTATTGTGATGCATATAAGTTCTAGGAGGCCGTATTGTCTTTGCCAGTAAATCAAGCTGCGTTTAATTGTACCGAAATCCGTGGGCGACGCAATGCAGTCAAATGTGAAACTCCCCCTTCCACCGGCAGCACTTAAGTTTCCGTCGGACTAGTGTACAAGACCGTATCGAGGATAATCTCTGGTAATCGGAAATCTCTCAAAGCAGGACTGACCGAAACAGCGAGTTGGCCTATTGAGATGCATAGTGTAACTACTGCAGACCGCTGGCCACTACAAACGACTATACCAAATGTATATAGAAAGGTAGCTCAGTGTGGCCATTAAGTGTTTATCGCTTTCATGAGCCCTAGTCACGACAAACATAACTCATAGCATGATAGCTTGAATATCCGAAAGGGTACTCTTTGCAACGGTGATTATTTAGCCGCGACTCGACGAGTTTTAACATAATTCGTATTGAGACCAGGGGGACTGTAGGCCGTCACTTTATACGAGCGTCGGTGCGCCTCCGCCTGACGTCACACTCGTGTTCCCCGATCCTTCGAGAATAATGACAGACGATATACTTAATTCAGCTAGATCCTATACTCACTATGGGACACATCTGCATTGCGTCACCAACGGACTTAGGTATGCGCAGGGTACTACTAAAGTAAGCGGGTGTCAATGCCAGGAGAATGACATCAAAGGGGAGGACAGTCAAGATGCGAAACCTGCAAGGCTTTGTCTGACTGCTGAGACCTGATAAAGGTCGGCAGTCTTGAGGCTGGGAAAGCTGTTGGGCCTCCTCGCAAGGGAAGAATGGTACCGCCTTTCCATGAAGAGCGCGCTCATACCGAATCTAAGTTGGGTGAGTGACTGTAAAACGTATGACCGAAACCTGTGAGGCCGGGGGGTTGCATAGCATCTCCAACCGATTTGGGCAACGGCAACACGGTCTCCAGGGTGGTATATAGTCCCTGTCTTTTCCCGAACTTTCATGTCGTGTAAGCGACTCTTTCGGTTATTGAACACGATGGGTAGACGCGTCGCGCCACATGAACCACACCTTGGAATGCTCGGAGTCCTCATACGCACGCCGTGGATTCTCTAGCACCAAGAGATGACATGGACGTTCCGAGGTTCATTTCTGAAAGTCGTTTTGTCACCACAGCAGCACCCTGGGCTGCATGGGAGGAGAGTCATTAACGCAGAAGTTTGCTTGTGCTTCCTATTAGGCACCATAGCGATGTACAACCCTCGGCAGCTGAGATGTTCGCCCTTTGGTGTACCTGCTTACGTAGTCCCAATCAAAAAAAATATAGCTGCGCGTTATCATTGCTGAATACGGAATGAGCGGGGCATCGGTGTAGTTGTGAAGTACCGTTACCCAGCCTCCCCGAAAGTGTTAGCGGGCAAACGCTATGTCAAATGTTCATCAAAGTCACTCAATGCCGAACTTGACTGATAGGCTAGAGTGATGGTTACCTATCTAAGTCCTTATGTATGAGGCCTGATGTTACTAAGGTGCAGTAGTAAGGGGGACAGGAGACTAGCCCAATAGACTGGTCGACCGCCCCTGGGCCGTCCCTACTCAGTCACCTGGCCAACATGTCTCCTAGGCCAATATGTAGTATTAGTTTGGGTTTGTAATGTCAGCTTGTACTCGTTATCGTCACACGCGCCCACAATTCTATCCGTCTTTAGGCGACCAAGGAGATGTATCTTGTCCCGGGGAATCAAACCAGATTACTCCATGTAGTGCTGTGCGCTTAAATGGTTAAGCAATGATTAGTTACCCATACAGCGCTTTGGAAGCCGGGCGAACATTTTACCTCTATTGTGACGAAAGTTGGTG
GGAGTATCTGGTCCTCGTTGTAGGAACCACTAACGCATACTCCGTATCCCTAACGAGTGAGAGTGGAGAGACCGGGACTACATCATGGTCTAACCAACACAGGGATGGCAAAAGGCAGCCACTGGTTAAGGACGCGCAAAGAGGCTGTATACGCAAACAGGGGCTATAACGATACCACGAATGGCCGCAGTCCAGATCCCCAATTAATGAGCGGGTCCCTGTAAGGGGCTATGCCAGGCTTAGGCGTCTCCGCTGCAAATGGATGGCACACATACCTGTGGGGAAGTGGAACTGAGGGCCAAGACGAGAACCCGTACAGGGGACATAAGTACCCCGCTGTTCCAGGCTCTGACCGGCGTGATGTAGGAGGAGCCGAACGACAAGTTGTACAAATCTCCAACTTTTGTAAGCGTCCGAATCACTTTGGTCCAGATTAGGTAATCAGGCATGCTCTGAGAGGTCGTGGTAAGTGTAGACTATTCAAGGACGGTAACTCATGGGAGACGGGATAATAAGCGATTCACGGGCATCGGCCCATGAGTGGAACGATCATACAGCTGCAATTGGTGTGCTGTAGCGCAACAACCGAACACCCGCGCCAAGTATTGCCCGGTCACTCTCTCCGTACTGGGCCTACTTTTAGACCACACTGAACCGATACCTTTGCATTGCGTCCCCGACGCTTTTCGGACCCCCCTGGGCGGCGTAAGATACTCCCAGACATTAGCCAAGTATCCATGGGGCTGGATTTAGCCGTCCAGTTTCACCTCAGATATCAACCAGCAGGGCCCTTGGGTTCCCCCTAACCTGGGTTGTGCAACTTGAACTCGGGAAGGGTCGAAGTCCGAACCCCCCCAGGTCGAGCCCACACGACGTCCTTGTTATCCCAGTGGGTTCATCAACGCGATTCACTGGTCCTGGGTGGCGATTTACCAACATCTCTCAATACGCACTAGGCGCGCCTGGCCACCTGAGGACGCTTCCTTTCCGCCTCATTGCTGCCACGAGCCCTACTCTGCACTTAATGTCTGGTATTTCTCCGCGGTTCCCTGCCGACTCCCGTTGGGGGACTACTTTCAGCAGTTCTCAACCAAGAATTTACGGGTTGAGCGACCGTAACACGCTGCAAGGGGTGCGGACACATTATGTCTATCTCCTATAGTAGAGGAACTATTAAGACAAGGGATTGCTACCTTTTGTGGCCCCGTGATATGCAAGAGATTTTGTTCGCATCTTGCTCCGCTTAGAATAACAGGGGCGGTAGGTGAAACCTTAAGAAGGCGGTGTCATGCCTAGCCGAAAAGCGTTGGTGACGCTATGCAAGGGTTGGCCCTCGTCAAGGGTCCGGGGTGCGGTGCATGACTGCTTAGCCGTATCTCCCAGTTGAAAGCATGGCGGTCAATTGTGTAAGGTACCTTGCCTAAGAGCGTGGGAGATGCAATGCAGCGACGGACTTGGGTGTTTAGGATCGCATATGACAAACAATATTCCTGTGAAAAGGCCCAGCCTCTGGTACGACGTGGCCTAACTCTTGCAACCGGCTTGTGATGATTTATAATTTAGGCAACAACCTTCGCTGGTTTTCAGAGGATTTGCAAGATAGTGGTGGATCGCTGCCCCCATTCAACTTGGAGCGCTGATCATGAAACCAATAATTCGTCCTCTCGGAAATTGTAGTCTTTTGCTCCCGTTTCTCGTAATGCGCCGACCGGCATTAGTATGGCGGGGAGTTACCCCCGCGGGGAGTCCGTTAGGCATTACCTGCGTGACAATGGTCGGAATATTGCTATGGTGACCCACAACGGACGGAGTTTCACAATTGAGGATGGGATTCCGTTTTAGTCTAGGTAGGAACGCAATACAGTTAGATTGCGGTACACCCAAGTCGGTTGGCGATGCGATGCAGGCGTTTGTGATGGTTCCGACAAATTGGCTGTGTCAAGATTATATCCGTGGGTGATCGGTTAGCCTGTAGGATAGAGGCTCCTAGGTGTCCACGT
GACAACGCTAACCAGATACAGTCACAAGCGTAGGCGGCTGTGTTCGCATTCGTGATGCTATGTATCCCTGTGTGGCCTTTTGTGTTGAACTTCCAAACGTTGGGTCGGTTCAAACTGGCGCACACACATGCCGATGGCCAGCCGCACTAAGGCACCGAAAAGTGTCGGCGACGCCATGCAAGCACATGTGGATCGTTTCCGTTCTTATTGCTAGTGCTCACTCCGTGAGCGCCAAAGTAGAACAGTTTTCTTACTTACGGCCCGCCGAGATTGTGAATAAAAGGAAAAGTGACTCAGTATTTGGCGATCGGCGACCCACACATGCAACTGTCTAGCCTACGATCGACTATTTGGCAATGGAGTCCACGAGTCGAGACCAGGGTGCGGGTGTAAGTATCTCGTATGATACTGTA" | |
protein = "VKLFPWFNQY" | |
def prot_in_genome(genome, protein, genetic_code):
    """Find every substring of ``genome`` that encodes ``protein``.

    A DNA substring encodes the peptide when either the substring itself or
    its reverse complement, transcribed to RNA (T -> U) and translated with
    ``genetic_code``, equals ``protein``.

    Args:
        genome: DNA text to scan (upper-case A/C/G/T).
        protein: Peptide in one-letter amino-acid codes.
        genetic_code: Mapping from RNA codon to one-letter amino acid.

    Returns:
        A list of matching substrings of ``genome``, one entry per matching
        start position (overlapping matches included), in genome order.
        Empty when ``protein`` is empty or nothing matches.
    """
    window = 3 * len(protein)
    if window == 0:
        return []

    complement = {"A": "T", "T": "A", "G": "C", "C": "G"}

    def reverse_complement(dna):
        # Unknown symbols are kept as-is; they simply fail the codon lookup.
        return "".join(complement.get(base, base) for base in reversed(dna))

    def encodes_protein(dna):
        # Translate codon by codon and stop at the first mismatch.
        for k, amino_acid in enumerate(protein):
            codon = dna[3 * k:3 * k + 3].replace("T", "U")
            if genetic_code.get(codon) != amino_acid:
                return False
        return True

    # Slide a peptide-sized window over the genome instead of enumerating
    # every possible coding sequence (which grows exponentially with the
    # peptide length).
    encoding_DNAs = []
    for start in range(len(genome) - window + 1):
        fragment = genome[start:start + window]
        if encodes_protein(fragment) or encodes_protein(reverse_complement(fragment)):
            encoding_DNAs.append(fragment)
    return encoding_DNAs
# with open("Bacillus brevis genome.txt", "r") as data: | |
# genome = data.read() | |
# genome = genome.replace("\n","") | |
# if "\n" in genome: | |
# print "yessss" | |
# | |
# with open("motifs.txt", "w") as result: | |
# result.write(genome) | |
# | |
# motifs = prot_in_genome(genome, protein, genetic_code) | |
# print len(motifs) | |
# with open("motifs.txt", "w") as result: | |
# for e in prot_in_genome(genome, protein, genetic_code): | |
# result.writelines(e +"\n") | |
# How many subpeptides does a cyclic peptide of length n have? : n*(n-1) | |
# Maps each one-letter amino-acid code to its (integer) mass.
aa_masses = {
    "A": 71, "C": 103, "E": 129, "D": 115, "G": 57,
    "F": 147, "I": 113, "H": 137, "K": 128, "M": 131,
    "L": 113, "N": 114, "Q": 128, "P": 97, "S": 87,
    "R": 156, "T": 101, "W": 186, "V": 99, "Y": 163,
}
# Input: An amino acid string Peptide.
# Output: The linear spectrum of Peptide.
def linearSpectrum(peptide):
    """Return the sorted masses of all contiguous fragments of ``peptide``.

    The result starts with 0 (the empty fragment) and contains one mass per
    (start, end) pair, so equal masses appear more than once.  Residue
    masses are read from ``aa_masses``; an unknown residue raises
    ``KeyError``.
    """
    # prefixMass[k] is the mass of the first k residues,
    # e.g. Peptide = NQEL -> PrefixMass = (0, 114, 242, 371, 484).
    prefixMass = [0]
    for residue in peptide:
        prefixMass.append(prefixMass[-1] + aa_masses[residue])

    # The fragment spanning residues i..j-1 weighs prefixMass[j] - prefixMass[i].
    # TODO: maybe use a dict with key: the fragment, value: its mass.
    linearspectrum = [0]
    for i in range(len(prefixMass) - 1):
        for j in range(i + 1, len(prefixMass)):
            linearspectrum.append(prefixMass[j] - prefixMass[i])
    linearspectrum.sort()
    return linearspectrum
# Generating Theoretical Spectrum Problem: Generate the theoretical spectrum of a cyclic peptide.
# Input: An amino acid string Peptide.
# Output: Cyclospectrum(Peptide).
def cyclicSpectrum(peptide):
    """Return the sorted theoretical spectrum of the cyclic ``peptide``.

    The result contains 0, the mass of every linear fragment, and the mass
    of every fragment that wraps around the end of the peptide.  Residue
    masses are read from ``aa_masses``; an unknown residue raises
    ``KeyError``.
    """
    # prefixMass[k] is the mass of the first k residues,
    # e.g. Peptide = NQEL -> PrefixMass = (0, 114, 242, 371, 484).
    prefixMass = [0]
    for residue in peptide:
        prefixMass.append(prefixMass[-1] + aa_masses[residue])
    # Total mass of the peptide
    peptide_mass = prefixMass[-1]

    # TODO: maybe use a dict with key: the fragment, value: its mass.
    spectrum = [0]
    for i in range(len(prefixMass) - 1):
        for j in range(i + 1, len(prefixMass)):
            fragment_mass = prefixMass[j] - prefixMass[i]
            spectrum.append(fragment_mass)
            if i > 0 and j < len(peptide):
                # A strictly interior fragment leaves a wrap-around fragment:
                # Mass(LN) = Mass(NQEL) - Mass(QE) = 484 - 257 = 227.
                spectrum.append(peptide_mass - fragment_mass)
    spectrum.sort()
    return spectrum
# Print the cyclic spectrum of a sample peptide as space-separated masses.
# A single print call with one argument behaves the same on Python 2 and 3.
print(" ".join(str(mass) for mass in cyclicSpectrum("NTKDKHAHILYNTRC")))
# The brute-force cyclopeptide sequencing algorithm BFCyclopeptideSequencing generates
# all possible peptides whose mass is equal to ParentMass(Spectrum) and then checks
# which of these peptides have a theoretical spectrum matching Spectrum.
#
# BFCyclopeptideSequencing(Spectrum)
#     for every peptide with Mass(Peptide) equal to ParentMass(Spectrum)
#         if Spectrum = Cyclospectrum(Peptide)
#             output Peptide
#
# Implementation sketch:
# max_number_aa = total_mass / lightest_aa_mass
# min_number_aa = total_mass / heaviest_aa_mass
# build the list of all combinations of min_number_aa amino acids, in the same way as
#     possible_RNAs = [ ''.join(x) for x in list(itertools.product(*protein_to_DNA(protein))) ]
# keep track of those that do match the mass (are less than the mass),
# then continue making combinations with them until max_number_aa is reached:
# for number between min_number_aa and max_number_aa:
#     range(max_number_aa, start, stop, step)
#     is mass(combination of `number` amino acids) equal to mass_peptide?
#     possible_RNAs = [ ''.join(x) for x in list(itertools.product(*protein_to_DNA(protein))) ]
#     same spectrum?
#     keep track of those that do not have the same spectrum
#     output
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment