Skip to content

Instantly share code, notes, and snippets.

@Swarchal
Created February 16, 2018 14:23
Show Gist options
  • Select an option

  • Save Swarchal/49fb567dfd55d04abcf1d6118afaf7aa to your computer and use it in GitHub Desktop.

Select an option

Save Swarchal/49fb567dfd55d04abcf1d6118afaf7aa to your computer and use it in GitHub Desktop.
codon_table = {
'AUA':'I', 'AUC':'I', 'AUU':'I', 'AUG':'M',
'ACA':'U', 'ACC':'U', 'ACG':'U', 'ACU':'U',
'AAC':'N', 'AAU':'N', 'AAA':'K', 'AAG':'K',
'AGC':'S', 'AGU':'S', 'AGA':'R', 'AGG':'R',
'CUA':'L', 'CUC':'L', 'CUG':'L', 'CUU':'L',
'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCU':'P',
'CAC':'H', 'CAU':'H', 'CAA':'Q', 'CAG':'Q',
'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGU':'R',
'GUA':'V', 'GUC':'V', 'GUG':'V', 'GUU':'V',
'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCU':'A',
'GAC':'D', 'GAU':'D', 'GAA':'E', 'GAG':'E',
'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGU':'G',
'UCA':'S', 'UCC':'S', 'UCG':'S', 'UCU':'S',
'UUC':'F', 'UUU':'F', 'UUA':'L', 'UUG':'L',
'UAC':'Y', 'UAU':'Y', 'UAA':'.', 'UAG':'.',
'UGC':'C', 'UGU':'C', 'UGA':'.', 'UGG':'W'
}
amino_table = {
"I": ["AUU", "AUC", "AUA"],
"L": ["CUU", "CUC", "CUA", "CUG", "UUA", "UUG"],
"V": ["GUU", "GUC", "GUA", "GUG"],
"F": ["UUU", "UUC"],
"M": ["AUG"],
"C": ["UGU", "UGC"],
"A": ["GCU", "GCC," "GCA", "GCG"],
"G": ["GGU", "GGC", "GGA", "GGG"],
"P": ["CCU", "CCC", "CCA", "CCG"],
"T": ["ACU", "ACC", "ACA", "ACG"],
"S": ["UCU", "UCC", "UCA", "UCG", "AGU", "AGC"],
"Y": ["UAU", "UAC"],
"W": ["UGG"],
"Q": ["CAA", "CAG"],
"N": ["AAU", "AAC"],
"H": ["CAU", "CAC"],
"E": ["GAA", "GAG"],
"D": ["GAU", "GAC"],
"K": ["AAA", "AAG"],
"R": ["CGU", "CGC", "CGA", "CGG", "AGA", "AGG"],
".": ["UAA", "UAG", "UGA"]
}
import sys
from collections import OrderedDict
import multiprocessing
import nucleotides
def parse_input(file_path):
"""parse input into a dictionary {sequence: [(start, stop)]}"""
# NOTE: changed input to 0-indexed
lines = [l.strip() for l in open(file_path).readlines()]
n_entires = int(lines.pop(0))
output = OrderedDict()
count = -1
for line in lines:
if line.startswith("AUG"):
name = line
count += 1
output[str(count)] = {name: []}
else:
if " " in line:
line = tuple(map(lambda x: int(x)-1, line.split(" ")))
output[str(count)][name].append(line)
return output
def hamming_dist(s, t):
return sum(i != j for i, j in zip(s, t))
def not_possible(amino_acids):
"""possible to change the codon without altering the amino acid"""
return all(aa in ["M", "W"] for aa in amino_acids)
def split_into_codons(sequence):
for i in range(0, len(sequence), 3):
yield sequence[0 + i: 3 + i]
def find_position(index):
"""return codon position, position in codon
e.g.
|AUG|GAU|UAC|AAA|
| 0 | 1 | 2 | 3 | => codon position
|012|012|012|012| => position in codon
"""
return index // 3
def pos_nuc(index):
return index % 3
def interval_to_codon_and_range(sequence, interval):
"""
docstring
"""
cache = {}
codon_sequence = list(split_into_codons(sequence))
start, stop = interval
codon_idxs = []
nuc_idxs = []
for i in range(start, stop+1):
codon = codon_sequence[find_position(i)]
if codon in cache:
pass
else:
cache[codon] = []
cache[codon].append(pos_nuc(i))
return cache
def translate(codon):
return nucleotides.codon_table[codon]
def dist_closest_match(codon, indices):
aa = nucleotides.codon_table[codon]
possible_codons = nucleotides.amino_table[aa]
sli = slice(indices[0], indices[-1]+1)
if all([codon[sli] in pos[sli] for pos in possible_codons]):
return -1
else:
return min_edit_dist(codon[sli], [i[sli] for i in possible_codons])
def min_edit_dist(string, possibles):
# remove duplicates
possibles = set(i for i in possibles if i != string)
if len(possibles) > 0:
# find minimum change to convert string to a possible
min_dist = min(hamming_dist(i, string) for i in possibles)
return min_dist
else:
return -1
def n_changes(sequence, intervals):
"""
Parameters:
-----------
sequence: string
amino acid sequence
intervals: list of tuples
list of interval start and stop indices
Returns:
--------
int: minimum number of changes
"""
n_changes = 0
codon_sequence = list(split_into_codons(sequence))
for interval in intervals:
seq_dict = interval_to_codon_and_range(sequence, interval)
for seq, range_ in seq_dict.items():
dist = dist_closest_match(seq, range_)
if dist != -1:
n_changes += dist
if n_changes == 0:
n_changes = -1
return n_changes
pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
sequences = parse_input(sys.argv[1])
run_seq = []
for subdict in sequences.values():
for sequence, intervals in subdict.items():
run_seq.append([sequence, intervals])
ans = pool.starmap(n_changes, run_seq)
for i in ans:
print(i)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment