Created
February 16, 2018 14:23
-
-
Save Swarchal/49fb567dfd55d04abcf1d6118afaf7aa to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| codon_table = { | |
| 'AUA':'I', 'AUC':'I', 'AUU':'I', 'AUG':'M', | |
| 'ACA':'U', 'ACC':'U', 'ACG':'U', 'ACU':'U', | |
| 'AAC':'N', 'AAU':'N', 'AAA':'K', 'AAG':'K', | |
| 'AGC':'S', 'AGU':'S', 'AGA':'R', 'AGG':'R', | |
| 'CUA':'L', 'CUC':'L', 'CUG':'L', 'CUU':'L', | |
| 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCU':'P', | |
| 'CAC':'H', 'CAU':'H', 'CAA':'Q', 'CAG':'Q', | |
| 'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGU':'R', | |
| 'GUA':'V', 'GUC':'V', 'GUG':'V', 'GUU':'V', | |
| 'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCU':'A', | |
| 'GAC':'D', 'GAU':'D', 'GAA':'E', 'GAG':'E', | |
| 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGU':'G', | |
| 'UCA':'S', 'UCC':'S', 'UCG':'S', 'UCU':'S', | |
| 'UUC':'F', 'UUU':'F', 'UUA':'L', 'UUG':'L', | |
| 'UAC':'Y', 'UAU':'Y', 'UAA':'.', 'UAG':'.', | |
| 'UGC':'C', 'UGU':'C', 'UGA':'.', 'UGG':'W' | |
| } | |
| amino_table = { | |
| "I": ["AUU", "AUC", "AUA"], | |
| "L": ["CUU", "CUC", "CUA", "CUG", "UUA", "UUG"], | |
| "V": ["GUU", "GUC", "GUA", "GUG"], | |
| "F": ["UUU", "UUC"], | |
| "M": ["AUG"], | |
| "C": ["UGU", "UGC"], | |
| "A": ["GCU", "GCC," "GCA", "GCG"], | |
| "G": ["GGU", "GGC", "GGA", "GGG"], | |
| "P": ["CCU", "CCC", "CCA", "CCG"], | |
| "T": ["ACU", "ACC", "ACA", "ACG"], | |
| "S": ["UCU", "UCC", "UCA", "UCG", "AGU", "AGC"], | |
| "Y": ["UAU", "UAC"], | |
| "W": ["UGG"], | |
| "Q": ["CAA", "CAG"], | |
| "N": ["AAU", "AAC"], | |
| "H": ["CAU", "CAC"], | |
| "E": ["GAA", "GAG"], | |
| "D": ["GAU", "GAC"], | |
| "K": ["AAA", "AAG"], | |
| "R": ["CGU", "CGC", "CGA", "CGG", "AGA", "AGG"], | |
| ".": ["UAA", "UAG", "UGA"] | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sys | |
| from collections import OrderedDict | |
| import multiprocessing | |
| import nucleotides | |
| def parse_input(file_path): | |
| """parse input into a dictionary {sequence: [(start, stop)]}""" | |
| # NOTE: changed input to 0-indexed | |
| lines = [l.strip() for l in open(file_path).readlines()] | |
| n_entires = int(lines.pop(0)) | |
| output = OrderedDict() | |
| count = -1 | |
| for line in lines: | |
| if line.startswith("AUG"): | |
| name = line | |
| count += 1 | |
| output[str(count)] = {name: []} | |
| else: | |
| if " " in line: | |
| line = tuple(map(lambda x: int(x)-1, line.split(" "))) | |
| output[str(count)][name].append(line) | |
| return output | |
| def hamming_dist(s, t): | |
| return sum(i != j for i, j in zip(s, t)) | |
| def not_possible(amino_acids): | |
| """possible to change the codon without altering the amino acid""" | |
| return all(aa in ["M", "W"] for aa in amino_acids) | |
| def split_into_codons(sequence): | |
| for i in range(0, len(sequence), 3): | |
| yield sequence[0 + i: 3 + i] | |
| def find_position(index): | |
| """return codon position, position in codon | |
| e.g. | |
| |AUG|GAU|UAC|AAA| | |
| | 0 | 1 | 2 | 3 | => codon position | |
| |012|012|012|012| => position in codon | |
| """ | |
| return index // 3 | |
| def pos_nuc(index): | |
| return index % 3 | |
| def interval_to_codon_and_range(sequence, interval): | |
| """ | |
| docstring | |
| """ | |
| cache = {} | |
| codon_sequence = list(split_into_codons(sequence)) | |
| start, stop = interval | |
| codon_idxs = [] | |
| nuc_idxs = [] | |
| for i in range(start, stop+1): | |
| codon = codon_sequence[find_position(i)] | |
| if codon in cache: | |
| pass | |
| else: | |
| cache[codon] = [] | |
| cache[codon].append(pos_nuc(i)) | |
| return cache | |
| def translate(codon): | |
| return nucleotides.codon_table[codon] | |
| def dist_closest_match(codon, indices): | |
| aa = nucleotides.codon_table[codon] | |
| possible_codons = nucleotides.amino_table[aa] | |
| sli = slice(indices[0], indices[-1]+1) | |
| if all([codon[sli] in pos[sli] for pos in possible_codons]): | |
| return -1 | |
| else: | |
| return min_edit_dist(codon[sli], [i[sli] for i in possible_codons]) | |
| def min_edit_dist(string, possibles): | |
| # remove duplicates | |
| possibles = set(i for i in possibles if i != string) | |
| if len(possibles) > 0: | |
| # find minimum change to convert string to a possible | |
| min_dist = min(hamming_dist(i, string) for i in possibles) | |
| return min_dist | |
| else: | |
| return -1 | |
| def n_changes(sequence, intervals): | |
| """ | |
| Parameters: | |
| ----------- | |
| sequence: string | |
| amino acid sequence | |
| intervals: list of tuples | |
| list of interval start and stop indices | |
| Returns: | |
| -------- | |
| int: minimum number of changes | |
| """ | |
| n_changes = 0 | |
| codon_sequence = list(split_into_codons(sequence)) | |
| for interval in intervals: | |
| seq_dict = interval_to_codon_and_range(sequence, interval) | |
| for seq, range_ in seq_dict.items(): | |
| dist = dist_closest_match(seq, range_) | |
| if dist != -1: | |
| n_changes += dist | |
| if n_changes == 0: | |
| n_changes = -1 | |
| return n_changes | |
| pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()) | |
| sequences = parse_input(sys.argv[1]) | |
| run_seq = [] | |
| for subdict in sequences.values(): | |
| for sequence, intervals in subdict.items(): | |
| run_seq.append([sequence, intervals]) | |
| ans = pool.starmap(n_changes, run_seq) | |
| for i in ans: | |
| print(i) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment