Swarchal · February 16, 2018 14:23
diff --git a/nucleotides.py b/nucleotides.py
 codon_table = {
    'AUA':'I', 'AUC':'I', 'AUU':'I', 'AUG':'M',
    'ACA':'U', 'ACC':'U', 'ACG':'U', 'ACU':'U',
    'AAC':'N', 'AAU':'N', 'AAA':'K', 'AAG':'K',
    'AGC':'S', 'AGU':'S', 'AGA':'R', 'AGG':'R',
    'CUA':'L', 'CUC':'L', 'CUG':'L', 'CUU':'L',
    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCU':'P',
    'CAC':'H', 'CAU':'H', 'CAA':'Q', 'CAG':'Q',
    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGU':'R',
    'GUA':'V', 'GUC':'V', 'GUG':'V', 'GUU':'V',
    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCU':'A',
    'GAC':'D', 'GAU':'D', 'GAA':'E', 'GAG':'E',
    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGU':'G',
    'UCA':'S', 'UCC':'S', 'UCG':'S', 'UCU':'S',
    'UUC':'F', 'UUU':'F', 'UUA':'L', 'UUG':'L',
    'UAC':'Y', 'UAU':'Y', 'UAA':'.', 'UAG':'.',
    'UGC':'C', 'UGU':'C', 'UGA':'.', 'UGG':'W'
 }


 amino_table = {
    "I": ["AUU", "AUC", "AUA"],
    "L": ["CUU", "CUC", "CUA", "CUG", "UUA", "UUG"],
    "V": ["GUU", "GUC", "GUA", "GUG"],
    "F": ["UUU", "UUC"],
    "M": ["AUG"],
    "C": ["UGU", "UGC"],
    "A": ["GCU", "GCC," "GCA", "GCG"],
    "G": ["GGU", "GGC", "GGA", "GGG"],
    "P": ["CCU", "CCC", "CCA", "CCG"],
    "T": ["ACU", "ACC", "ACA", "ACG"],
    "S": ["UCU", "UCC", "UCA", "UCG", "AGU", "AGC"],
    "Y": ["UAU", "UAC"],
    "W": ["UGG"],
    "Q": ["CAA", "CAG"],
    "N": ["AAU", "AAC"],
    "H": ["CAU", "CAC"],
    "E": ["GAA", "GAG"],
    "D": ["GAU", "GAC"],
    "K": ["AAA", "AAG"],
    "R": ["CGU", "CGC", "CGA", "CGG", "AGA", "AGG"],
    ".": ["UAA", "UAG", "UGA"]
 }
diff --git a/p2.py b/p2.py
 import sys
 from collections import OrderedDict
 import multiprocessing
 import nucleotides


 def parse_input(file_path):
    """parse input into a dictionary {sequence: [(start, stop)]}"""
    # NOTE: changed input to 0-indexed
    lines = [l.strip() for l in open(file_path).readlines()]
    n_entires = int(lines.pop(0))
    output = OrderedDict()
    count = -1
    for line in lines:
        if line.startswith("AUG"):
            name = line
            count += 1
            output[str(count)] = {name: []}
        else:
            if " " in line:
                line = tuple(map(lambda x: int(x)-1, line.split(" ")))
                output[str(count)][name].append(line)
    return output


 def hamming_dist(s, t):
    return sum(i != j for i, j in zip(s, t))


 def not_possible(amino_acids):
    """possible to change the codon without altering the amino acid"""
    return all(aa in ["M", "W"] for aa in amino_acids)


 def split_into_codons(sequence):
    for i in range(0, len(sequence), 3):
        yield sequence[0 + i: 3 + i]


 def find_position(index):
    """return codon position, position in codon
    e.g.
         |AUG|GAU|UAC|AAA|
         | 0 | 1 | 2 | 3 | => codon position
         |012|012|012|012| => position in codon
    """
    return index // 3


 def pos_nuc(index):
    return index % 3


 def interval_to_codon_and_range(sequence, interval):
    """
    docstring
    """
    cache = {}
    codon_sequence = list(split_into_codons(sequence))
    start, stop = interval
    codon_idxs = []
    nuc_idxs = []
    for i in range(start, stop+1):
        codon = codon_sequence[find_position(i)]
        if codon in cache:
            pass
        else:
            cache[codon] = []
        cache[codon].append(pos_nuc(i))
    return cache



 def translate(codon):
    return nucleotides.codon_table[codon]


 def dist_closest_match(codon, indices):
    aa = nucleotides.codon_table[codon]
    possible_codons = nucleotides.amino_table[aa]
    sli = slice(indices[0], indices[-1]+1)
    if all([codon[sli] in pos[sli] for pos in possible_codons]):
        return -1
    else:
        return min_edit_dist(codon[sli], [i[sli] for i in possible_codons])


 def min_edit_dist(string, possibles):
    # remove duplicates
    possibles = set(i for i in possibles if i != string)
    if len(possibles) > 0:
        # find minimum change to convert string to a possible
        min_dist = min(hamming_dist(i, string) for i in possibles)
        return min_dist
    else:
        return -1


 def n_changes(sequence, intervals):
    """
    Parameters:
    -----------
    sequence: string
        amino acid sequence
    intervals: list of tuples
        list of interval start and stop indices
    Returns:
    --------
    int: minimum number of changes
    """
    n_changes = 0
    codon_sequence = list(split_into_codons(sequence))
    for interval in intervals:
        seq_dict = interval_to_codon_and_range(sequence, interval)
        for seq, range_ in seq_dict.items():
            dist = dist_closest_match(seq, range_)
            if dist != -1:
                n_changes += dist
    if n_changes == 0:
        n_changes = -1
    return n_changes



 pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
 sequences = parse_input(sys.argv[1])
 run_seq = []
 for subdict in sequences.values():
    for sequence, intervals in subdict.items():
        run_seq.append([sequence, intervals])

 ans = pool.starmap(n_changes, run_seq)
 for i in ans:
    print(i)
	codon_table = {
	'AUA':'I', 'AUC':'I', 'AUU':'I', 'AUG':'M',
	'ACA':'U', 'ACC':'U', 'ACG':'U', 'ACU':'U',
	'AAC':'N', 'AAU':'N', 'AAA':'K', 'AAG':'K',
	'AGC':'S', 'AGU':'S', 'AGA':'R', 'AGG':'R',
	'CUA':'L', 'CUC':'L', 'CUG':'L', 'CUU':'L',
	'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCU':'P',
	'CAC':'H', 'CAU':'H', 'CAA':'Q', 'CAG':'Q',
	'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGU':'R',
	'GUA':'V', 'GUC':'V', 'GUG':'V', 'GUU':'V',
	'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCU':'A',
	'GAC':'D', 'GAU':'D', 'GAA':'E', 'GAG':'E',
	'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGU':'G',
	'UCA':'S', 'UCC':'S', 'UCG':'S', 'UCU':'S',
	'UUC':'F', 'UUU':'F', 'UUA':'L', 'UUG':'L',
	'UAC':'Y', 'UAU':'Y', 'UAA':'.', 'UAG':'.',
	'UGC':'C', 'UGU':'C', 'UGA':'.', 'UGG':'W'
	}


	amino_table = {
	"I": ["AUU", "AUC", "AUA"],
	"L": ["CUU", "CUC", "CUA", "CUG", "UUA", "UUG"],
	"V": ["GUU", "GUC", "GUA", "GUG"],
	"F": ["UUU", "UUC"],
	"M": ["AUG"],
	"C": ["UGU", "UGC"],
	"A": ["GCU", "GCC," "GCA", "GCG"],
	"G": ["GGU", "GGC", "GGA", "GGG"],
	"P": ["CCU", "CCC", "CCA", "CCG"],
	"T": ["ACU", "ACC", "ACA", "ACG"],
	"S": ["UCU", "UCC", "UCA", "UCG", "AGU", "AGC"],
	"Y": ["UAU", "UAC"],
	"W": ["UGG"],
	"Q": ["CAA", "CAG"],
	"N": ["AAU", "AAC"],
	"H": ["CAU", "CAC"],
	"E": ["GAA", "GAG"],
	"D": ["GAU", "GAC"],
	"K": ["AAA", "AAG"],
	"R": ["CGU", "CGC", "CGA", "CGG", "AGA", "AGG"],
	".": ["UAA", "UAG", "UGA"]
	}
	import sys
	from collections import OrderedDict
	import multiprocessing
	import nucleotides


	def parse_input(file_path):
	"""parse input into a dictionary {sequence: [(start, stop)]}"""
	# NOTE: changed input to 0-indexed
	lines = [l.strip() for l in open(file_path).readlines()]
	n_entires = int(lines.pop(0))
	output = OrderedDict()
	count = -1
	for line in lines:
	if line.startswith("AUG"):
	name = line
	count += 1
	output[str(count)] = {name: []}
	else:
	if " " in line:
	line = tuple(map(lambda x: int(x)-1, line.split(" ")))
	output[str(count)][name].append(line)
	return output


	def hamming_dist(s, t):
	return sum(i != j for i, j in zip(s, t))


	def not_possible(amino_acids):
	"""possible to change the codon without altering the amino acid"""
	return all(aa in ["M", "W"] for aa in amino_acids)


	def split_into_codons(sequence):
	for i in range(0, len(sequence), 3):
	yield sequence[0 + i: 3 + i]


	def find_position(index):
	"""return codon position, position in codon
	e.g.
	\|AUG\|GAU\|UAC\|AAA\|
	\| 0 \| 1 \| 2 \| 3 \| => codon position
	\|012\|012\|012\|012\| => position in codon
	"""
	return index // 3


	def pos_nuc(index):
	return index % 3


	def interval_to_codon_and_range(sequence, interval):
	"""
	docstring
	"""
	cache = {}
	codon_sequence = list(split_into_codons(sequence))
	start, stop = interval
	codon_idxs = []
	nuc_idxs = []
	for i in range(start, stop+1):
	codon = codon_sequence[find_position(i)]
	if codon in cache:
	pass
	else:
	cache[codon] = []
	cache[codon].append(pos_nuc(i))
	return cache



	def translate(codon):
	return nucleotides.codon_table[codon]


	def dist_closest_match(codon, indices):
	aa = nucleotides.codon_table[codon]
	possible_codons = nucleotides.amino_table[aa]
	sli = slice(indices[0], indices[-1]+1)
	if all([codon[sli] in pos[sli] for pos in possible_codons]):
	return -1
	else:
	return min_edit_dist(codon[sli], [i[sli] for i in possible_codons])


	def min_edit_dist(string, possibles):
	# remove duplicates
	possibles = set(i for i in possibles if i != string)
	if len(possibles) > 0:
	# find minimum change to convert string to a possible
	min_dist = min(hamming_dist(i, string) for i in possibles)
	return min_dist
	else:
	return -1


	def n_changes(sequence, intervals):
	"""
	Parameters:
	-----------
	sequence: string
	amino acid sequence
	intervals: list of tuples
	list of interval start and stop indices
	Returns:
	--------
	int: minimum number of changes
	"""
	n_changes = 0
	codon_sequence = list(split_into_codons(sequence))
	for interval in intervals:
	seq_dict = interval_to_codon_and_range(sequence, interval)
	for seq, range_ in seq_dict.items():
	dist = dist_closest_match(seq, range_)
	if dist != -1:
	n_changes += dist
	if n_changes == 0:
	n_changes = -1
	return n_changes



	pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
	sequences = parse_input(sys.argv[1])
	run_seq = []
	for subdict in sequences.values():
	for sequence, intervals in subdict.items():
	run_seq.append([sequence, intervals])

	ans = pool.starmap(n_changes, run_seq)
	for i in ans:
	print(i)