|
""" |
|
Basic alignment of sequences (lists) of items |
|
Alpha for each alignment is (will be) configurable |
|
|
|
An "alignment" is a list of lists + a score |
|
|
|
Sequence alignment videos: |
|
https://www.youtube.com/watch?v=M0yn555erqo&index=12&list=PLAwxTw4SYaPkbWSEj_1iO7rILlWDJImW4 |
|
|
|
""" |
|
|
|
import copy |
|
import sys |
|
|
|
# Map of tuple lengths (lenA, lenB) to (alignment, cost) tupes |
|
SUB_ALIGNS = {} |
|
|
|
def alpha(a, b): |
|
return 1.5 if a != b else 0 |
|
|
|
def print_sequence(seq, is_chars): |
|
(a_align, b_align), total_cost = seq |
|
if is_chars: |
|
trans = lambda i: i if i != None else '-' |
|
else: |
|
trans = lambda i: i |
|
print [trans(i) for i in a_align] |
|
print [trans(i) for i in b_align] |
|
print "TOTAL COST: ", total_cost |
|
|
|
def align(A, B, debug=False): |
|
# Return the calculation result |
|
|
|
if debug: |
|
print "ALIGN CALLED WITH:" |
|
print "\tA: ", A |
|
print "\tB: ", B |
|
|
|
# Cached cases |
|
cache_key = (len(A), len(B)) |
|
r = SUB_ALIGNS.get(cache_key, None) |
|
if r is not None: |
|
# Fetch a copy, not the original |
|
return copy.deepcopy(r) |
|
|
|
# Boundary cases |
|
if len(A) == 0: |
|
tmp_result = [[None]*len(B), B], len(B) |
|
SUB_ALIGNS[cache_key] = copy.deepcopy(tmp_result) |
|
return copy.deepcopy(tmp_result) |
|
if len(B) == 0: |
|
tmp_result = [A, [None]*len(A)], len(A) |
|
SUB_ALIGNS[cache_key] = copy.deepcopy(tmp_result) |
|
return copy.deepcopy(tmp_result) |
|
|
|
# The N+1 solution |
|
NW_alignment, NW_cost = align(A[:-1], B[:-1]) |
|
N_alignment, N_cost = align(A, B[:-1]) |
|
W_alignment, W_cost = align(A[:-1], B) |
|
|
|
# Cost for each option |
|
NW_total_cost = NW_cost + alpha(A[-1], B[-1]) |
|
N_total_cost = N_cost + 1 |
|
W_total_cost = W_cost + 1 |
|
|
|
# Exit early instead of returning multiple options |
|
min_total_cost = min(NW_total_cost, N_total_cost, W_total_cost) |
|
if min_total_cost == NW_total_cost: |
|
if debug: |
|
print "NW_alignment" + "="*50 |
|
a_align, b_align = NW_alignment |
|
a_align.append(A[-1]) |
|
b_align.append(B[-1]) |
|
elif min_total_cost == N_total_cost: |
|
if debug: |
|
print "N_alignment" + "="*50 |
|
a_align, b_align = N_alignment |
|
a_align.append(None) |
|
b_align.append(B[-1]) |
|
elif min_total_cost == W_total_cost: |
|
if debug: |
|
print "W_alignment" + "="*50 |
|
a_align, b_align = W_alignment |
|
a_align.append(A[-1]) |
|
b_align.append(None) |
|
|
|
# Make sure we're only adding None values to sequences and not performing any other mods |
|
assert(len(A) == len([i for i in a_align if i != None])) |
|
assert(len(B) == len([i for i in b_align if i != None])) |
|
assert(len(a_align) == len(b_align)) |
|
|
|
if debug: |
|
print "A: ", A |
|
print "B: ", B |
|
print "a_align: ", a_align |
|
print "b_align: ", b_align |
|
print "SETTING CACHE VALUE: ", cache_key |
|
|
|
tmp_result = [a_align, b_align], min_total_cost |
|
|
|
# Memoize |
|
# We're using append, so make sure we return a new copy |
|
SUB_ALIGNS[cache_key] = copy.deepcopy(tmp_result) |
|
|
|
return copy.deepcopy(tmp_result) |
|
|
|
def print_path(A, B): |
|
"""Print the path of transitions |
|
""" |
|
# Work backwards to find the path taken |
|
tuples = [] |
|
current_state = (len(A), len(B)) |
|
current_cost = SUB_ALIGNS |
|
while True: |
|
tuples.append(current_state) |
|
cx, cy = current_state |
|
NW = cx - 1, cy - 1 |
|
NW_cost = SUB_ALIGNS.get(NW, [0,-1])[1] |
|
W = cx - 1, cy |
|
W_cost = SUB_ALIGNS.get(W, [0,-1])[1] |
|
N = cx, cy - 1 |
|
N_cost = SUB_ALIGNS.get(N, [0,-1])[1] |
|
|
|
min_state = min(NW_cost, W_cost, N_cost) |
|
if min_state == -1: |
|
break |
|
|
|
if NW_cost == min_state: |
|
current_state = NW |
|
elif W_cost == min_state: |
|
current_state = W |
|
elif N_cost == min_state: |
|
current_state = N |
|
|
|
print "\n(PATH)" + "="*50 |
|
for x in range(len(A) + 1): |
|
s = "" |
|
for y in range(len(B) + 1): |
|
s += "X " if (x, y) in tuples else "O " |
|
print s |
|
|
|
def test(): |
|
s1 = ["A", "B", "B", "A"] |
|
s2 = ["A", "A"] |
|
seq = align(s1, s2) |
|
print_sequence(seq) |
|
|
|
# OK |
|
s1 = ["A", "B"] |
|
s2 = ["A", "A"] |
|
seq = align(s1, s2) |
|
print_sequence(seq) |
|
|
|
# OK |
|
s1 = ["B", "A", "A"] |
|
s2 = ["A", "A"] |
|
seq = align(s1, s2) |
|
print_sequence(seq) |
|
|
|
# OK |
|
s1 = ["B", "A", "A"] |
|
s2 = ["A", "B"] |
|
seq = align(s1, s2) |
|
print_sequence(seq) |
|
|
|
|
|
if __name__ == "__main__": |
|
# Do it with strings on the command line |
|
s1, s2 = list(sys.argv[1]), list(sys.argv[2]) |
|
seq = align(s1, s2) |
|
print_sequence(seq, True) |
|
|
|
# Everything is covered; edges aren't saved |
|
print_path(s1, s2) |
Example usage: