Skip to content

Instantly share code, notes, and snippets.

@turtlemonvh
Last active January 22, 2016 17:52
Show Gist options
  • Save turtlemonvh/c77ab72d9a2bcbd55a08 to your computer and use it in GitHub Desktop.
Save turtlemonvh/c77ab72d9a2bcbd55a08 to your computer and use it in GitHub Desktop.
Sequence alignment

Sequence alignment

While watching some videos from Udacity describing sequence alignment I was having a bit of a hard time understanding the algorithm as it was explained, so I decided to write it in code.

This is a fairly simple approach and uses recursion (so stack depth may be a problem for larger sequences), but it does use dynamic programming to keep things relatively fast. The code uses the same N, NW, W notation the video uses to describe the problem graph.

If there are branches in the problem graph (i.e. 2 alignments are have the same score) the program just takes the first solution.

Sequence alignment video series (part of the "Computability Complexity and Algorithms" class on Udacity): https://www.youtube.com/watch?v=3NLYH44F6SU&index=5&list=PLAwxTw4SYaPkbWSEj_1iO7rILlWDJImW4

Example Output

$ python sequence_alignment.py ABBBB AABBAAAAAAAA
['-', 'A', 'B', 'B', '-', '-', '-', '-', '-', '-', 'B', 'B']
['A', 'A', 'B', 'B', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A']
TOTAL COST:  10.0

(PATH)==================================================
X O O O O O O O O O O O O
O X X O O O O O O O O O O
O O O X O O O O O O O O O
O O O O X X X X X X X O O
O O O O O O O O O O O X O
O O O O O O O O O O O O X
"""
Basic alignment of sequences (lists) of items
Alpha for each alignment is (will be) configurable
An "alignment" is a list of lists + a score
Sequence alignment videos:
https://www.youtube.com/watch?v=M0yn555erqo&index=12&list=PLAwxTw4SYaPkbWSEj_1iO7rILlWDJImW4
"""
import copy
import sys
# Map of tuple lengths (lenA, lenB) to (alignment, cost) tupes
SUB_ALIGNS = {}
def alpha(a, b):
return 1.5 if a != b else 0
def print_sequence(seq, is_chars):
(a_align, b_align), total_cost = seq
if is_chars:
trans = lambda i: i if i != None else '-'
else:
trans = lambda i: i
print [trans(i) for i in a_align]
print [trans(i) for i in b_align]
print "TOTAL COST: ", total_cost
def align(A, B, debug=False):
# Return the calculation result
if debug:
print "ALIGN CALLED WITH:"
print "\tA: ", A
print "\tB: ", B
# Cached cases
cache_key = (len(A), len(B))
r = SUB_ALIGNS.get(cache_key, None)
if r is not None:
# Fetch a copy, not the original
return copy.deepcopy(r)
# Boundary cases
if len(A) == 0:
tmp_result = [[None]*len(B), B], len(B)
SUB_ALIGNS[cache_key] = copy.deepcopy(tmp_result)
return copy.deepcopy(tmp_result)
if len(B) == 0:
tmp_result = [A, [None]*len(A)], len(A)
SUB_ALIGNS[cache_key] = copy.deepcopy(tmp_result)
return copy.deepcopy(tmp_result)
# The N+1 solution
NW_alignment, NW_cost = align(A[:-1], B[:-1])
N_alignment, N_cost = align(A, B[:-1])
W_alignment, W_cost = align(A[:-1], B)
# Cost for each option
NW_total_cost = NW_cost + alpha(A[-1], B[-1])
N_total_cost = N_cost + 1
W_total_cost = W_cost + 1
# Exit early instead of returning multiple options
min_total_cost = min(NW_total_cost, N_total_cost, W_total_cost)
if min_total_cost == NW_total_cost:
if debug:
print "NW_alignment" + "="*50
a_align, b_align = NW_alignment
a_align.append(A[-1])
b_align.append(B[-1])
elif min_total_cost == N_total_cost:
if debug:
print "N_alignment" + "="*50
a_align, b_align = N_alignment
a_align.append(None)
b_align.append(B[-1])
elif min_total_cost == W_total_cost:
if debug:
print "W_alignment" + "="*50
a_align, b_align = W_alignment
a_align.append(A[-1])
b_align.append(None)
# Make sure we're only adding None values to sequences and not performing any other mods
assert(len(A) == len([i for i in a_align if i != None]))
assert(len(B) == len([i for i in b_align if i != None]))
assert(len(a_align) == len(b_align))
if debug:
print "A: ", A
print "B: ", B
print "a_align: ", a_align
print "b_align: ", b_align
print "SETTING CACHE VALUE: ", cache_key
tmp_result = [a_align, b_align], min_total_cost
# Memoize
# We're using append, so make sure we return a new copy
SUB_ALIGNS[cache_key] = copy.deepcopy(tmp_result)
return copy.deepcopy(tmp_result)
def print_path(A, B):
"""Print the path of transitions
"""
# Work backwards to find the path taken
tuples = []
current_state = (len(A), len(B))
current_cost = SUB_ALIGNS
while True:
tuples.append(current_state)
cx, cy = current_state
NW = cx - 1, cy - 1
NW_cost = SUB_ALIGNS.get(NW, [0,-1])[1]
W = cx - 1, cy
W_cost = SUB_ALIGNS.get(W, [0,-1])[1]
N = cx, cy - 1
N_cost = SUB_ALIGNS.get(N, [0,-1])[1]
min_state = min(NW_cost, W_cost, N_cost)
if min_state == -1:
break
if NW_cost == min_state:
current_state = NW
elif W_cost == min_state:
current_state = W
elif N_cost == min_state:
current_state = N
print "\n(PATH)" + "="*50
for x in range(len(A) + 1):
s = ""
for y in range(len(B) + 1):
s += "X " if (x, y) in tuples else "O "
print s
def test():
s1 = ["A", "B", "B", "A"]
s2 = ["A", "A"]
seq = align(s1, s2)
print_sequence(seq)
# OK
s1 = ["A", "B"]
s2 = ["A", "A"]
seq = align(s1, s2)
print_sequence(seq)
# OK
s1 = ["B", "A", "A"]
s2 = ["A", "A"]
seq = align(s1, s2)
print_sequence(seq)
# OK
s1 = ["B", "A", "A"]
s2 = ["A", "B"]
seq = align(s1, s2)
print_sequence(seq)
if __name__ == "__main__":
# Do it with strings on the command line
s1, s2 = list(sys.argv[1]), list(sys.argv[2])
seq = align(s1, s2)
print_sequence(seq, True)
# Everything is covered; edges aren't saved
print_path(s1, s2)
@turtlemonvh
Copy link
Author

Example usage:

$ python sequence_alignment.py ABBBBAAA AABBAAAA
['A', 'B', 'B', 'B', 'B', 'A', 'A', 'A']
['A', 'A', 'B', 'B', 'A', 'A', 'A', 'A']
TOTAL COST:  3.0

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment