turtlemonvh · January 22, 2016 17:52 · turtlemonvh · Jan 22, 2016
diff --git a/sequence_alignment.md b/sequence_alignment.md
diff --git a/sequence_alignment.py b/sequence_alignment.py
 """
 Basic alignment of sequences (lists) of items
 Alpha for each alignment is (will be) configurable

 An "alignment" is a list of lists + a score

 Sequence alignment videos:
 https://www.youtube.com/watch?v=M0yn555erqo&index=12&list=PLAwxTw4SYaPkbWSEj_1iO7rILlWDJImW4

 """

 import copy
 import sys

 # Map of tuple lengths (lenA, lenB) to (alignment, cost) tupes
 SUB_ALIGNS = {}

 def alpha(a, b):
    return 1.5 if a != b else 0

 def print_sequence(seq, is_chars):
    (a_align, b_align), total_cost = seq
    if is_chars:
        trans = lambda i: i if i != None else '-'
    else:
        trans = lambda i: i
    print [trans(i) for i in a_align]
    print [trans(i) for i in b_align]
    print "TOTAL COST: ", total_cost

 def align(A, B, debug=False):
    # Return the calculation result

    if debug:
        print "ALIGN CALLED WITH:"
        print "\tA: ", A
        print "\tB: ", B

    # Cached cases
    cache_key = (len(A), len(B))
    r = SUB_ALIGNS.get(cache_key, None)
    if r is not None:
        # Fetch a copy, not the original
        return copy.deepcopy(r)

    # Boundary cases
    if len(A) == 0:
        tmp_result = [[None]*len(B), B], len(B)
        SUB_ALIGNS[cache_key] = copy.deepcopy(tmp_result)
        return copy.deepcopy(tmp_result)
    if len(B) == 0:
        tmp_result = [A, [None]*len(A)], len(A)
        SUB_ALIGNS[cache_key] = copy.deepcopy(tmp_result)
        return copy.deepcopy(tmp_result)

    # The N+1 solution
    NW_alignment, NW_cost = align(A[:-1], B[:-1])
    N_alignment, N_cost = align(A, B[:-1])
    W_alignment, W_cost = align(A[:-1], B)

    # Cost for each option
    NW_total_cost = NW_cost + alpha(A[-1], B[-1])
    N_total_cost = N_cost + 1
    W_total_cost = W_cost + 1

    # Exit early instead of returning multiple options
    min_total_cost = min(NW_total_cost, N_total_cost, W_total_cost)
    if min_total_cost == NW_total_cost:
        if debug:
            print "NW_alignment" + "="*50
        a_align, b_align = NW_alignment
        a_align.append(A[-1])
        b_align.append(B[-1])
    elif min_total_cost == N_total_cost:
        if debug:
            print "N_alignment" + "="*50
        a_align, b_align = N_alignment
        a_align.append(None)
        b_align.append(B[-1])
    elif min_total_cost == W_total_cost:
        if debug:
            print "W_alignment" + "="*50
        a_align, b_align = W_alignment
        a_align.append(A[-1])
        b_align.append(None)

    # Make sure we're only adding None values to sequences and not performing any other mods
    assert(len(A) == len([i for i in a_align if i != None]))
    assert(len(B) == len([i for i in b_align if i != None]))
    assert(len(a_align) == len(b_align))

    if debug:
        print "A: ", A
        print "B: ", B
        print "a_align: ", a_align
        print "b_align: ", b_align
        print "SETTING CACHE VALUE: ", cache_key

    tmp_result = [a_align, b_align], min_total_cost

    # Memoize
    # We're using append, so make sure we return a new copy
    SUB_ALIGNS[cache_key] = copy.deepcopy(tmp_result)

    return copy.deepcopy(tmp_result)

 def print_path(A, B):
    """Print the path of transitions
    """
    # Work backwards to find the path taken
    tuples = []
    current_state = (len(A), len(B))
    current_cost = SUB_ALIGNS
    while True:
        tuples.append(current_state)
        cx, cy = current_state
        NW = cx - 1, cy - 1
        NW_cost = SUB_ALIGNS.get(NW, [0,-1])[1]
        W = cx - 1, cy
        W_cost = SUB_ALIGNS.get(W, [0,-1])[1]
        N = cx, cy - 1
        N_cost = SUB_ALIGNS.get(N, [0,-1])[1]

        min_state = min(NW_cost, W_cost, N_cost)
        if min_state == -1:
            break

        if NW_cost == min_state:
            current_state = NW
        elif W_cost == min_state:
            current_state = W
        elif N_cost == min_state:
            current_state = N

    print "\n(PATH)" + "="*50
    for x in range(len(A) + 1):
        s = ""
        for y in range(len(B) + 1):
            s += "X " if (x, y) in tuples else "O "
        print s

 def test():
    s1 = ["A", "B", "B", "A"]
    s2 = ["A", "A"]
    seq = align(s1, s2)
    print_sequence(seq)

    # OK
    s1 = ["A", "B"]
    s2 = ["A", "A"]
    seq = align(s1, s2)
    print_sequence(seq)

    # OK
    s1 = ["B", "A", "A"]
    s2 = ["A", "A"]
    seq = align(s1, s2)
    print_sequence(seq)

    # OK
    s1 = ["B", "A", "A"]
    s2 = ["A", "B"]
    seq = align(s1, s2)
    print_sequence(seq)


 if __name__ == "__main__":
    # Do it with strings on the command line
    s1, s2 = list(sys.argv[1]), list(sys.argv[2])
    seq = align(s1, s2)
    print_sequence(seq, True)

    # Everything is covered; edges aren't saved
    print_path(s1, s2)
	"""
	Basic alignment of sequences (lists) of items
	Alpha for each alignment is (will be) configurable

	An "alignment" is a list of lists + a score

	Sequence alignment videos:
	https://www.youtube.com/watch?v=M0yn555erqo&index=12&list=PLAwxTw4SYaPkbWSEj_1iO7rILlWDJImW4

	"""

	import copy
	import sys

	# Map of tuple lengths (lenA, lenB) to (alignment, cost) tupes
	SUB_ALIGNS = {}

	def alpha(a, b):
	return 1.5 if a != b else 0

	def print_sequence(seq, is_chars):
	(a_align, b_align), total_cost = seq
	if is_chars:
	trans = lambda i: i if i != None else '-'
	else:
	trans = lambda i: i
	print [trans(i) for i in a_align]
	print [trans(i) for i in b_align]
	print "TOTAL COST: ", total_cost

	def align(A, B, debug=False):
	# Return the calculation result

	if debug:
	print "ALIGN CALLED WITH:"
	print "\tA: ", A
	print "\tB: ", B

	# Cached cases
	cache_key = (len(A), len(B))
	r = SUB_ALIGNS.get(cache_key, None)
	if r is not None:
	# Fetch a copy, not the original
	return copy.deepcopy(r)

	# Boundary cases
	if len(A) == 0:
	tmp_result = [[None]*len(B), B], len(B)
	SUB_ALIGNS[cache_key] = copy.deepcopy(tmp_result)
	return copy.deepcopy(tmp_result)
	if len(B) == 0:
	tmp_result = [A, [None]*len(A)], len(A)
	SUB_ALIGNS[cache_key] = copy.deepcopy(tmp_result)
	return copy.deepcopy(tmp_result)

	# The N+1 solution
	NW_alignment, NW_cost = align(A[:-1], B[:-1])
	N_alignment, N_cost = align(A, B[:-1])
	W_alignment, W_cost = align(A[:-1], B)

	# Cost for each option
	NW_total_cost = NW_cost + alpha(A[-1], B[-1])
	N_total_cost = N_cost + 1
	W_total_cost = W_cost + 1

	# Exit early instead of returning multiple options
	min_total_cost = min(NW_total_cost, N_total_cost, W_total_cost)
	if min_total_cost == NW_total_cost:
	if debug:
	print "NW_alignment" + "="*50
	a_align, b_align = NW_alignment
	a_align.append(A[-1])
	b_align.append(B[-1])
	elif min_total_cost == N_total_cost:
	if debug:
	print "N_alignment" + "="*50
	a_align, b_align = N_alignment
	a_align.append(None)
	b_align.append(B[-1])
	elif min_total_cost == W_total_cost:
	if debug:
	print "W_alignment" + "="*50
	a_align, b_align = W_alignment
	a_align.append(A[-1])
	b_align.append(None)

	# Make sure we're only adding None values to sequences and not performing any other mods
	assert(len(A) == len([i for i in a_align if i != None]))
	assert(len(B) == len([i for i in b_align if i != None]))
	assert(len(a_align) == len(b_align))

	if debug:
	print "A: ", A
	print "B: ", B
	print "a_align: ", a_align
	print "b_align: ", b_align
	print "SETTING CACHE VALUE: ", cache_key

	tmp_result = [a_align, b_align], min_total_cost

	# Memoize
	# We're using append, so make sure we return a new copy
	SUB_ALIGNS[cache_key] = copy.deepcopy(tmp_result)

	return copy.deepcopy(tmp_result)

	def print_path(A, B):
	"""Print the path of transitions
	"""
	# Work backwards to find the path taken
	tuples = []
	current_state = (len(A), len(B))
	current_cost = SUB_ALIGNS
	while True:
	tuples.append(current_state)
	cx, cy = current_state
	NW = cx - 1, cy - 1
	NW_cost = SUB_ALIGNS.get(NW, [0,-1])[1]
	W = cx - 1, cy
	W_cost = SUB_ALIGNS.get(W, [0,-1])[1]
	N = cx, cy - 1
	N_cost = SUB_ALIGNS.get(N, [0,-1])[1]

	min_state = min(NW_cost, W_cost, N_cost)
	if min_state == -1:
	break

	if NW_cost == min_state:
	current_state = NW
	elif W_cost == min_state:
	current_state = W
	elif N_cost == min_state:
	current_state = N

	print "\n(PATH)" + "="*50
	for x in range(len(A) + 1):
	s = ""
	for y in range(len(B) + 1):
	s += "X " if (x, y) in tuples else "O "
	print s

	def test():
	s1 = ["A", "B", "B", "A"]
	s2 = ["A", "A"]
	seq = align(s1, s2)
	print_sequence(seq)

	# OK
	s1 = ["A", "B"]
	s2 = ["A", "A"]
	seq = align(s1, s2)
	print_sequence(seq)

	# OK
	s1 = ["B", "A", "A"]
	s2 = ["A", "A"]
	seq = align(s1, s2)
	print_sequence(seq)

	# OK
	s1 = ["B", "A", "A"]
	s2 = ["A", "B"]
	seq = align(s1, s2)
	print_sequence(seq)


	if __name__ == "__main__":
	# Do it with strings on the command line
	s1, s2 = list(sys.argv[1]), list(sys.argv[2])
	seq = align(s1, s2)
	print_sequence(seq, True)

	# Everything is covered; edges aren't saved
	print_path(s1, s2)