Last active
February 27, 2016 05:45
-
-
Save GiantDarth/dcc011ac762544a33a2b to your computer and use it in GitHub Desktop.
Needleman-Wunsch vs Smith-Waterman
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
# Copyright (c) 2016 Christopher Philabaum | |
# CS599 - HW3 Needleman-Wunsch & Smith-Waterman | |
import random, datetime | |
class Alignment: | |
def __init__(self, a, b, align): | |
self._a = a | |
self._b = b | |
self._align = align | |
def __str__(self): | |
return self._a + '\n' + self._align + '\n' + self._b | |
NO_DIRECTION = '•' | |
UP_ARROW = '↑' | |
LEFT_ARROW = '←' | |
DIAGONAL_ARROW = '↖' | |
def needleman_wunsch(a, b, sim_table, gap_penalty): | |
# Initialize grid | |
grid = [] | |
traceback = [] | |
num_rows = len(a) + 1 | |
num_cols = len(b) + 1 | |
for i in range(num_rows): | |
grid.append([0] * num_cols) | |
traceback.append([''] * num_cols) | |
# Set first row based on gaps | |
for i in range(num_rows): | |
grid[i][0] = i * gap_penalty | |
# Set first col based on gaps | |
for j in range(num_cols): | |
grid[0][j] = j * gap_penalty | |
# Set initial traceback | |
traceback[0][0] = NO_DIRECTION | |
for i in range(1, num_rows): | |
traceback[i][0] = UP_ARROW | |
for j in range(1, num_cols): | |
traceback[0][j] = LEFT_ARROW | |
ARROWS = (DIAGONAL_ARROW, UP_ARROW, LEFT_ARROW) | |
for i in range(1, num_rows): | |
for j in range(1, num_cols): | |
# A faster way of getting the max and the max's index without multiple checks | |
# Combines ('↖', '↑', '←') & the previous values, produces: | |
# (('↖', val of diag), ('↑', val of above), ('←', val of left) | |
# and only maxes by the second items. | |
# Doesn't use touch memory unnecessarily, and does no additional checks | |
traceback[i][j], grid[i][j] = max( | |
zip(ARROWS, (grid[i-1][j-1] + sim_table[a[i-1] + b[j-1]], | |
grid[i-1][j] + gap_penalty, | |
grid[i][j-1] + gap_penalty)), | |
key=lambda p: p[1] | |
) | |
new_a = '' | |
new_b = '' | |
align = '' | |
x = num_rows - 1 | |
y = num_cols - 1 | |
while traceback[x][y] != NO_DIRECTION: | |
if x > 0 and y > 0 and traceback[x][y] == DIAGONAL_ARROW: | |
new_a = a[x-1] + new_a | |
new_b = b[y-1] + new_b | |
if a[x-1] == b[y-1]: | |
align = '|' + align | |
else: | |
align = 'X' + align | |
x -= 1 | |
y -= 1 | |
elif x > 0 and traceback[x][y] == UP_ARROW: | |
new_a = a[x-1] + new_a | |
new_b = '_' + new_b | |
align = ' ' + align | |
x -= 1 | |
else: | |
new_b = b[y-1] + new_b | |
new_a = '_' + new_a | |
align = ' ' + align | |
y -= 1 | |
return Alignment(new_a, new_b, align), grid, traceback | |
def smith_waterman(a, b, sim_table, gap_penalty): | |
# Initialize grid | |
grid = [] | |
traceback = [] | |
num_rows = len(a) + 1 | |
num_cols = len(b) + 1 | |
for i in range(num_rows): | |
grid.append([0] * num_cols) | |
traceback.append([''] * num_cols) | |
# Set initial traceback | |
traceback[0][0] = NO_DIRECTION | |
for i in range(1, num_rows): | |
traceback[i][0] = NO_DIRECTION | |
for j in range(1, num_cols): | |
traceback[0][j] = NO_DIRECTION | |
ARROWS = (NO_DIRECTION, DIAGONAL_ARROW, UP_ARROW, LEFT_ARROW) | |
for i in range(1, num_rows): | |
for j in range(1, num_cols): | |
# A faster way of getting the max and the max's index without multiple checks | |
# Combines ('↖', '↑', '←') & the previous values, produces: | |
# (('↖', val of diag), ('↑', val of above), ('←', val of left) | |
# and only maxes by the second items. | |
# Doesn't use touch memory unnecessarily, and does no additional checks | |
traceback[i][j], grid[i][j] = max( | |
zip(ARROWS, (0, | |
grid[i-1][j-1] + sim_table[a[i-1] + b[j-1]], | |
grid[i-1][j] + gap_penalty, | |
grid[i][j-1] + gap_penalty)), | |
key=lambda p: p[1] | |
) | |
new_a = '' | |
new_b = '' | |
align = '' | |
x = num_rows - 1 | |
y = num_cols - 1 | |
while traceback[x][y] != NO_DIRECTION: | |
if x > 0 and y > 0 and traceback[x][y] == DIAGONAL_ARROW: | |
new_a = a[x-1] + new_a | |
new_b = b[y-1] + new_b | |
if a[x-1] == b[y-1]: | |
align = '|' + align | |
else: | |
align = 'X' + align | |
x -= 1 | |
y -= 1 | |
elif x > 0 and traceback[x][y] == UP_ARROW: | |
new_a = a[x-1] + new_a | |
new_b = '_' + new_b | |
align = ' ' + align | |
x -= 1 | |
else: | |
new_b = b[y-1] + new_b | |
new_a = '_' + new_a | |
align = ' ' + align | |
y -= 1 | |
return Alignment(new_a, new_b, align), grid, traceback | |
def generate_sequence(mer): | |
seq = '' | |
for m in range(mer): | |
seq += random.choice('ACGT') | |
return seq | |
def generate_sequences(k, mer): | |
seqs = [] | |
for i in range(k): | |
seqs.append(generate_sequence(mer)) | |
return seqs | |
if __name__ == '__main__': | |
sim_table = dict() | |
sim_table["AA"] = 1 | |
sim_table["CC"] = 1 | |
sim_table["GG"] = 1 | |
sim_table["TT"] = 1 | |
sim_table["AC"] = -1 | |
sim_table["AG"] = -1 | |
sim_table["AT"] = -1 | |
sim_table["CA"] = -1 | |
sim_table["CG"] = -1 | |
sim_table["CT"] = -1 | |
sim_table["GA"] = -1 | |
sim_table["GC"] = -1 | |
sim_table["GT"] = -1 | |
sim_table["TA"] = -1 | |
sim_table["TC"] = -1 | |
sim_table["TG"] = -1 | |
initial_k = 1000 | |
mer_len = 100 | |
# Test Run | |
print("Needleman-Wunsch Test Run: ") | |
test_seq = "GCATGCT" | |
test_query = "GATTACA" | |
print("Seq: ", test_seq) | |
print("Qry: ", test_query) | |
print() | |
test_res = needleman_wunsch(test_seq, test_query, sim_table, -1) | |
print(test_res[1]) | |
print(test_res[2]) | |
print(test_res[0]) | |
print() | |
print("Smith-Waterman Test Run: ") | |
test_seq = "GCATGCT" | |
test_query = "GATTACA" | |
print("Seq: ", test_seq) | |
print("Qry: ", test_query) | |
print() | |
test_res = smith_waterman(test_seq, test_query, sim_table, -1) | |
print(test_res[1]) | |
print(test_res[2]) | |
print(test_res[0]) | |
print() | |
# Problem 1 A/B | |
print("Problem 1") | |
for i in range(4): | |
k = initial_k * 10**i | |
print("Running {} {}-mer sequences".format(k, mer_len)) | |
seqs = generate_sequences(k, mer_len) | |
query = generate_sequence(mer_len) | |
t = datetime.datetime.now() | |
for j in range(k): | |
needleman_wunsch(seqs[j], query, sim_table, -3) | |
t = datetime.datetime.now() - t | |
print("{}s".format(t.seconds)) | |
print("Done") | |
# Problem 2 A/B | |
print("Problem 2") | |
for i in range(4): | |
k = initial_k * 10**i | |
print("Running {} {}-mer sequences".format(k, mer_len)) | |
seqs = generate_sequences(k, mer_len) | |
query = generate_sequence(mer_len) | |
t = datetime.datetime.now() | |
for j in range(k): | |
smith_waterman(seqs[j], query, sim_table, -3) | |
t = datetime.datetime.now() - t | |
print("{}s".format(t.seconds)) | |
print("Done") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Problem 1 (Needleman-Wunsch): | |
1k: 14s | |
10k: 146s = 2m26s | |
100k: Projected 1460s = 24m20s | |
1000k: Projected 14600s = 243m20s | |
Problem 2 (Smith-Waterman): | |
1k: 12s | |
10k: 125 = 2m5s | |
100k: Projected 1250s = 20m50s | |
1000k: Projected 12500s = 208m20s |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment