Created
October 10, 2022 13:12
-
-
Save mzdravkov/8ebc4cc1047411cd3a16de71c5b0c22f to your computer and use it in GitHub Desktop.
Needleman-Wunsch algorithm in Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import pprint | |
from collections import defaultdict | |
if len(sys.argv) != 3: | |
print("Please pass two sequences as separate arguments. E.g. ./program TCAT CGAT") | |
exit(1) | |
seq1 = sys.argv[1].upper() | |
seq2 = sys.argv[2].upper() | |
# Matrix to keep the scores in. | |
matrix = [] | |
# Substitution matrix for DNA | |
sub_matrix = { | |
'A': { | |
'A': 10, | |
'C': -5, | |
'G': 0, | |
'T': -5, | |
}, | |
'C': { | |
'A': -5, | |
'C': 10, | |
'G': -5, | |
'T': 0, | |
}, | |
'G': { | |
'A': 0, | |
'C': -5, | |
'G': 10, | |
'T': -5, | |
}, | |
'T': { | |
'A': -5, | |
'C': 0, | |
'G': -5, | |
'T': 10, | |
}, | |
} | |
# A map from position to a list of possible previous steps leading to it. | |
backsteps = defaultdict(lambda: []) | |
GAP_PENALTY = -4 | |
for i in range(len(seq2) + 1): | |
matrix.append([]) | |
for j in range(len(seq1) + 1): | |
# Top-left corner is initialized with score 0 | |
if i == 0 and j == 0: | |
matrix[i].append(0) | |
continue | |
# First row initialization uses only the values to the left | |
if i == 0: | |
matrix[i].append(matrix[i][j-1] + GAP_PENALTY) | |
backsteps[(i, j)].append((i, j-1)) | |
continue | |
# First column initialization uses only the values above | |
if j == 0: | |
matrix[i].append(matrix[i-1][j] + GAP_PENALTY) | |
backsteps[(i, j)].append((i-1, j)) | |
continue | |
sub_score = matrix[i-1][j-1] + sub_matrix[seq2[i-1]][seq1[j-1]] | |
seq1_gap_score = matrix[i-1][j] + GAP_PENALTY | |
seq2_gap_score = matrix[i][j-1] + GAP_PENALTY | |
max_score = max(sub_score, seq1_gap_score, seq2_gap_score) | |
matrix[i].append(max_score) | |
if max_score == sub_score: | |
backsteps[(i, j)].append((i-1, j-1)) | |
if max_score == seq1_gap_score: | |
backsteps[(i, j)].append((i-1, j)) | |
if max_score == seq2_gap_score: | |
backsteps[(i, j)].append((i, j-1)) | |
# Backtrack | |
# We start from the bottom-right corner of the matrix and look for possible | |
# routes that lead to it. | |
start = (len(seq2), len(seq1)) | |
def backtrack(pos): | |
"""Recursively finds all possible alternative (equivalent) paths | |
leading to a specific position.""" | |
if pos == (0, 0): | |
return [[(0, 0)]] | |
paths = [] | |
for possible_prev_step in backsteps[pos]: | |
for possible_backtrack in backtrack(possible_prev_step): | |
path = possible_backtrack + [pos] | |
paths.append(path) | |
return paths | |
print("Score matrix:") | |
print('\n'.join(' '.join('{0: >5}'.format(str(x)) for x in row) for row in matrix)) | |
print("\nAlternative backsteps:") | |
pprint.pprint(backsteps) | |
print("\nAlternative paths:") | |
alternative_paths = backtrack(start) | |
pprint.pprint(alternative_paths) | |
print("\nAlternative alignments:") | |
for path in alternative_paths: | |
print('=======================') | |
align1, align2 = "", "" | |
seq1_counter = 0 | |
seq2_counter = 0 | |
for i in range(len(path) - 1): | |
ri, ci = path[i] | |
rn, cn = path[i+1] | |
seq1_diff = cn - ci | |
seq2_diff = rn - ri | |
if seq1_diff == 0: | |
align1 += '-' | |
else: | |
align1 += seq1[seq1_counter] | |
seq1_counter += 1 | |
if seq2_diff == 0: | |
align2 += '-' | |
else: | |
align2 += seq2[seq2_counter] | |
seq2_counter += 1 | |
print(align1) | |
print(align2) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment