Skip to content

Instantly share code, notes, and snippets.

@mzdravkov
Created October 10, 2022 13:12
Show Gist options
  • Save mzdravkov/8ebc4cc1047411cd3a16de71c5b0c22f to your computer and use it in GitHub Desktop.
Save mzdravkov/8ebc4cc1047411cd3a16de71c5b0c22f to your computer and use it in GitHub Desktop.
Needleman-Wunsch algorithm in Python
import sys
import pprint
from collections import defaultdict
if len(sys.argv) != 3:
print("Please pass two sequences as separate arguments. E.g. ./program TCAT CGAT")
exit(1)
seq1 = sys.argv[1].upper()
seq2 = sys.argv[2].upper()
# Matrix to keep the scores in.
matrix = []
# Substitution matrix for DNA
sub_matrix = {
'A': {
'A': 10,
'C': -5,
'G': 0,
'T': -5,
},
'C': {
'A': -5,
'C': 10,
'G': -5,
'T': 0,
},
'G': {
'A': 0,
'C': -5,
'G': 10,
'T': -5,
},
'T': {
'A': -5,
'C': 0,
'G': -5,
'T': 10,
},
}
# A map from position to a list of possible previous steps leading to it.
backsteps = defaultdict(lambda: [])
GAP_PENALTY = -4
for i in range(len(seq2) + 1):
matrix.append([])
for j in range(len(seq1) + 1):
# Top-left corner is initialized with score 0
if i == 0 and j == 0:
matrix[i].append(0)
continue
# First row initialization uses only the values to the left
if i == 0:
matrix[i].append(matrix[i][j-1] + GAP_PENALTY)
backsteps[(i, j)].append((i, j-1))
continue
# First column initialization uses only the values above
if j == 0:
matrix[i].append(matrix[i-1][j] + GAP_PENALTY)
backsteps[(i, j)].append((i-1, j))
continue
sub_score = matrix[i-1][j-1] + sub_matrix[seq2[i-1]][seq1[j-1]]
seq1_gap_score = matrix[i-1][j] + GAP_PENALTY
seq2_gap_score = matrix[i][j-1] + GAP_PENALTY
max_score = max(sub_score, seq1_gap_score, seq2_gap_score)
matrix[i].append(max_score)
if max_score == sub_score:
backsteps[(i, j)].append((i-1, j-1))
if max_score == seq1_gap_score:
backsteps[(i, j)].append((i-1, j))
if max_score == seq2_gap_score:
backsteps[(i, j)].append((i, j-1))
# Backtrack
# We start from the bottom-right corner of the matrix and look for possible
# routes that lead to it.
start = (len(seq2), len(seq1))
def backtrack(pos):
"""Recursively finds all possible alternative (equivalent) paths
leading to a specific position."""
if pos == (0, 0):
return [[(0, 0)]]
paths = []
for possible_prev_step in backsteps[pos]:
for possible_backtrack in backtrack(possible_prev_step):
path = possible_backtrack + [pos]
paths.append(path)
return paths
print("Score matrix:")
print('\n'.join(' '.join('{0: >5}'.format(str(x)) for x in row) for row in matrix))
print("\nAlternative backsteps:")
pprint.pprint(backsteps)
print("\nAlternative paths:")
alternative_paths = backtrack(start)
pprint.pprint(alternative_paths)
print("\nAlternative alignments:")
for path in alternative_paths:
print('=======================')
align1, align2 = "", ""
seq1_counter = 0
seq2_counter = 0
for i in range(len(path) - 1):
ri, ci = path[i]
rn, cn = path[i+1]
seq1_diff = cn - ci
seq2_diff = rn - ri
if seq1_diff == 0:
align1 += '-'
else:
align1 += seq1[seq1_counter]
seq1_counter += 1
if seq2_diff == 0:
align2 += '-'
else:
align2 += seq2[seq2_counter]
seq2_counter += 1
print(align1)
print(align2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment