Created
October 30, 2017 13:36
-
-
Save cgoliver/e96a2043145e9ca7f672e054165bddc0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Nussinov RNA folding algorithm + recursive backtrack. Implemented by Carlos G. Oliver ## | |
import numpy as np | |
min_loop_length = 4 | |
def ss_to_bp(ss): | |
pairs = [] | |
stack = [] | |
for i, b in enumerate(ss): | |
if b == "(": | |
stack.append(i) | |
if b == ")": | |
pairs.append((stack.pop(), i)) | |
if len(stack) > 0: | |
print("INVALID STRUCTURE, BRACKETS NOT BALANCED!") | |
return None | |
return (ss, pairs) | |
def pair_check(tup): | |
if tup in [('A', 'U'), ('U', 'A'), ('C', 'G'), ('G', 'C')]: | |
return True | |
return False | |
def OPT(i,j, sequence): | |
""" | |
returns the score of the optimal pairing between indices i and j | |
""" | |
#base case: no pairs allowed when i and j are less than 4 bases apart | |
if i >= j-min_loop_length: | |
return 0 | |
else: | |
#i and j can either be paired or not be paired, if not paired then the optimal score is OPT(i,j-1) | |
unpaired = OPT(i, j-1, sequence) | |
#check if j can be involved in a pairing with a position t | |
pairing = [1 + OPT(i, t-1, sequence) + OPT(t+1, j-1, sequence) for t in range(i, j-4)\ | |
if pair_check((sequence[t], sequence[j]))] | |
if not pairing: | |
pairing = [0] | |
paired = max(pairing) | |
return max(unpaired, paired) | |
def traceback(i, j, structure, DP, sequence): | |
#in this case we've gone through the whole sequence. Nothing to do. | |
if j <= i: | |
return | |
#if j is unpaired, there will be no change in score when we take it out, so we just recurse to the next index | |
elif DP[i][j] == DP[i][j-1]: | |
traceback(i, j-1, structure, DP, sequence) | |
#consider cases where j forms a pair. | |
else: | |
#try pairing j with a matching index k to its left. | |
for k in [b for b in range(i, j-min_loop_length) if pair_check((sequence[b], sequence[j]))]: | |
#if the score at i,j is the result of adding 1 from pairing (j,k) and whatever score | |
#comes from the substructure to its left (i, k-1) and to its right (k+1, j-1) | |
if k-1 < 0: | |
if DP[i][j] == DP[k+1][j-1] + 1: | |
structure.append((k,j)) | |
traceback(k+1, j-1, structure, DP, sequence) | |
break | |
elif DP[i][j] == DP[i][k-1] + DP[k+1][j-1] + 1: | |
#add the pair (j,k) to our list of pairs | |
structure.append((k,j)) | |
#move the recursion to the two substructures formed by this pairing | |
traceback(i, k-1, structure, DP, sequence) | |
traceback(k+1, j-1, structure, DP, sequence) | |
break | |
def write_structure(sequence, structure): | |
dot_bracket = ["." for _ in range(len(sequence))] | |
for s in structure: | |
dot_bracket[min(s)] = "(" | |
dot_bracket[max(s)] = ")" | |
return "".join(dot_bracket) | |
#initialize matrix with zeros where can't have pairings | |
def initialize(N): | |
#NxN matrix that stores the scores of the optimal pairings. | |
DP = np.empty((N,N)) | |
DP[:] = np.NAN | |
for k in range(0, min_loop_length): | |
for i in range(N-k): | |
j = i + k | |
DP[i][j] = 0 | |
return DP | |
def nussinov(sequence): | |
N = len(sequence) | |
DP = initialize(N) | |
structure = [] | |
#fill the DP matrix diagonally | |
for k in range(min_loop_length, N): | |
for i in range(N-k): | |
j = i + k | |
DP[i][j] = OPT(i,j, sequence) | |
#copy values to lower triangle to avoid null references | |
for i in range(N): | |
for j in range(0, i): | |
DP[i][j] = DP[j][i] | |
traceback(0,N-1, structure, DP, sequence) | |
return (write_structure(sequence, structure), structure) | |
if __name__ == "__main__": | |
print(nussinov("AACCCGAGAGACCAA")) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment