Created
January 8, 2021 09:40
-
-
Save chrisjurich/c77a55c5b7bf7380b71fb0ef3a7f7ef2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from enum import Enum | |
from abc import ABC, abstractmethod | |
from multipledispatch import dispatch | |
class MotifType(Enum): | |
SINGLESTRAND = 0 | |
HELIX = 1 | |
HAIRPIN = 2 | |
JUNCTION = 3 | |
TYPE_MAPPER = { | |
MotifType.SINGLESTRAND : "SingleStrand", | |
MotifType.HELIX : "Helix", | |
MotifType.HAIRPIN : "Hairpin", | |
MotifType.JUNCTION : "Junction", | |
} | |
def connectivity_list(structure): | |
connections, pairs = [-1]*len(structure), [] | |
for index, db in enumerate(structure): | |
if db == '(': | |
pairs.append(index) | |
elif db == ')': | |
complement = pairs.pop() | |
connections[complement] = index | |
connections[index] = complement | |
assert len(pairs) == 0 | |
return connections | |
class Motif(ABC): | |
def __init__(self, **kwargs): | |
self.type_ = None | |
self.parent_ = None | |
self.sequence_ = str() | |
self.children_ = [] | |
self.strands_ = [] | |
self.depth = int() | |
self.structure = str() | |
self.token = str() | |
if 'sequence' in kwargs: | |
self.sequence_ = kwargs['sequence'] | |
if 'strands_' in kwargs: | |
self.strands_ = kwargs['strands'] | |
for variable, value in kwargs.items(): | |
setattr(self, variable, value) | |
if len(self.sequence_) == 0: | |
return | |
strand_seqs = [] | |
for strand in self.strands: | |
strand_seqs.append(''.join([self.sequence_[index] for index in strand])) | |
self.sequence_ = '&'.join(strand_seqs) | |
def link_children_(self, depth): | |
if self.type() == MotifType.SINGLESTRAND: | |
depth = 0 | |
self.depth = depth | |
for child in self.children(): | |
child.parent(self) | |
for child in self.children(): | |
child.link_children_(depth+1) | |
def str(self): | |
depth = '\t'*self.depth | |
if len(self.children_ ) == 0: | |
return f"{depth}{self.token} {self.structure} {self.sequence_}" | |
else: | |
children_ = '\n'.join([""] + [child.str() for child in self.children_]) | |
return f"{depth}{self.token} {self.structure} {self.sequence_}{children_}" | |
def __str__(self): | |
return self.str() | |
def __repr__(self): | |
return self.str() | |
# def __eq__(self, other): | |
def type(self): | |
return self.type_ | |
def children(self): | |
return self.children_ | |
def parent(self, other=None): | |
if other is not None: | |
self.parent_ = other | |
else: | |
return self.parent_ | |
@abstractmethod | |
def buffer(self): | |
pass | |
def has_children(self): | |
return len(self.children_) > 0 | |
class SingleStrand(Motif): | |
def __init__(self, **kwargs): | |
super().__init__( **kwargs) | |
self.type_ = MotifType.SINGLESTRAND | |
self.structure = '.'*len(self.sequence_) | |
self.token = f"SingleStrand{len(self.sequence_)}" | |
def buffer(self): | |
return -1 | |
class Helix(Motif): | |
def __init__(self, **kwargs): | |
super().__init__( **kwargs) | |
self.type_ = MotifType.HELIX | |
self.size_ = (len(self.sequence_) - 1) // 2 | |
self.structure = f'{"("*self.size_}&{")"*self.size_}' | |
self.token = f"Helix{self.size_}" | |
self.pairs_ = [] | |
for index in range(self.size_): | |
self.pairs_.append( | |
self.sequence_[index] + self.sequence_[-index-1] | |
) | |
def buffer(self): | |
return self.size_ | |
def pairs(self): | |
return self.pairs_ | |
class Hairpin(Motif): | |
def __init__(self, **kwargs): | |
super().__init__( **kwargs) | |
self.type_ = MotifType.HAIRPIN | |
size = len(self.sequence_) - 2 | |
self.structure = '(' + '.'*size + ')' | |
self.token = f"Hairpin{size}" | |
def buffer(self): | |
return self.parent().buffer() | |
class Junction(Motif): | |
def __init__(self, **kwargs): | |
super().__init__( **kwargs) | |
self.type_ = MotifType.JUNCTION | |
self.gaps_ = [len(strand) -2 for strands in self.strands_] | |
self.structure = ['.']*len(self.sequence_) | |
for index, nt in enumerate(self.sequence_): | |
if nt == '&': | |
self.structure[index-1] = '(' | |
self.structure[index] = '&' | |
self.structure[index+1] = ')' | |
self.structure[0] = '(' | |
self.structure[-1] = ')' | |
self.structure = "".join(self.structure) | |
self.token = "Junction_" + '|'.join([str(len(strand)-2) for strand in self.strands_]) | |
def buffer(self): | |
buffers = [self.parent.buffer()] | |
for child in self.children: | |
buffers.append(child.buffer()) | |
return buffers | |
def gaps(self): | |
return self.gaps_ | |
def helix_length_(connections, start): | |
complement = connections[start] | |
length = 0 | |
while start + length == connections[connections[start+length]]: | |
length += 1 | |
return length | |
def get_junction_or_helix_(sequence, connections, start): | |
strands = [] | |
offset = 1 | |
it = start | |
while True: | |
next = [it] | |
it += 1 | |
while connections[it] == -1: | |
next.append(it) | |
it += 1 | |
next.append(it) | |
strands.append(next) | |
it = connections[it] | |
if it == start: | |
break | |
if len(strands) > 1: | |
# JUNCTION | |
junction = Junction(strands=strands, sequence=sequence) | |
for strand in strands[:-1]: | |
junction.children_.append(get_motifs_(sequence, connections, strand[-1])) | |
return junction | |
else: | |
hp = Hairpin(sequence=sequence, strands=strands) | |
return hp | |
def get_helix_(sequence, connections, start): | |
# figure out how big this current helix i | |
helix_len = helix_length_(connections, start) | |
lhs, rhs = [], [] | |
for index in range(start, start + helix_len): | |
lhs.append(index) | |
rhs.append(connections[index]) | |
rhs.reverse() | |
helix = Helix(sequence=sequence, strands = [lhs, rhs] ) | |
# figure out what is at the other end of the junction | |
helix.children_.append(get_junction_or_helix_(sequence, connections, start+helix_len-1)) | |
# basically need to check if already inside of a junction or need to go out to a new singlestrand | |
# the singlestrand is only added if the singlestrand ends out the structure or the next helix | |
# is not part of a junction | |
it = rhs[-1] + 1 | |
while it < len(connections) and connections[it] == -1: | |
it += 1 | |
if it == len(connections) or connections[it] > rhs[-1]: | |
helix.children_.append(get_motifs_(sequence, connections, rhs[-1] + 1)) | |
return helix | |
def get_singlestrand_(sequence, connections, start): | |
offset = 0 | |
while start + offset < len(connections) and connections[start+offset] == -1: | |
offset += 1 | |
strand = list(range(start,start+offset)) | |
singlestrand = SingleStrand(sequence=sequence, strands=[strand]) | |
# checking here if there is an outer nt | |
if start + offset < len(connections): | |
singlestrand.children_.append(get_motifs_(sequence, connections, start+offset)) | |
return singlestrand | |
def get_motifs_(sequence, connections, start): | |
if connections[start] < 0: | |
return get_singlestrand_(sequence, connections, start) | |
return get_helix_(sequence, connections, start) | |
def parse_to_motifs(structure, sequence): | |
connections = connectivity_list(structure) | |
root = get_motifs_(sequence, connections, 0) | |
root.link_children_(0) | |
return root | |
if __name__ == "__main__": | |
ss="...(((.((((....)))).)))..(((.((((....)))).)))..(((.((((....)))).)))..(((.((((....)))).)))..(((((((....)))))))....................." | |
seq="GGAACCGAGACUUCGGUCUGGGUAAACCGUGACUUCGGUCAGGGUAAACCGCGACUUCGGUCGGGGUAAACCGGGACUUCGGUCCGGGUAAACUGUUUUUCGAAACAGUAAAAGAAACAACAACAACAAC" | |
m = parse_to_motifs(ss, seq) | |
print(m) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment