Created
May 27, 2020 08:05
-
-
Save f6v/3ac92667233a14aa071f264d8e13903b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict | |
from Bio import SeqIO | |
def overlap(kmer_left, kmer_right, overlap_size): | |
return kmer_left[-overlap_size:] == kmer_right[:overlap_size] | |
def maximal_overlap(kmer_left, kmer_right): | |
overlap_sizes = [] | |
for overlap_size in range(1, len(kmer_right) + 1): | |
has_overlap = overlap(kmer_left, kmer_right, overlap_size) | |
if has_overlap: | |
overlap_sizes.append(overlap_size) | |
if len(overlap_sizes) >= 1: | |
return max(overlap_sizes) | |
return 0 | |
def overlap_graph(reads, min_overlap): | |
graph = defaultdict(list) | |
for read in reads: | |
for other_read in reads: | |
if read != other_read and maximal_overlap(read, other_read) >= min_overlap: | |
graph[read].append(other_read) | |
result = {} | |
for item in graph.items(): | |
result[item[0]] = item[1] | |
return result | |
def find_start_node(graph, reads): | |
in_counts = dict.fromkeys(reads, 0) | |
for from_node in graph.keys(): | |
for to_node in graph[from_node]: | |
in_counts[to_node] += 1 | |
for node, in_count in in_counts.items(): | |
if in_count == 0: | |
return node | |
return None | |
def get_genome_path(graph, start_node): | |
path = [start_node] | |
current_node = start_node | |
while current_node in graph: | |
new_node = graph[current_node][0] | |
path.append(new_node) | |
current_node = new_node | |
return path | |
def reconstruct(genome_path): | |
result = [genome_path[0]] | |
#overlap = maximal_overlap(genome_path[0], genome_path[1]) | |
for i in range(0, len(genome_path) - 1): | |
overlap = maximal_overlap(genome_path[i], genome_path[i + 1]) | |
result.append(genome_path[i+1][overlap:]) | |
return ''.join(result) | |
def shortestSuperstring(file_name): | |
sequences = SeqIO.parse(file_name, 'fasta') | |
reads = list(map(lambda seq_rec: str(seq_rec.seq), sequences)) | |
graph = overlap_graph(reads, len(reads[0]) // 2 + 1) | |
start_node = find_start_node(graph, reads) | |
genome_path = get_genome_path(graph, start_node) | |
return reconstruct(genome_path) | |
print(shortestSuperstring('data_shortest_superstring.fna')) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment