Skip to content

Instantly share code, notes, and snippets.

@f6v
Created May 27, 2020 08:05
Show Gist options
  • Save f6v/3ac92667233a14aa071f264d8e13903b to your computer and use it in GitHub Desktop.
Save f6v/3ac92667233a14aa071f264d8e13903b to your computer and use it in GitHub Desktop.
from collections import defaultdict
from Bio import SeqIO
def overlap(kmer_left, kmer_right, overlap_size):
return kmer_left[-overlap_size:] == kmer_right[:overlap_size]
def maximal_overlap(kmer_left, kmer_right):
overlap_sizes = []
for overlap_size in range(1, len(kmer_right) + 1):
has_overlap = overlap(kmer_left, kmer_right, overlap_size)
if has_overlap:
overlap_sizes.append(overlap_size)
if len(overlap_sizes) >= 1:
return max(overlap_sizes)
return 0
def overlap_graph(reads, min_overlap):
graph = defaultdict(list)
for read in reads:
for other_read in reads:
if read != other_read and maximal_overlap(read, other_read) >= min_overlap:
graph[read].append(other_read)
result = {}
for item in graph.items():
result[item[0]] = item[1]
return result
def find_start_node(graph, reads):
in_counts = dict.fromkeys(reads, 0)
for from_node in graph.keys():
for to_node in graph[from_node]:
in_counts[to_node] += 1
for node, in_count in in_counts.items():
if in_count == 0:
return node
return None
def get_genome_path(graph, start_node):
path = [start_node]
current_node = start_node
while current_node in graph:
new_node = graph[current_node][0]
path.append(new_node)
current_node = new_node
return path
def reconstruct(genome_path):
result = [genome_path[0]]
#overlap = maximal_overlap(genome_path[0], genome_path[1])
for i in range(0, len(genome_path) - 1):
overlap = maximal_overlap(genome_path[i], genome_path[i + 1])
result.append(genome_path[i+1][overlap:])
return ''.join(result)
def shortestSuperstring(file_name):
sequences = SeqIO.parse(file_name, 'fasta')
reads = list(map(lambda seq_rec: str(seq_rec.seq), sequences))
graph = overlap_graph(reads, len(reads[0]) // 2 + 1)
start_node = find_start_node(graph, reads)
genome_path = get_genome_path(graph, start_node)
return reconstruct(genome_path)
print(shortestSuperstring('data_shortest_superstring.fna'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment