Last active
November 21, 2019 00:51
-
-
Save dwinter/d02c4b0ab75d619ef8efb5d9428505b0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict | |
from Bio import SeqIO | |
class Orthologs: | |
def __init__(self, ortho_file, seq_map_file): | |
"""Store information on orthologs from another of strains """ | |
self.ortholog_map = defaultdict(lambda: defaultdict(list)) | |
with open(ortho_file) as infile: | |
for line in infile: | |
og, gene_id, spp = line.split() | |
self.ortholog_map[og][spp].append(gene_id) | |
self.sequence_map = {} | |
with open(seq_map_file) as infile: | |
for line in infile: | |
spp, seq_file = line.split() | |
self.sequence_map[spp] = SeqIO.to_dict(SeqIO.parse(seq_file, "fasta")) | |
self.spp = self.sequence_map.keys() | |
self.orth = self.ortholog_map.keys() | |
def retrieve_seqs(self, ortho_id, spp_name = False): | |
"""Get sequences corresponding to a given ortho group. | |
Arguments: | |
ortho_id: the ID of a an ortho group present in this object. | |
spp_name: boolean, if True, replace the ID (and name and description) of | |
the sequence with the species name. | |
Returns: | |
A list of Biopython SeqRecords representing orthologs | |
""" | |
gene_id_map = self.ortholog_map[ortho_id] | |
seqs = [] | |
for sp in self.spp: | |
for gid in gene_id_map[sp]: | |
to_add = self.sequence_map[sp][gid] | |
if spp_name: | |
to_add.id = sp | |
to_add.description = "" | |
to_add.name = "" | |
seqs.append(to_add) | |
return(seqs) | |
def all_orthos(self, min_spp, include_paralogs = False): | |
"""Iterate over all ortholog groups | |
Arguments: | |
min_spp: minimmum number of species required for an | |
ortho group to be included. | |
include_paralogs: boolean, if True, allow ortho groups | |
that include paralogs (i.e. more than one gene | |
per species). If False skip such ortho groups | |
Returns: | |
This is a generator function, for use in for loops and simialr. | |
The generator yields tuple with two elements | |
1: the name of the ortho group being represented | |
2: a list of SeqRecords representing sequences in this ortho group | |
""" | |
for ortho_id in self.orth: | |
gene_id_map = self.ortholog_map[ortho_id] | |
if len(gene_id_map.keys()) < min_spp: | |
continue | |
if include_paralogs: | |
for seq_list in gene_id_map.values(): | |
if len(seq_list) > 1: | |
continue | |
# flatten a list of lists | |
yield( (ortho_id, self.retrieve_seqs(ortho_id, True) )) | |
O = Orthologs("ortho_subset.tsv", "seq_files.tsv") | |
O.retrieve_seqs('og_3922') | |
for ortho_name, seqs in O.all_orthos(4): | |
print("{} sequences from ortho group {}".format(ortho_name, seqs)) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment