import itertools
import logging
from collections import Counter

import pandas as pd
import pyham
from dendropy import Tree
from tqdm import tqdm
# fastoma_out must be set to the FastOMA output directory before running
treeFile = fastoma_out + '/species_tree_checked.nwk'
orthoxmlFile = fastoma_out + '/FastOMA_HOGs.orthoxml'
logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger("hog")
logger.setLevel(logging.INFO)
ham = pyham.Ham(treeFile, orthoxmlFile, use_internal_name=True)
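# optional sanity check (a minimal sketch): ham.top_level_hogs is the pyham
# mapping of root HOG ids to HOG objects that get_measures() iterates below
logger.info("loaded %d top-level HOGs", len(ham.top_level_hogs))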
# code from Alex and Stefano
def get_measures(ham, tree_path, fix_name_func=lambda x: x):
    # fix_name_func restores names that were altered (e.g. internal node
    # labels whose underscores were removed)
    # first part: collect (parent, child) label pairs for every branch of the tree
    t = Tree.get(path=tree_path, schema='newick')
    n = t.seed_node
    terminal_branches = []
    internal_branches = []
    for n1 in n.levelorder_iter():
        if n1.is_leaf():
            terminal_branches.append((fix_name_func(n1.parent_node.label), n1.taxon.label))
        else:
            if n1.parent_node is not None:
                internal_branches.append((fix_name_func(n1.parent_node.label), n1.label))
    # second part: per-HOG completeness relative to the HOG's root taxonomic level
    hog_qual = []
    for (hog_id, hog) in tqdm(ham.top_level_hogs.items()):
        x = hog.get_all_descendant_genes_clustered_by_species()
        n_sp_in_hog = len(x.keys())
        n_genes_in_hog = sum(1 for _ in itertools.chain.from_iterable(x.values()))
        n_sp_below_root_level = len(hog.genome.taxon.get_leaf_names())
        # fraction of species below the root level with at least one member
        q = n_sp_in_hog / n_sp_below_root_level
        hog_qual.append((hog_id, q, n_sp_below_root_level, n_sp_in_hog, n_genes_in_hog, hog.genome.name))
    df = pd.DataFrame(hog_qual, columns=['hog_id', 'qual', 'n_species_below_root', 'n_species_observed',
                                         'n_members', 'root_level'])
    # third part: count implied losses and duplication events per family
    implied_losses = Counter()
    duplication_events = Counter()
    stats = []
    for btype, b in tqdm(list(zip(itertools.repeat('terminal'), terminal_branches)) +
                         list(zip(itertools.repeat('internal'), internal_branches))):
        tail_genome = ham.get_ancestral_genome_by_name(b[0])
        try:
            head_genome = ham.get_ancestral_genome_by_name(b[1])
        except Exception:
            # the head of a terminal branch is an extant genome
            head_genome = ham.get_extant_genome_by_name(b[1])
        vmap = ham.compare_genomes_vertically(head_genome, tail_genome)
        # count the loss events per family
        for g in vmap.get_lost():
            fam_id = g.get_top_level_hog().hog_id
            implied_losses[fam_id] += 1
        # count the duplication events per family
        for (g_p, g_c) in vmap.get_duplicated().items():
            fam_id = g_p.get_top_level_hog().hog_id
            # a gene with k copies on the head side implies k - 1 duplication events
            duplication_events[fam_id] += (len(g_c) - 1)
        # also gather statistics about the size of each of the sets
        x = {'tail_node': b[0],
             'head_node': b[1],
             'retained_head': len(vmap.get_retained().values()),
             'retained_tail': len(vmap.get_retained().keys()),
             'gained': len(vmap.get_gained()),
             'duplicated_head': len(list(itertools.chain.from_iterable(vmap.get_duplicated().values()))),
             'duplicated_tail': len(vmap.get_duplicated().keys()),
             'lost': len(vmap.get_lost()),
             'tail_genome_size': len(tail_genome.genes),
             'head_genome_size': len(head_genome.genes),
             'branch_type': btype}
        stats.append(x)
    branch_stats = pd.DataFrame(stats)
    df['implied_losses'] = df['hog_id'].apply(lambda x: implied_losses[x])
    df['duplication_events'] = df['hog_id'].apply(lambda x: duplication_events[x])
    df['norm_losses'] = df['implied_losses'] / df['n_members']
    df['norm_loss_events'] = df['implied_losses'] / df['n_species_below_root']
    # losses + duplications
    df['n_events'] = df['implied_losses'] + df['duplication_events']
    return branch_stats, df
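
# Example invocation (a minimal sketch): the fix_name_func shown is an
# assumption for trees whose internal labels had underscores replaced by
# spaces; adjust or drop it for your data. Output filenames are illustrative.
branch_stats, hog_df = get_measures(ham, treeFile, fix_name_func=lambda x: x.replace('_', ' '))
branch_stats.to_csv('branch_stats.tsv', sep='\t', index=False)
hog_df.to_csv('hog_quality.tsv', sep='\t', index=False)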