Last active
August 29, 2015 14:21
-
-
Save jhcepas/39e214d6e85cd428688f to your computer and use it in GitHub Desktop.
Newick tree to OrthoXML converter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import sys | |
from ete2 import PhyloTree, orthoxml | |
import argparse | |
__DESCRIPTION__ = """ | |
etree2orthoxml is a python script that extracts evolutionary events | |
(speciation and duplication) from a newick tree and exports them as an | |
OrthoXML file. | |
## ######################################################################## | |
## | |
## YOU CAN FIND EXAMPLES AND DOCUMENTATION AT: | |
## | |
## http://pythonhosted.org/ete2/tutorial/tutorial_etree2orthoxml.html | |
## | |
## ######################################################################## | |
""" | |
SPECIES_NAME_DELIMITER = "" | |
SPECIES_NAME_POS = 0 | |
def extract_spname(node_name): | |
return node_name.split(SPECIES_NAME_DELIMITER)[SPECIES_NAME_POS] | |
def export_as_orthoXML(t, database, evoltype_attr="evoltype"): | |
""" This function takes a ETE PhyloTree instance and export all | |
its speciation and duplication events as OrthoXML format. | |
Note that this function can be enriched or customized to include | |
more orthoXML features using the orthoxml.py module. | |
""" | |
# Creates an empty orthoXML object | |
O = orthoxml.orthoXML() | |
#Generate the structure containing sequence information | |
leaf2id= {} | |
sp2genes = {} | |
for genid, leaf in enumerate(t.iter_leaves()): | |
spname=extract_spname(leaf.name) | |
if spname not in sp2genes: | |
sp = orthoxml.species(spname) | |
db = orthoxml.database(name=database) | |
genes = orthoxml.genes() | |
sp.add_database(db) | |
db.set_genes(genes) | |
sp2genes[spname] = genes | |
# add info to the orthoXML document | |
O.add_species(sp) | |
else: | |
genes = sp2genes[spname] | |
gn = orthoxml.gene(protId=leaf.name, id=genid) | |
leaf2id[leaf] = genid | |
genes.add_gene(gn) | |
# Add an ortho group container to the orthoXML document | |
ortho_groups = orthoxml.groups() | |
O.set_groups(ortho_groups) | |
# OrthoXML does not support duplication events to be at the root | |
# of the tree, so we search for the top most speciation events in | |
# the tree and export them as separate ortholog groups | |
is_speciation = lambda n: getattr(n, evoltype_attr, "") == "S" or not n.children | |
for speciation_root in t.iter_leaves(is_leaf_fn=is_speciation): | |
# Creates an orthogroup in which all events will be added | |
node2event = {} | |
node2event[speciation_root] = orthoxml.group() | |
ortho_groups.add_orthologGroup(node2event[speciation_root]) | |
# if root node is a leaf, just export an orphan sequence within the group | |
if speciation_root.is_leaf(): | |
node2event[speciation_root].add_geneRef(orthoxml.geneRef(leaf2id[speciation_root])) | |
# otherwise, descend the tree and export orthology structure | |
for node in speciation_root.traverse("preorder"): | |
if node.is_leaf(): | |
continue | |
parent_event = node2event[node] | |
for ch in node.children: | |
if ch.is_leaf(): | |
parent_event.add_geneRef(orthoxml.geneRef(leaf2id[ch])) | |
else: | |
node2event[ch] = orthoxml.group() | |
try: | |
evoltype = getattr(ch, evoltype_attr) | |
except AttributeError: | |
if not ch.is_leaf(): | |
print ch.features | |
raise AttributeError("\n\nUnknown evolutionary event. Please use [S] for speciation and [D] for duplication: %s" %ch.get_ascii()) | |
if evoltype == "D": | |
parent_event.add_paralogGroup(node2event[ch]) | |
elif evoltype == "S": | |
parent_event.add_orthologGroup(node2event[ch]) | |
O.export(sys.stdout, 0, namespace_="") | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser(description=__DESCRIPTION__, | |
formatter_class=argparse.RawDescriptionHelpFormatter) | |
parser.add_argument('tree', metavar='tree_file', type=str, nargs=1, | |
help='A tree file (or text string) in newick format.') | |
parser.add_argument("--sp_delimiter", dest="species_delimiter", | |
type=str, default="_", | |
help=("When species names are guessed from node names," | |
" this argument specifies how to split node name to guess" | |
" the species code")) | |
parser.add_argument("--sp_field", dest="species_field", | |
type=int, default=1, | |
help=("When species names are guessed from node names," | |
" this argument specifies the position of the species" | |
" name code relative to the name splitting delimiter")) | |
parser.add_argument("--root", dest="root", | |
type=str, nargs="*", | |
help="Roots the tree to the node grouping the list" | |
" of node names provided (space separated). In example:" | |
"'--root human rat mouse'") | |
parser.add_argument("--skip_ortholog_detection", dest="skip_ortholog_detection", | |
action="store_true", | |
help=("Skip automatic detection of" | |
" speciation and duplication events, thus relying in the" | |
" correct annotation of the provided tree using" | |
" the extended newick format (i.e. '((A, A)[&&NHX:evoltype=D], B)[&&NHX:evoltype=S];')")) | |
parser.add_argument("--evoltype_attr", dest="evoltype_attr", | |
type=str, default="evoltype", | |
help=("When orthology detection is disabled," | |
" the attribute name provided here will be expected to exist" | |
" in all internal nodes and read from the extended newick format")) | |
parser.add_argument("--database", dest="database", | |
type=str, default="", | |
help=("Database name")) | |
parser.add_argument("--show", dest="show", | |
action="store_true", default="", | |
help=("Show the tree and its evolutionary events before orthoXML export")) | |
parser.add_argument("--ascii", dest="ascii", | |
action="store_true", default="", | |
help=("Show the tree using ASCII representation and all its evolutionary" | |
" events before orthoXML export")) | |
parser.add_argument("--newick", dest="newick", | |
action="store_true", default="", | |
help=("print the extended newick format for provided tree using" | |
" ASCII representation and all its evolutionary events" | |
" before orthoXML export")) | |
args = parser.parse_args() | |
newick = args.tree[0] | |
SPECIES_NAME_POS = args.species_field | |
SPECIES_NAME_DELIMITER = args.species_delimiter | |
# load a phylomeDB Tree provided as a newick file in the command line | |
t = PhyloTree(newick, sp_naming_function=extract_spname) | |
if args.root: | |
if len(args.root) > 1: | |
outgroup = t.get_common_ancestor(args.root) | |
else: | |
outgroup = t & args.root[0] | |
t.set_outgroup(outgroup) | |
if not args.skip_ortholog_detection: | |
# detect speciation and duplication events using the species overlap | |
# algorithm used in phylomeDB | |
t.get_descendant_evol_events() | |
if args.ascii: | |
print t.get_ascii(attributes=[args.evoltype_attr, "name"], show_internal=True) | |
if args.newick: | |
print t.write(features=[args.evoltype_attr], format_root_node=True) | |
if args.show: | |
t.show() | |
export_as_orthoXML(t, args.database, args.evoltype_attr) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment