Skip to content

Instantly share code, notes, and snippets.

@daler
Last active November 29, 2024 23:01
Show Gist options
  • Save daler/16d45166b7cdeb3df687d4c9d0681a64 to your computer and use it in GitHub Desktop.
Save daler/16d45166b7cdeb3df687d4c9d0681a64 to your computer and use it in GitHub Desktop.
Parse orthoXML into TSV files
#!/usr/bin/env python
"""
Expand orthogroups from an orthoXML file into TSV, and then only select the
easy cases where there is exactly one gene from each species. The full TSV is
output as well.
Currently assumes *pairwise* files.
Output will have suffixes _full.tsv and _pair.tsv, with full and only genes in
orthogroups containing one gene from each species.
"""
import xml.etree.ElementTree as ET
import pandas as pd
import argparse
ap = argparse.ArgumentParser(usage=__doc__)
ap.add_argument(
"xml",
help="XML file downloaded from https://inparanoid8.sbc.su.se/download/8.0_current/Orthologs_OrthoXML/. Output files will be placed next to this file, with appropriate suffixes.",
)
args = ap.parse_args()
# Find your file of interest from
# https://inparanoid8.sbc.su.se/download/8.0_current/Orthologs_OrthoXML/
xml_filename = args.xml
tree = ET.parse(xml_filename)
root = tree.getroot()
# When searching within XPath, we need to add the namespace. This makes it
# slightly more convenient to do so within f-strings.
NS = "{http://orthoXML.org/2011/}"
# First part of XML has a section for each species, giving a gene ID to each
# gene. IDs are unique across species. This keeps track of them in a dictionary
# we'll use later.
gene_lookup = {}
for species in root.findall(f".//{NS}species"):
name = species.attrib["name"].strip()
genes = species.findall(f".//{NS}gene")
for gene in genes:
gene_lookup[gene.attrib["id"]] = {
"protId": gene.attrib["protId"],
"geneId": gene.attrib["geneId"],
"species": name,
}
# The next part of the XML has orthogroups. Each group has two or more genes in
# it. Technically the group can contain genes from a single species; these are
# paralogs. Not sure if those make it into this particular XML.
#
# Here, we iterate through the groups, and sort of "unroll" them into lines
# containing duplicate info.
#
# That is, this XML item:
#
# <orthologGroup id="1">
# <score id="bit" value="7474"/>
# <geneRef id="1">
# <score id="inparalog" value="1"/>
# <score id="bootstrap" value="1"/>
# </geneRef>
# <geneRef id="2">
# <score id="inparalog" value="1"/>
# <score id="bootstrap" value="1"/>
# </geneRef>
# </orthologGroup>
#
# Becomes the following two dictionaries where we add the species, protein, and
# gene id from the lookup above:
#
# { 'protId': 'H9J4V7',
# 'geneId': 'uncharacterized_H9J4V7',
# 'species': 'Bombyx mori',
# 'orthogroup_id': '1',
# 'bit_score': 7474,
# 'inparalog_score': '1',
# 'bootstrap_score': '1',
# },
# { 'protId': 'D1YSG0',
# 'geneId': 'bt',
# 'species': 'Drosophila melanogaster',
# 'orthogroup_id': '1',
# 'bit_score': 7474,
# 'inparalog_score': '1',
# 'bootstrap_score': '1'
# }
#
# These are then added to a dataframe that can be grouped by orthogroup_id
# later.
# Keep dictionaries here to later convert to DataFrame.
lines = []
# Iterate through groups (one of which is shown above as an example).
for group in root.findall(f"{NS}groups/"):
orthogroup_id = group.attrib["id"]
# bit score is apparently for the group as a whole
bit_score = int(group.findall(f"./{NS}score")[0].attrib["value"])
# We'll make a line for each gene in the orthogroup, after adding gene info
# stored elsewhere in XML.
for gene_ref in group.findall(f"./{NS}geneRef"):
# Make a copy because we're about to attach additional info; don't want
# one gene in multiple orthology groups having changing information.
gene_info = gene_lookup[gene_ref.attrib["id"]].copy()
gene_info["orthogroup_id"] = orthogroup_id
gene_info["bit_score"] = bit_score
# Add scores to the line.
for score in gene_ref.findall(f"./{NS}score"):
gene_info[score.attrib["id"] + "_score"] = score.attrib["value"]
lines.append(gene_info)
full_df = pd.DataFrame(lines)
# Now group by orthogroup ID, and only keep ones where we have exactly one gene
# from each species. The resulting TSV will contain only the most confident
# orthologs between the species.
pairs = []
for oid, group in full_df.sort_values("orthogroup_id").groupby("orthogroup_id"):
if len(group) == 2 and (len(group["species"].unique()) == 2):
pair = {}
for _, line in group.iterrows():
pair[line["species"]] = line["geneId"]
pairs.append(pair)
pair_df = pd.DataFrame(pairs)
full_df.to_csv(xml_filename + "_full.tsv", sep="\t", index=False)
pair_df.to_csv(xml_filename + "_pair.tsv", sep="\t", index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment