daler · November 29, 2024 23:01
diff --git a/parse_orthoxml.py b/parse_orthoxml.py
 #!/usr/bin/env python

 """
 Expand orthogroups from an orthoXML file into TSV, and then only select the
 easy cases where there is exactly one gene from each species. The full TSV is
 output as well.

 Currently assumes *pairwise* files.

 Output will have suffixes _full.tsv and _pair.tsv, with full and only genes in
 orthogroups containing one gene from each species.
 """

 import xml.etree.ElementTree as ET
 import pandas as pd
 import argparse

 ap = argparse.ArgumentParser(usage=__doc__)
 ap.add_argument(
    "xml",
    help="XML file downloaded from https://inparanoid8.sbc.su.se/download/8.0_current/Orthologs_OrthoXML/. Output files will be placed next to this file, with appropriate suffixes.",
 )
 args = ap.parse_args()

 # Find your file of interest from
 # https://inparanoid8.sbc.su.se/download/8.0_current/Orthologs_OrthoXML/
 xml_filename = args.xml
 tree = ET.parse(xml_filename)
 root = tree.getroot()

 # When searching within XPath, we need to add the namespace. This makes it
 # slightly more convenient to do so within f-strings.
 NS = "{http://orthoXML.org/2011/}"

 # First part of XML has a section for each species, giving a gene ID to each
 # gene. IDs are unique across species. This keeps track of them in a dictionary
 # we'll use later.
 gene_lookup = {}
 for species in root.findall(f".//{NS}species"):
    name = species.attrib["name"].strip()
    genes = species.findall(f".//{NS}gene")
    for gene in genes:
        gene_lookup[gene.attrib["id"]] = {
            "protId": gene.attrib["protId"],
            "geneId": gene.attrib["geneId"],
            "species": name,
        }

 # The next part of the XML has orthogroups. Each group has two or more genes in
 # it. Technically the group can contain genes from a single species; these are
 # paralogs. Not sure if those make it into this particular XML.
 #
 # Here, we iterate through the groups, and sort of "unroll" them into lines
 # containing duplicate info.
 #
 # That is, this XML item:
 #
 #     <orthologGroup id="1">
 #       <score id="bit" value="7474"/>
 #       <geneRef id="1">
 #         <score id="inparalog" value="1"/>
 #         <score id="bootstrap" value="1"/>
 #       </geneRef>
 #       <geneRef id="2">
 #         <score id="inparalog" value="1"/>
 #         <score id="bootstrap" value="1"/>
 #       </geneRef>
 #     </orthologGroup>
 #
 # Becomes the following two dictionaries where we add the species, protein, and
 # gene id from the lookup above:
 #
 # { 'protId': 'H9J4V7',
 #   'geneId': 'uncharacterized_H9J4V7',
 #   'species': 'Bombyx mori',
 #   'orthogroup_id': '1',
 #   'bit_score': 7474,
 #   'inparalog_score': '1',
 #   'bootstrap_score': '1',
 # },
 # { 'protId': 'D1YSG0',
 #   'geneId': 'bt',
 #   'species': 'Drosophila melanogaster',
 #   'orthogroup_id': '1',
 #   'bit_score': 7474,
 #   'inparalog_score': '1',
 #   'bootstrap_score': '1'
 # }
 #
 # These are then added to a dataframe that can be grouped by orthogroup_id
 # later.

 # Keep dictionaries here to later convert to DataFrame.
 lines = []

 # Iterate through groups (one of which is shown above as an example).
 for group in root.findall(f"{NS}groups/"):
    orthogroup_id = group.attrib["id"]

    # bit score is apparently for the group as a whole
    bit_score = int(group.findall(f"./{NS}score")[0].attrib["value"])

    # We'll make a line for each gene in the orthogroup, after adding gene info
    # stored elsewhere in XML.
    for gene_ref in group.findall(f"./{NS}geneRef"):

        # Make a copy because we're about to attach additional info; don't want
        # one gene in multiple orthology groups having changing information.
        gene_info = gene_lookup[gene_ref.attrib["id"]].copy()
        gene_info["orthogroup_id"] = orthogroup_id
        gene_info["bit_score"] = bit_score

        # Add scores to the line.
        for score in gene_ref.findall(f"./{NS}score"):
            gene_info[score.attrib["id"] + "_score"] = score.attrib["value"]

        lines.append(gene_info)

 full_df = pd.DataFrame(lines)

 # Now group by orthogroup ID, and only keep ones where we have exactly one gene
 # from each species. The resulting TSV will contain only the most confident
 # orthologs between the species.
 pairs = []
 for oid, group in full_df.sort_values("orthogroup_id").groupby("orthogroup_id"):
    if len(group) == 2 and (len(group["species"].unique()) == 2):
        pair = {}
        for _, line in group.iterrows():
            pair[line["species"]] = line["geneId"]
        pairs.append(pair)

 pair_df = pd.DataFrame(pairs)

 full_df.to_csv(xml_filename + "_full.tsv", sep="\t", index=False)
 pair_df.to_csv(xml_filename + "_pair.tsv", sep="\t", index=False)
	#!/usr/bin/env python

	"""
	Expand orthogroups from an orthoXML file into TSV, and then only select the
	easy cases where there is exactly one gene from each species. The full TSV is
	output as well.

	Currently assumes pairwise files.

	Output will have suffixes _full.tsv and _pair.tsv, with full and only genes in
	orthogroups containing one gene from each species.
	"""

	import xml.etree.ElementTree as ET
	import pandas as pd
	import argparse

	ap = argparse.ArgumentParser(usage=__doc__)
	ap.add_argument(
	"xml",
	help="XML file downloaded from https://inparanoid8.sbc.su.se/download/8.0_current/Orthologs_OrthoXML/. Output files will be placed next to this file, with appropriate suffixes.",
	)
	args = ap.parse_args()

	# Find your file of interest from
	# https://inparanoid8.sbc.su.se/download/8.0_current/Orthologs_OrthoXML/
	xml_filename = args.xml
	tree = ET.parse(xml_filename)
	root = tree.getroot()

	# When searching within XPath, we need to add the namespace. This makes it
	# slightly more convenient to do so within f-strings.
	NS = "{http://orthoXML.org/2011/}"

	# First part of XML has a section for each species, giving a gene ID to each
	# gene. IDs are unique across species. This keeps track of them in a dictionary
	# we'll use later.
	gene_lookup = {}
	for species in root.findall(f".//{NS}species"):
	name = species.attrib["name"].strip()
	genes = species.findall(f".//{NS}gene")
	for gene in genes:
	gene_lookup[gene.attrib["id"]] = {
	"protId": gene.attrib["protId"],
	"geneId": gene.attrib["geneId"],
	"species": name,
	}

	# The next part of the XML has orthogroups. Each group has two or more genes in
	# it. Technically the group can contain genes from a single species; these are
	# paralogs. Not sure if those make it into this particular XML.
	#
	# Here, we iterate through the groups, and sort of "unroll" them into lines
	# containing duplicate info.
	#
	# That is, this XML item:
	#
	# <orthologGroup id="1">
	# <score id="bit" value="7474"/>
	# <geneRef id="1">
	# <score id="inparalog" value="1"/>
	# <score id="bootstrap" value="1"/>
	# </geneRef>
	# <geneRef id="2">
	# <score id="inparalog" value="1"/>
	# <score id="bootstrap" value="1"/>
	# </geneRef>
	# </orthologGroup>
	#
	# Becomes the following two dictionaries where we add the species, protein, and
	# gene id from the lookup above:
	#
	# { 'protId': 'H9J4V7',
	# 'geneId': 'uncharacterized_H9J4V7',
	# 'species': 'Bombyx mori',
	# 'orthogroup_id': '1',
	# 'bit_score': 7474,
	# 'inparalog_score': '1',
	# 'bootstrap_score': '1',
	# },
	# { 'protId': 'D1YSG0',
	# 'geneId': 'bt',
	# 'species': 'Drosophila melanogaster',
	# 'orthogroup_id': '1',
	# 'bit_score': 7474,
	# 'inparalog_score': '1',
	# 'bootstrap_score': '1'
	# }
	#
	# These are then added to a dataframe that can be grouped by orthogroup_id
	# later.

	# Keep dictionaries here to later convert to DataFrame.
	lines = []

	# Iterate through groups (one of which is shown above as an example).
	for group in root.findall(f"{NS}groups/"):
	orthogroup_id = group.attrib["id"]

	# bit score is apparently for the group as a whole
	bit_score = int(group.findall(f"./{NS}score")[0].attrib["value"])

	# We'll make a line for each gene in the orthogroup, after adding gene info
	# stored elsewhere in XML.
	for gene_ref in group.findall(f"./{NS}geneRef"):

	# Make a copy because we're about to attach additional info; don't want
	# one gene in multiple orthology groups having changing information.
	gene_info = gene_lookup[gene_ref.attrib["id"]].copy()
	gene_info["orthogroup_id"] = orthogroup_id
	gene_info["bit_score"] = bit_score

	# Add scores to the line.
	for score in gene_ref.findall(f"./{NS}score"):
	gene_info[score.attrib["id"] + "_score"] = score.attrib["value"]

	lines.append(gene_info)

	full_df = pd.DataFrame(lines)

	# Now group by orthogroup ID, and only keep ones where we have exactly one gene
	# from each species. The resulting TSV will contain only the most confident
	# orthologs between the species.
	pairs = []
	for oid, group in full_df.sort_values("orthogroup_id").groupby("orthogroup_id"):
	if len(group) == 2 and (len(group["species"].unique()) == 2):
	pair = {}
	for _, line in group.iterrows():
	pair[line["species"]] = line["geneId"]
	pairs.append(pair)

	pair_df = pd.DataFrame(pairs)

	full_df.to_csv(xml_filename + "_full.tsv", sep="\t", index=False)
	pair_df.to_csv(xml_filename + "_pair.tsv", sep="\t", index=False)