Skip to content

Instantly share code, notes, and snippets.

@y1zhou
Created November 9, 2020 21:17
Show Gist options
  • Save y1zhou/b5e9acaa76db8d34364f1cf661bf947e to your computer and use it in GitHub Desktop.
Save y1zhou/b5e9acaa76db8d34364f1cf661bf947e to your computer and use it in GitHub Desktop.
Yeast gene symbols to human homologs
import numpy as np
import pandas as pd
import requests
# Genes of interest: read the input spreadsheet and, for each yeast species
# column, collect the entries that are not missing.
df = pd.read_excel("yeast_genes.xlsx")
pom_col, cer_col = "Schizosaccharomyces pombe", "Saccharomyces cerevisiae"
pom_genes = df.loc[df[pom_col].notnull(), pom_col].tolist()
cer_genes = df.loc[df[cer_col].notnull(), cer_col].tolist()
# SGD data: download the headerless chromosomal feature table from the SGD
# archive.
sgd_features = pd.read_table(
    "http://sgd-archive.yeastgenome.org/curation/chromosomal_feature/SGD_features.tab",
    header=None,
)
# Only the columns at positions 0 and 4 are kept; they are labelled as the
# SGD identifier and the gene symbol respectively.
sgd_subset = sgd_features.iloc[:, [0, 4]]
sgd_subset.columns = ["SGDID", "Symbol"]
# Restrict the table to the S. cerevisiae genes of interest.
is_wanted = sgd_subset["Symbol"].isin(cer_genes)
cer_map = sgd_subset[is_wanted].reset_index(drop=True)
# Get human homologs from Alliance of Genome Resources.
# One GET request is issued per SGD identifier in `cer_map`; the query asks
# for stringent matches restricted to the human taxon (NCBITaxon:9606).
payload = {
    "filter.stringency": "stringent",
    "taxonID": "NCBITaxon:9606",
}
res = []
# The session is used as a context manager so its pooled connections are
# released once every gene has been queried.
with requests.session() as ss:
    for sgd_id in cer_map["SGDID"]:
        r = ss.get(
            f"https://www.alliancegenome.org/api/gene/SGD:{sgd_id}/homologs",
            params=payload,
            # Without a timeout a stalled connection would block forever.
            timeout=60,
        )
        # Fail with a clear HTTP error instead of a KeyError / JSON decoding
        # error further down when the service answers with an error status.
        r.raise_for_status()
        # NOTE(review): only the "results" of this single response are read;
        # if the endpoint paginates, later pages are ignored - confirm.
        homologs = [
            x["homologGene"]["symbol"]
            for x in r.json()["results"]
            if x["homologGene"]["species"]["taxonId"] == "NCBITaxon:9606"
        ]
        res.append(homologs)
# `res` holds one list of human symbols per row of `cer_map`, in row order.
cer_map["HumanSymbol"] = res
cer_map = cer_map[["Symbol", "HumanSymbol"]]
# Fission yeast (Schizosaccharomyces pombe) data from PomBase.
# The headerless two-column table is labelled as systematic ID and human
# symbol(s).
pom_ortho = pd.read_table(
    "ftp://ftp.pombase.org/pombe/orthologs/human-orthologs.txt.gz",
    header=None,
)
pom_ortho.columns = ["Systematic", "HumanSymbol"]
# Split the "|"-separated symbols and give each symbol its own row.
pom_ortho["HumanSymbol"] = pom_ortho["HumanSymbol"].str.split("|")
pom_ortho = pom_ortho.explode("HumanSymbol")
# Drop rows whose symbol is missing or is the literal string 'NONE'.
has_symbol = pom_ortho["HumanSymbol"].notnull()
not_none_literal = pom_ortho["HumanSymbol"] != "NONE"
pom_ortho = pom_ortho[has_symbol & not_none_literal]
# PomBase identifier table: the first line of the file is skipped and the
# three columns are labelled by hand.
pom_names = pd.read_table(
    "ftp://ftp.pombase.org/pombe/names_and_identifiers/gene_IDs_names.tsv",
    skiprows=1,
    header=None,
)
pom_names.columns = ["Systematic", "Symbol", "Synonyms"]
pom_names = pom_names.drop(["Synonyms"], axis=1)
# Symbols are upper-cased before being matched against the genes of interest.
pom_names["Symbol"] = pom_names["Symbol"].str.upper()
pom_selected = pom_names[pom_names["Symbol"].isin(pom_genes)]
# Attach the human orthologs and collapse them into one list per gene.
pom_joined = pom_selected.merge(pom_ortho, on="Systematic")
pom_grouped = pom_joined.groupby(["Systematic", "Symbol"])["HumanSymbol"].apply(list)
pom_map = pom_grouped.reset_index(name="HumanSymbol")[["Symbol", "HumanSymbol"]]
# Merge results
def _as_list(value):
    """Return *value* unchanged if it is a list, otherwise an empty list.

    Rows left unmatched by the left merges below hold a missing-value marker
    instead of a list. A type check is used rather than an identity test
    against ``np.NaN``: that alias no longer exists in NumPy 2.0, and an
    identity test would miss ``None`` or any NaN object that is not the
    NumPy singleton.
    """
    return value if isinstance(value, list) else []


# Left-join both per-species homolog tables onto the input table. Each join
# contributes a "HumanSymbol" column, so pandas suffixes them with _x (from
# cer_map) and _y (from pom_map). The helper "Symbol" key column is dropped
# after each join.
res = (
    df.merge(cer_map, how="left", left_on="Saccharomyces cerevisiae", right_on="Symbol")
    .drop("Symbol", axis=1)
    .merge(pom_map, how="left", left_on="Schizosaccharomyces pombe", right_on="Symbol")
    .drop("Symbol", axis=1)
)
# Normalise unmatched rows to empty lists so the two columns can be
# concatenated element-wise.
res = res.assign(
    HumanSymbol_x=res["HumanSymbol_x"].apply(_as_list),
    HumanSymbol_y=res["HumanSymbol_y"].apply(_as_list),
)
res = res.assign(HumanSymbol=res["HumanSymbol_x"] + res["HumanSymbol_y"]).drop(
    ["HumanSymbol_x", "HumanSymbol_y"], axis=1
)
# De-duplicate and sort the combined symbols of each row.
res["HumanSymbol"] = res["HumanSymbol"].apply(lambda x: sorted(set(x)))
res.rename(columns={"HumanSymbol": "Homo sapiens"}).to_excel(
    "yeast_table_1.xlsx", index=False
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment