Skip to content

Instantly share code, notes, and snippets.

@Stfort52
Created August 22, 2024 07:31
Show Gist options
  • Save Stfort52/a67a6d4f887b6704f5058ec6cdc88320 to your computer and use it in GitHub Desktop.
Save Stfort52/a67a6d4f887b6704f5058ec6cdc88320 to your computer and use it in GitHub Desktop.
from typing import Literal
import pandas as pd
def get_ensembl_mappings(
    genes: list[str],
    organism: str = "hsa",
    server: Literal["www", "eu", "uswest", "asia", "gprofiler"] = "asia",
) -> pd.DataFrame:
    """Look up gene names for a list of gene identifiers.

    The lookup is delegated to one of two backends depending on *server*:
    an Ensembl BioMart host (``"www"``, ``"eu"``, ``"uswest"``, ``"asia"``)
    or the g:Profiler conversion API (``"gprofiler"``).

    Args:
        genes: Gene identifiers to look up.
        organism: Organism key understood by the backends (``"hsa"`` or
            ``"mmu"``).
        server: Which backend / BioMart host to query. ``"eu"`` is sent to
            the ``"www"`` host.

    Returns:
        The DataFrame produced by the selected backend, indexed by the
        queried identifier.

    Raises:
        ValueError: If *server* is not one of the supported values.
    """
    if server == "gprofiler":
        return _query_gprofiler(genes, organism)

    if server == "eu":
        # "eu" has no dedicated host here; it is served through "www".
        return _query_ensembl(genes, organism, "www")

    if server in ("www", "uswest", "asia"):
        return _query_ensembl(genes, organism, server)

    raise ValueError(f"Invalid server: {server}")
def _query_ensembl(genes: list[str], organism: str, server: str) -> pd.DataFrame:
    """Fetch gene names for Ensembl gene IDs from an Ensembl BioMart host.

    Args:
        genes: Ensembl gene IDs, passed as the ``ensembl_gene_id`` filter.
        organism: ``"hsa"`` or ``"mmu"``; selects the BioMart dataset.
        server: Host prefix used to build
            ``http://{server}.ensembl.org/biomart``.

    Returns:
        A DataFrame indexed by ``ensembl_gene_id`` with a single
        ``external_gene_name`` column. The frame is empty when the server
        returns no rows.

    Raises:
        KeyError: If *organism* is not a supported key.
    """
    # Local imports: biomart is only needed on this code path, and StringIO
    # is not imported at module level.
    from io import StringIO

    import biomart

    ORGANISM_KV: dict[Literal["mmu", "hsa"], str] = {
        "mmu": "mmusculus_gene_ensembl",
        "hsa": "hsapiens_gene_ensembl",
    }
    columns = ["ensembl_gene_id", "external_gene_name"]

    mart_url = f"http://{server}.ensembl.org/biomart"
    mart = biomart.BiomartServer(mart_url)
    dataset = mart.datasets[ORGANISM_KV[organism]]
    response = dataset.search(
        {
            "filters": {"ensembl_gene_id": genes},
            "attributes": ["ensembl_gene_id", "external_gene_name"],
        }
    )

    try:
        # The response body is header-less, tab-separated text.
        df = pd.read_csv(StringIO(response.text), sep="\t", header=None)
    except pd.errors.EmptyDataError:
        # An empty body (no rows returned) is a valid "no results" answer,
        # not a parse failure.
        return pd.DataFrame(columns=columns).set_index("ensembl_gene_id")

    df.columns = columns
    return df.set_index("ensembl_gene_id")
def _query_gprofiler(genes: list[str], organism: str) -> pd.DataFrame:
    """Fetch gene names for identifiers via the g:Profiler convert API.

    Args:
        genes: Identifiers sent as the ``query`` of the conversion request.
        organism: ``"hsa"`` or ``"mmu"``; mapped to the g:Profiler organism
            name.

    Returns:
        A DataFrame indexed by ``incoming`` (the identifier as submitted)
        with a single ``name`` column, with duplicate rows removed. The
        frame is empty when the API returns no results.

    Raises:
        KeyError: If *organism* is not a supported key, or the response
            JSON has no ``"result"`` entry.
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the server does not respond in time.
    """
    import requests

    ORGANISM_KV: dict[Literal["mmu", "hsa"], str] = {
        "mmu": "mmusculus",
        "hsa": "hsapiens",
    }
    url = "https://biit.cs.ut.ee/gprofiler/api/convert/convert/"
    response = requests.post(
        url,
        json={
            "query": genes,
            "target": "ENTREZGENE",
            "organism": ORGANISM_KV[organism],
        },
        # Without a timeout, requests can block forever on a stalled server.
        timeout=60,
    )
    # Fail with the HTTP error itself rather than a confusing KeyError or
    # JSON decoding error further down.
    response.raise_for_status()

    records = response.json()["result"]
    # Selecting the columns in the constructor keeps the frame well-formed
    # even when the result list is empty.
    df = pd.DataFrame(records, columns=["incoming", "name"])
    df = df.drop_duplicates()
    return df.set_index("incoming")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment