Created
April 15, 2022 18:43
-
-
Save dwinston/fa74bb9dccb3a11fa5e7e8b6a4f57089 to your computer and use it in GitHub Desktop.
build a subsumption map scoped to ENVO terms in use by NMDC biosamples
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Build a subsumption map scoped to ENVO terms in use by NMDC biosamples.
"""
import json
from collections import defaultdict

from rdflib import Graph
from rdflib.namespace import OWL, RDFS, Namespace
from tqdm import tqdm

from nmdc_runtime.site.repository import run_config_frozen__normal_env
from nmdc_runtime.site.resources import get_mongo
# Load the ENVO ontology into an in-memory RDF graph. The file is fetched
# over the network, hence the progress messages around the call.
g = Graph()
print("fetching and loading envo.owl...")
g.parse("http://purl.obolibrary.org/obo/envo.owl", format="xml")
print("done loading envo.owl")

# Dotted paths of the biosample fields whose distinct values are collected.
nmdc_envo_fields = {
    "env_broad_scale.has_raw_value",
    "env_local_scale.has_raw_value",
    "env_medium.has_raw_value",
}

mongo = get_mongo(run_config_frozen__normal_env)
mdb = mongo.db

# Union of the distinct values found under each of the fields above.
nmdc_envo_terms = set()
for field in nmdc_envo_fields:
    nmdc_envo_terms |= set(mdb.biosample_set.distinct(field))
print(len(nmdc_envo_terms))  # 36

# Namespace bound to the "obo" prefix in the SPARQL queries issued below.
OBO = Namespace("http://purl.obolibrary.org/obo/")
def supers_for(term: str) -> set:
    """Return the classes that `term` is a direct or transitive subclass of.

    Runs a SPARQL property-path query (``rdfs:subClassOf+``) against the
    module-level graph ``g`` and keeps only results typed ``owl:Class``.

    Args:
        term: The subject of the query, written exactly as it should appear
            in SPARQL (e.g. a prefixed name such as the ones produced by
            ``nmdc_to_obo``). It is interpolated into the query text
            verbatim, so it must come from a trusted source.

    Returns:
        A set with the N3 rendering of each matching class, abbreviated
        using the prefixes known to ``g.namespace_manager``.
    """
    query = f"""
        SELECT DISTINCT ?o
        WHERE {{
            {term} rdfs:subClassOf+ ?o .
            ?o a owl:Class .
        }}
    """
    # Supplying initNs replaces the graph's own prefix bindings for this
    # query, so every prefix the query text uses is listed here explicitly
    # instead of relying on whatever rdflib happens to bind by default.
    prefixes = {"obo": OBO, "rdfs": RDFS, "owl": OWL}
    qres = g.query(query, initNs=prefixes)
    return {row.o.n3(g.namespace_manager) for row in qres}
def nmdc_to_obo(t: str) -> str:
    """Convert a colon-separated term id into an ``obo:``-prefixed name.

    Every ``:`` in `t` becomes ``_`` and the result is prefixed with
    ``obo:``, e.g. ``"ENVO:00000446"`` -> ``"obo:ENVO_00000446"``.

    Args:
        t: Term id in ``PREFIX:LOCALID`` form.

    Returns:
        The prefixed name, suitable for use in a SPARQL query that binds
        the ``obo`` prefix.
    """
    return f"obo:{t.replace(':', '_')}"
def obo_to_nmdc(t: str) -> str:
    """Convert an ``obo:``-prefixed name back into a colon-separated term id.

    The first four characters (the ``obo:`` prefix) are dropped, then the
    last ``_`` of what remains is turned into ``:``, e.g.
    ``"obo:ENVO_00000446"`` -> ``"ENVO:00000446"``.

    Only the last underscore is treated as the separator: an ID space may
    itself contain an underscore (e.g. ``NCBITaxon_Union``), and replacing
    every underscore would yield an id that ``nmdc_to_obo`` cannot map back.

    Args:
        t: Prefixed name. The first four characters are removed
            unconditionally, whatever they are.

    Returns:
        The term id; if nothing after the prefix contains ``_``, the
        remainder is returned unchanged.
    """
    remainder = t[4:]
    id_space, separator, local_id = remainder.rpartition("_")
    if not separator:
        return remainder
    return f"{id_space}:{local_id}"
# For each term in use, the set of its superclasses converted back to
# colon-separated ids.
term_supers = defaultdict(set)
# takes ~30 seconds in total
for term in tqdm(nmdc_envo_terms):
    term_supers[term] = {obo_to_nmdc(t) for t in supers_for(nmdc_to_obo(term))}

# Invert the mapping: superclass id -> the in-use terms found beneath it.
term_subs = defaultdict(set)
for sub, supers in term_supers.items():
    for sup in supers:
        term_subs[sup].add(sub)

# Keep only superclass ids that contain ":" after conversion. The subterms
# are sorted so the JSON written below is identical from run to run
# (iterating a set of strings has no stable order).
term_subs = {
    sup: sorted(subs)
    for sup, subs in term_subs.items()
    if subs and ":" in sup
}
print(len(term_subs))  # 68

with open("envo_term_subterms.json", "w") as f:
    json.dump(term_subs, f, indent=4)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment