Last active
March 1, 2021 16:53
-
-
Save Avantol13/a53593837503fede2881f315198247c4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Script to get initial metadata for BDCat studies from Gen3 Graph and by scraping dbGaP. | |
Available tags: | |
Program | |
- TOPMed | |
- COVID 19 | |
Study Registration | |
- dbGaP | |
- BIOLINCC | |
Disease Type | |
- Heart | |
- Lung | |
- Blood | |
Data Type | |
- DCC Harmonized | |
- Clinical phenotype | |
- Genotype | |
- Imaging | |
""" | |
import os | |
import requests | |
import logging | |
import sys | |
import csv | |
import copy | |
import asyncio | |
from datetime import date | |
from bs4 import BeautifulSoup | |
from gen3.submission import Gen3Submission | |
from gen3.auth import Gen3Auth | |
from gen3.tools import metadata | |
from gen3.tools.metadata.ingest_manifest import manifest_row_parsers | |
COMMONS = "https://preprod.gen3.biodatacatalyst.nhlbi.nih.gov/" | |
API_KEY_FILEPATH = "/home/buffy/Documents/bdcat_preprod_creds.json" | |
COMMONS_TO_SUBMIT_METADATA = "https://staging.gen3.biodatacatalyst.nhlbi.nih.gov/" | |
API_KEY_FILEPATH_FOR_METADATA = "/home/buffy/Documents/bdcat_staging_creds.json" | |
OUTPUT_FILE = "discovery_metadata.tsv" | |
DBGAP_WEBSITE = "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=" | |
logging.basicConfig(filename="generate_discovery_metadata.log", level=logging.DEBUG) | |
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) | |
def generate_discovery_metadata(): | |
auth = Gen3Auth(COMMONS, refresh_file=API_KEY_FILEPATH) | |
submission = Gen3Submission(COMMONS, auth_provider=auth) | |
print("getting currently submitted project/study data...") | |
query_txt = """ | |
{ | |
project(first:0) { | |
project_id | |
code | |
name | |
studies(first:0) { | |
study_id | |
dbgap_phs | |
dbgap_consent | |
dbgap_version | |
dbgap_accession | |
dbgap_consent_text | |
dbgap_participant_set | |
authz | |
full_name | |
short_name | |
study_description | |
_subjects_count | |
} | |
} | |
} | |
""" | |
raw_results = submission.query(query_txt).get("data", {}).get("project", []) | |
results = [] | |
fields = set() | |
for raw_result in raw_results: | |
studies = raw_result.get("studies") | |
study_data = {} | |
if len(studies) != 1: | |
logging.warning( | |
f"expect 1:1 project:study, got {studies} from {raw_result}" | |
) | |
else: | |
study_data = studies[0] | |
del raw_result["studies"] | |
result = copy.deepcopy(raw_result) | |
result.update(study_data) | |
if "authz" in result: | |
result["authz"] = str(result["authz"]).replace("'", '"') | |
result["tags"] = _determine_tags_from_study_info(result) | |
result["study_description"] = _get_study_description(result) | |
# don't include studies with no subjects for now, this effectively removes | |
# any projects that were created but have no data submitted | |
if result.get("_subjects_count"): | |
results.append(result) | |
fields = fields | set(result.keys()) | |
with open(OUTPUT_FILE, "w+", encoding="utf-8") as output_file: | |
logging.info(f"writing headers to {OUTPUT_FILE}: {fields}") | |
output_writer = csv.DictWriter( | |
output_file, | |
delimiter="\t", | |
fieldnames=fields, | |
extrasaction="ignore", | |
) | |
output_writer.writeheader() | |
for row in results: | |
output_writer.writerow(row) | |
output_file.close() | |
def submit_to_metadata_service(commons, api_key): | |
loop = asyncio.new_event_loop() | |
asyncio.set_event_loop(loop) | |
auth = Gen3Auth(commons, refresh_file=api_key) | |
# must provide a str to namespace the metadata from the file in a block in | |
# the metadata service | |
metadata_source = "gen3_discovery" | |
# override default unique ID parsing behavior | |
def _custom_get_guid_for_row(commons_url, row, lock): | |
""" | |
Given a row from the manifest, return the unique ID to use for the metadata object. | |
Args: | |
commons_url (str): root domain for commons where mds lives | |
row (dict): column_name:row_value | |
lock (asyncio.Semaphore): semaphones used to limit ammount of concurrent http | |
connections if making a call to an external service | |
Returns: | |
str: unique ID | |
""" | |
return row.get("name") | |
# override default unique ID parsing behavior | |
manifest_row_parsers["guid_for_row"] = _custom_get_guid_for_row | |
loop.run_until_complete( | |
metadata.async_ingest_metadata_manifest( | |
commons, | |
manifest_file=OUTPUT_FILE, | |
metadata_source=metadata_source, | |
auth=auth, | |
metadata_type="discovery_metadata" | |
) | |
) | |
def _get_study_description(study): | |
dbgap_phs = study.get("dbgap_phs", "") or "" | |
dbgap_version = study.get("dbgap_version", "") or "" | |
dbgap_participant_set = study.get("dbgap_participant_set", "") or "" | |
dbgap_study = f"{dbgap_phs}.{dbgap_version}.{dbgap_participant_set}" | |
study_description = study.get("study_description") | |
if dbgap_study != "..": | |
url = DBGAP_WEBSITE + dbgap_study | |
logging.debug(f"scraping {url}") | |
page = requests.get(url) | |
soup = BeautifulSoup(page.content, "html.parser") | |
report = soup.find("dl", class_="report") | |
if report: | |
study_description_start = report.find("dt") | |
# sometimes the study description isn't the first "dd" tag | |
if "Study Description" not in study_description_start.getText(): | |
study_description_start = study_description_start.find_next_sibling( | |
"dt" | |
) | |
study_description = study_description_start.find_next_sibling("dd") or "" | |
if study_description: | |
links = study_description.find(id="important-links") | |
if links: | |
links.decompose() | |
study_description = ( | |
study_description.getText().strip() | |
+ f"\n\nNOTE: This text was scraped from https://www.ncbi.nlm.nih.gov/ on {date.today()} and may not include exact formatting or images." | |
) | |
logging.debug(f"{study_description}") | |
return study_description | |
def _determine_tags_from_study_info(study): | |
tags = [] | |
if study.get("project_id", "") and study.get("project_id", "").startswith("parent"): | |
tags.append(_get_tag("Parent", "Program")) | |
tags.append(_get_tag("DCC Harmonized", "Data Type")) | |
tags.append(_get_tag("Clinical Phenotype", "Data Type")) | |
if study.get("project_id", "") and study.get("project_id", "").startswith("topmed"): | |
tags.append(_get_tag("TOPMed", "Program")) | |
tags.append(_get_tag("Genotype", "Data Type")) | |
if _is_topmed_study_geno_and_pheno(study.get("code", "")): | |
tags.append(_get_tag("Clinical Phenotype", "Data Type")) | |
if study.get("project_id", "") and study.get("project_id", "").startswith("COVID"): | |
tags.append(_get_tag("COVID 19", "Program")) | |
if study.get("dbgap_accession", "") and study.get("dbgap_accession", "").startswith( | |
"phs" | |
): | |
tags.append(_get_tag("dbGaP", "Study Registration")) | |
return str(tags).replace("'", '"') | |
def _get_tag(name, category): | |
return {"name": name, "category": category} | |
def _is_topmed_study_geno_and_pheno(study): | |
# if the topmed study has both gennomic and phenotype data (instead of having a parent | |
# study with pheno and a topmed with geno separately) | |
# | |
# determined from https://docs.google.com/spreadsheets/d/1iVOmZVu_IzsVMdefH-1Rgf8zrjqvnZOUEA2dxS5iRjc/edit#gid=698119570 | |
# filter to "program"=="topmed" and "parent_study_accession"=="" | |
return study in [ | |
"SAGE_DS-LD-IRB-COL", | |
"Amish_HMB-IRB-MDS", | |
"CRA_DS-ASTHMA-IRB-MDS-RD", | |
"VAFAR_HMB-IRB", | |
"PARTNERS_HMB", | |
"WGHS_HMB", | |
"BAGS_GRU-IRB", | |
"Sarcoidosis_DS-SAR-IRB", | |
"HyperGEN_GRU-IRB", | |
"HyperGEN_DS-CVD-IRB-RD", | |
"THRV_DS-CVD-IRB-COL-NPU-RD", | |
"miRhythm_GRU", | |
"AustralianFamilialAF_HMB-NPU-MDS", | |
"pharmHU_HMB", | |
"pharmHU_DS-SCD-RD", | |
"pharmHU_DS-SCD", | |
"SAPPHIRE_asthma_DS-ASTHMA-IRB-COL", | |
"REDS-III_Brazil_SCD_GRU-IRB-PUB-NPU", | |
"Walk_PHaSST_SCD_HMB-IRB-PUB-COL-NPU-MDS-GSO", | |
"Walk_PHaSST_SCD_DS-SCD-IRB-PUB-COL-NPU-MDS-RD", | |
"MLOF_HMB-PUB", | |
"AFLMU_HMB-IRB-PUB-COL-NPU-MDS", | |
"MPP_HMB-NPU-MDS", | |
"INSPIRE_AF_DS-MULTIPLE_DISEASES-MDS", | |
"DECAF_GRU", | |
"GENAF_HMB-NPU", | |
"JHU_AF_HMB-NPU-MDS", | |
"ChildrensHS_GAP_GRU", | |
"ChildrensHS_IGERA_GRU", | |
"ChildrensHS_MetaAir_GRU", | |
"CHIRAH_DS-ASTHMA-IRB-COL", | |
"EGCUT_GRU", | |
"IPF_DS-PUL-ILD-IRB-NPU", | |
"IPF_DS-LD-IRB-NPU", | |
"IPF_DS-PFIB-IRB-NPU", | |
"IPF_HMB-IRB-NPU", | |
"IPF_DS-ILD-IRB-NPU", | |
"OMG_SCD_DS-SCD-IRB-PUB-COL-MDS-RD", | |
"BioVU_AF_HMB-GSO", | |
"LTRC_HMB-MDS", | |
"PUSH_SCD_DS-SCD-IRB-PUB-COL", | |
"GGAF_GRU", | |
"PIMA_DS-ASTHMA-IRB-COL", | |
"CARE_BADGER_DS-ASTHMA-IRB-COL", | |
"CARE_TREXA_DS-ASTHMA-IRB-COL", | |
] | |
if __name__ == "__main__": | |
generate_discovery_metadata() | |
submit_to_metadata_service( | |
COMMONS_TO_SUBMIT_METADATA, API_KEY_FILEPATH_FOR_METADATA | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment