Last active
January 24, 2025 14:12
-
-
Save cthoyt/12a3cb3c63ad68d73fe5a2f0d506526f to your computer and use it in GitHub Desktop.
A script that generates a histogram over ClinicalTrials.gov study types.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script | |
# requires-python = ">=3.11" | |
# dependencies = [ | |
# "clinicaltrials-downloader>=0.0.2", | |
# "pyobo[grounding]", | |
# "tabulate", | |
# "pystow", | |
# "click", | |
# ] | |
# | |
# [tool.uv.sources] | |
# pyobo = { git = "https://github.com/biopragmatics/pyobo" } | |
# gilda = { git = "https://github.com/cthoyt/gilda", branch = "slim" } | |
# /// | |
"""A script that generates a summaries over ClinicalTrials.gov.""" | |
import pickle | |
import textwrap | |
from collections import Counter | |
import click | |
import pystow | |
from pyobo.gilda_utils import get_grounder | |
from tabulate import tabulate | |
from clinicaltrials_downloader import get_studies_slim | |
def _load_chebi_grounder(force: bool):
    """Return the ChEBI grounder, reusing the pickled copy on disk when allowed.

    :param force: If true, skip any existing pickle, rebuild the grounder
        (passing ``force`` through to ``get_grounder``), and overwrite the pickle.
    :returns: Whatever ``get_grounder("chebi", ...)`` produces.
    """
    grounder_pkl_path = pystow.join("bio", "clinicaltrials", name="chebi_grounder.pkl")
    if grounder_pkl_path.exists() and not force:
        # SECURITY: unpickling can execute arbitrary code. This is tolerated only
        # because the file is a cache written by this same function (see below);
        # never point this path at data from an untrusted source.
        return pickle.loads(grounder_pkl_path.read_bytes())  # noqa:S301
    chebi_grounder = get_grounder("chebi", force=force)
    grounder_pkl_path.write_bytes(pickle.dumps(chebi_grounder, protocol=pickle.HIGHEST_PROTOCOL))
    return chebi_grounder


def _phases_key(phases: list[str]) -> str:
    """Collapse a study's phase labels into one comma-separated key.

    ``PHASE1`` becomes ``1``, ``EARLY_PHASE1`` becomes ``e1``, and a key that is
    exactly ``NA`` is mapped to the empty string.
    """
    key = ", ".join(
        phase.removeprefix("PHASE").replace("EARLY_PHASE1", "e1") for phase in sorted(phases)
    )
    # TODO is there a difference between N/A and missing?
    return "" if key == "NA" else key


def _echo_table(title: str, rows: list[tuple], headers: list[str]) -> None:
    """Echo a title surrounded by blank lines, then the rows as a GitHub-style table."""
    click.echo(f"\n{title}\n")
    click.echo(tabulate(rows, headers=headers, tablefmt="github"))


@click.command()
@click.option("--force", is_flag=True)
def main(force: bool) -> None:
    """Make a summary of ClinicalTrials.gov studies.

    Echoes four GitHub-flavored tables: counts by study type and allocation,
    counts by phase, counts by the full combination of study type, allocation,
    phase and presence of MeSH conditions/interventions, and one example study
    per study type/phase label.

    The --force flag is passed to the study loader and also rebuilds the
    pickled ChEBI grounder.
    """
    examples: dict[tuple[str], tuple[str, str, str, str, str]] = {}
    phase_counter: Counter[str] = Counter()
    study_type_counter: Counter[tuple[str, str]] = Counter()
    grouped_counter: Counter[tuple[str, str, str, str, str]] = Counter()

    chebi_grounder = _load_chebi_grounder(force)

    for study in get_studies_slim(force=force):
        protocol_section = study["protocolSection"]
        design_module = protocol_section.get("designModule", {})
        study_type = (design_module.get("studyType") or "").replace("_", " ").title()

        allocation = design_module.get("designInfo", {}).get("allocation") or ""
        if allocation == "NA":
            # TODO is there a difference between N/A and missing?
            allocation = ""
        allocation = allocation.replace("_", "-").title()
        study_type_counter[study_type, allocation] += 1

        phases_key = _phases_key(design_module.get("phases", []))
        phase_counter[phases_key] += 1

        derived_section = study["derivedSection"]
        conditions = derived_section.get("conditionBrowseModule", {}).get("meshes", [])
        interventions = derived_section.get("interventionBrowseModule", {}).get("meshes", [])
        # Strings rather than booleans so that the "absent" case renders as an empty cell.
        has_conditions = "true" if conditions else ""
        has_interventions = "true" if interventions else ""
        grouped_counter[study_type, allocation, phases_key, has_conditions, has_interventions] += 1

        # Everything below only collects rows for the examples table, which
        # is restricted to studies annotated with exactly one MeSH condition.
        if len(conditions) != 1:
            continue

        identification_module = protocol_section["identificationModule"]
        identifier = identification_module["nctId"]
        # A missing title would make textwrap.shorten() fail, so fall back to "".
        name = identification_module.get("briefTitle") or ""

        # Label the example by allocation when known, otherwise by study type, and
        # only mention the phase when there is one (avoids a dangling "(Phase )").
        label = allocation or study_type
        key = (f"{label} (Phase {phases_key})",) if phases_key else (label,)

        condition = conditions[0]
        condition_text = f"[{condition['term']}](https://bioregistry.io/mesh:{condition['id']})"
        identifier_text = f"[{identifier}](https://bioregistry.io/clinicaltrials:{identifier})"
        name_text = textwrap.shorten(name, width=80)

        if study_type == "Observational":
            # don't want example of observational trial w/ intervention
            if not interventions:
                examples[key] = (identifier_text, name_text, condition_text, "", "")
            continue

        wants_intervention_example = (
            study_type == "Interventional" and phases_key and allocation
        ) or study_type == "Expanded Access"
        if not wants_intervention_example or len(interventions) != 1:
            continue

        # Ground last: it is the expensive step and is pointless for studies
        # that were already ruled out by the cheap checks above.
        matches = chebi_grounder.ground(interventions[0]["term"])
        if not matches:
            continue
        term = matches[0].term
        intervention_text = f"[{term.entry_name}](https://bioregistry.io/chebi:{term.id})"
        # NOTE(review): this cell feeds the "Structure" column but is empty here;
        # confirm whether markup for a structure image was meant to be emitted.
        intervention_img = ""
        examples[key] = (
            identifier_text,
            name_text,
            condition_text,
            intervention_text,
            intervention_img,
        )

    _echo_table(
        "Study Types Table",
        [(*key, count) for key, count in study_type_counter.most_common()],
        ["Study Type", "Allocation", "Count"],
    )
    _echo_table("Phases Table", phase_counter.most_common(), ["Phase", "Count"])
    _echo_table(
        "Super Long Table",
        # Sorted by key; keys are unique, so the counts never act as tie-breakers.
        [(*key, count) for key, count in sorted(grouped_counter.items())],
        [
            "Study Type",
            "Allocation",
            "Phase(s)",
            "has_condition(s)",
            "has_intervention(s)",
            "count",
        ],
    )
    _echo_table(
        "Examples Table",
        [(*key, *value) for key, value in sorted(examples.items())],
        [
            "Study/Phase(s)",
            "NCT ID",
            "Title",
            "Condition",
            "Intervention",
            "Structure",
        ],
    )


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment