Skip to content

Instantly share code, notes, and snippets.

@cthoyt
Last active January 24, 2025 14:12
Show Gist options
  • Save cthoyt/12a3cb3c63ad68d73fe5a2f0d506526f to your computer and use it in GitHub Desktop.
Save cthoyt/12a3cb3c63ad68d73fe5a2f0d506526f to your computer and use it in GitHub Desktop.
A script that generates a histogram over ClinicalTrials.gov study types.
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "clinicaltrials-downloader>=0.0.2",
# "pyobo[grounding]",
# "tabulate",
# "pystow",
# "click",
# ]
#
# [tool.uv.sources]
# pyobo = { git = "https://github.com/biopragmatics/pyobo" }
# gilda = { git = "https://github.com/cthoyt/gilda", branch = "slim" }
# ///
"""A script that generates summaries over ClinicalTrials.gov."""
import pickle
import textwrap
from collections import Counter
import click
import pystow
from pyobo.gilda_utils import get_grounder
from tabulate import tabulate
from clinicaltrials_downloader import get_studies_slim
def _load_chebi_grounder(force: bool):
    """Return the ChEBI grounder, reusing a pickled copy when one exists.

    The pickle lives at the path returned by
    ``pystow.join("bio", "clinicaltrials", name="chebi_grounder.pkl")``.
    When that file is missing or ``force`` is true, the grounder is rebuilt
    with ``get_grounder("chebi", force=force)`` and the pickle is rewritten.
    """
    grounder_pkl_path = pystow.join("bio", "clinicaltrials", name="chebi_grounder.pkl")
    if grounder_pkl_path.exists() and not force:
        # NOTE(security): unpickling can execute arbitrary code. This is only
        # acceptable because the file is a local cache written by this script
        # (see below); never point this path at data from an untrusted source.
        return pickle.loads(grounder_pkl_path.read_bytes())  # noqa:S301
    chebi_grounder = get_grounder("chebi", force=force)
    grounder_pkl_path.write_bytes(
        pickle.dumps(chebi_grounder, protocol=pickle.HIGHEST_PROTOCOL)
    )
    return chebi_grounder


def _format_phases(phases: list[str]) -> str:
    """Collapse a study's phase codes into a short, comma-separated key.

    ``["PHASE1", "PHASE2"]`` becomes ``"1, 2"`` and ``["EARLY_PHASE1"]``
    becomes ``"e1"``. A lone ``"NA"`` is reported the same way as a missing
    phase list, i.e. as the empty string.
    """
    phases_key = ", ".join(
        x.removeprefix("PHASE").replace("EARLY_PHASE1", "e1") for x in sorted(phases)
    )
    # TODO is there a difference between N/A and missing?
    return "" if phases_key == "NA" else phases_key


def _echo_table(title: str, rows: list[tuple], headers: list[str]) -> None:
    """Print ``title`` surrounded by blank lines, then ``rows`` as a GitHub-style table."""
    click.echo(f"\n{title}\n")
    click.echo(tabulate(rows, headers=headers, tablefmt="github"))


# NOTE: click displays the docstring of ``main`` as the ``--help`` text, so it
# is written as plain prose rather than with a parameter section.
@click.command()
@click.option("--force", is_flag=True)
def main(force: bool) -> None:
    """Make a summary.

    Iterates over the studies returned by get_studies_slim and prints four
    GitHub-flavored Markdown tables: counts by study type and allocation,
    counts by phase, counts by the full combination of study type, allocation,
    phase, and whether MeSH conditions/interventions are annotated, and one
    example study per study/phase category.

    Passing --force bypasses the pickled ChEBI grounder cache and is forwarded
    as force=True to both the grounder builder and the study downloader.
    """
    # Maps a 1-tuple category label to the table cells of one example study.
    # Later studies overwrite earlier ones, so the last match wins.
    examples: dict[tuple[str], tuple[str, str, str, str, str]] = {}
    phase_counter: Counter[str] = Counter()
    study_type_counter: Counter[tuple[str, str]] = Counter()
    grouped_counter: Counter[tuple[str, str, str, str, str]] = Counter()

    chebi_grounder = _load_chebi_grounder(force)

    for study in get_studies_slim(force=force):
        protocol_section = study["protocolSection"]
        design_module = protocol_section.get("designModule", {})

        # e.g. "EXPANDED_ACCESS" -> "Expanded Access"
        study_type = (design_module.get("studyType") or "").replace("_", " ").title()

        allocation = design_module.get("designInfo", {}).get("allocation") or ""
        if allocation == "NA":
            # TODO is there a difference between N/A and missing?
            allocation = ""
        # e.g. "NON_RANDOMIZED" -> "Non-Randomized"
        allocation = allocation.replace("_", "-").title()
        study_type_counter[study_type, allocation] += 1

        phases_key = _format_phases(design_module.get("phases", []))
        phase_counter[phases_key] += 1

        derived_section = study["derivedSection"]
        conditions = derived_section.get("conditionBrowseModule", {}).get("meshes", [])
        interventions = derived_section.get("interventionBrowseModule", {}).get("meshes", [])
        # Rendered as "true"/"" so absent annotations show up as blank table cells.
        has_conditions = "true" if conditions else ""
        has_interventions = "true" if interventions else ""
        grouped_counter[study_type, allocation, phases_key, has_conditions, has_interventions] += 1

        identification_module = protocol_section["identificationModule"]
        identifier = identification_module["nctId"]
        name = identification_module.get("briefTitle")

        # Only studies with exactly one MeSH condition are used as examples.
        if len(conditions) != 1:
            continue

        if allocation:
            key = (f"{allocation} (Phase {phases_key})",)
        elif phases_key:
            key = (f"{study_type} (Phase {phases_key})",)
        else:
            key = (study_type,)

        condition = conditions[0]
        condition_text = f"[{condition['term']}](https://bioregistry.io/mesh:{condition['id']})"
        identifier_text = f"[{identifier}](https://bioregistry.io/clinicaltrials:{identifier})"
        # briefTitle is fetched with .get() and may be absent; textwrap.shorten
        # raises on None, so fall back to an empty title.
        name_text = textwrap.shorten(name or "", width=80)

        if study_type == "Observational":
            if not has_interventions:
                # don't want example of observational trial w/ intervention
                examples[key] = (identifier_text, name_text, condition_text, "", "")
            continue

        # Check the cheap eligibility conditions before grounding, so the
        # grounder is only consulted for studies that can become an example.
        eligible = (
            study_type == "Interventional" and phases_key and allocation
        ) or study_type == "Expanded Access"
        if not eligible or len(interventions) != 1:
            continue

        matches = chebi_grounder.ground(interventions[0]["term"])
        if not matches:
            continue

        # Use the first match returned by the grounder.
        term = matches[0].term
        intervention_text = f"[{term.entry_name}](https://bioregistry.io/chebi:{term.id})"
        intervention_img = f"![](https://bioregistry.io/chebi:{term.id}?provider=chebi-img)"
        examples[key] = (
            identifier_text,
            name_text,
            condition_text,
            intervention_text,
            intervention_img,
        )

    _echo_table(
        "Study Types Table",
        [(*key, count) for key, count in study_type_counter.most_common()],
        ["Study Type", "Allocation", "Count"],
    )
    _echo_table(
        "Phases Table",
        phase_counter.most_common(),
        ["Phase", "Count"],
    )
    _echo_table(
        "Super Long Table",
        # Sorted by key; most_common() ordering would be discarded by sorted() anyway.
        [(*key, count) for key, count in sorted(grouped_counter.items())],
        [
            "Study Type",
            "Allocation",
            "Phase(s)",
            "has_condition(s)",
            "has_intervention(s)",
            "count",
        ],
    )
    _echo_table(
        "Examples Table",
        [(*key, *value) for key, value in sorted(examples.items())],
        [
            "Study/Phase(s)",
            "NCT ID",
            "Title",
            "Condition",
            "Intervention",
            "Structure",
        ],
    )
# Invoke the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment