cthoyt · January 24, 2025 14:12
diff --git a/clinicaltrials-summary.py b/clinicaltrials-summary.py
 # /// script
 # requires-python = ">=3.11"
 # dependencies = [
 #     "clinicaltrials-downloader>=0.0.2",
 #     "pyobo[grounding]",
 #     "tabulate",
 #     "pystow",
 #     "click",
 # ]
 #
 # [tool.uv.sources]
 # pyobo = { git = "https://github.com/biopragmatics/pyobo" }
 # gilda = { git = "https://github.com/cthoyt/gilda", branch = "slim" }
 # ///

 """A script that generates summaries over ClinicalTrials.gov."""

 import pickle
 import textwrap
 from collections import Counter

 import click
 import pystow
 from pyobo.gilda_utils import get_grounder
 from tabulate import tabulate

 from clinicaltrials_downloader import get_studies_slim


 @click.command()
 @click.option("--force", is_flag=True)
 def main(force: bool) -> None:
    """Print Markdown summary tables over the studies from ClinicalTrials.gov.

    Four GitHub-flavored tables are echoed: counts by study type and
    allocation, counts by phase(s), counts by the full combination of study
    type, allocation, phase(s) and presence of MeSH conditions/interventions,
    and one example study per study/phase category.

    :param force: When set, the pickled ChEBI grounder cache is ignored and
        rewritten, and ``force=True`` is forwarded to ``get_grounder`` and
        ``get_studies_slim``.
    """
    # Maps a 1-tuple category label to the five remaining columns of the
    # examples table; a later study with the same label overwrites an earlier one.
    examples: dict[tuple[str], tuple[str, str, str, str, str]] = {}
    phase_counter: Counter = Counter()
    study_type_counter: Counter = Counter()
    grouped_counter: Counter = Counter()

    # The grounder is cached as a pickle so it is only built on the first run
    # (or when --force is given).
    grounder_pkl_path = pystow.join("bio", "clinicaltrials", name="chebi_grounder.pkl")
    if grounder_pkl_path.exists() and not force:
        # NOTE: unpickling runs arbitrary code; this is only acceptable because
        # the file is a local cache written by this same script below.
        chebi_grounder = pickle.loads(grounder_pkl_path.read_bytes())  # noqa:S301
    else:
        chebi_grounder = get_grounder("chebi", force=force)
        grounder_pkl_path.write_bytes(
            pickle.dumps(chebi_grounder, protocol=pickle.HIGHEST_PROTOCOL)
        )

    for study in get_studies_slim(force=force):
        protocol_section = study["protocolSection"]
        design_module = protocol_section.get("designModule", {})
        # e.g. "EXPANDED_ACCESS" -> "Expanded Access"
        study_type = (design_module.get("studyType") or "").replace("_", " ").title()
        allocation = design_module.get("designInfo", {}).get("allocation") or ""
        if allocation == "NA":
            # TODO is there a difference between N/A and missing?
            allocation = ""
        allocation = allocation.replace("_", "-").title()
        study_type_counter[study_type, allocation] += 1

        phases = design_module.get("phases", [])
        # e.g. ["PHASE1", "PHASE2"] -> "1, 2" and ["EARLY_PHASE1"] -> "e1"
        phases_key = ", ".join(
            x.removeprefix("PHASE").replace("EARLY_PHASE1", "e1") for x in sorted(phases)
        )
        if phases_key == "NA":
            # TODO is there a difference between N/A and missing?
            phases_key = ""
        phase_counter[phases_key] += 1

        derived_section = study["derivedSection"]
        conditions = derived_section.get("conditionBrowseModule", {}).get("meshes", [])
        has_conditions = "true" if conditions else ""

        interventions = derived_section.get("interventionBrowseModule", {}).get("meshes", [])
        has_interventions = "true" if interventions else ""

        grouped_counter[study_type, allocation, phases_key, has_conditions, has_interventions] += 1

        identification_module = protocol_section["identificationModule"]
        identifier = identification_module["nctId"]
        # briefTitle is optional; fall back to an empty title so that
        # textwrap.shorten() below is never handed None.
        name = identification_module.get("briefTitle") or ""

        # Only studies annotated with exactly one MeSH condition are used as examples.
        if len(conditions) != 1:
            continue

        if allocation:
            key = (f"{allocation} (Phase {phases_key})",)
        elif phases_key:
            key = (f"{study_type} (Phase {phases_key})",)
        else:
            key = (study_type,)
        condition_text = (
            f"[{conditions[0]['term']}](https://bioregistry.io/mesh:{conditions[0]['id']})"
        )
        identifier_text = f"[{identifier}](https://bioregistry.io/clinicaltrials:{identifier})"
        name_text = textwrap.shorten(name, width=80)

        if study_type == "Observational":
            if not has_interventions:
                # don't want example of observational trial w/ intervention
                examples[key] = (identifier_text, name_text, condition_text, "", "")
        elif (
            # The cheap eligibility check comes first so the grounder is only
            # called for studies that can actually end up in the examples table.
            ((study_type == "Interventional" and phases_key and allocation)
             or study_type == "Expanded Access")
            and len(interventions) == 1
            and (matches := chebi_grounder.ground(interventions[0]["term"]))
        ):
            # Use the first match returned by the grounder.
            intervention_name = matches[0].term.entry_name
            intervention_chebi_id = matches[0].term.id
            intervention_text = (
                f"[{intervention_name}](https://bioregistry.io/chebi:{intervention_chebi_id})"
            )
            intervention_img = (
                f"![](https://bioregistry.io/chebi:{intervention_chebi_id}?provider=chebi-img)"
            )
            examples[key] = (
                identifier_text,
                name_text,
                condition_text,
                intervention_text,
                intervention_img,
            )

    click.echo("\nStudy Types Table\n")
    click.echo(
        tabulate(
            [
                (study_type, allocation, count)
                for (study_type, allocation), count in study_type_counter.most_common()
            ],
            headers=["Study Type", "Allocation", "Count"],
            tablefmt="github",
        )
    )

    click.echo("\nPhases Table\n")
    click.echo(
        tabulate(
            phase_counter.most_common(),
            headers=["Phase", "Count"],
            tablefmt="github",
        )
    )

    key_col_first = "Study Type", "Allocation", "Phase(s)"
    key_col = (*key_col_first, "has_condition(s)", "has_intervention(s)")

    click.echo("\nSuper Long Table\n")
    click.echo(
        tabulate(
            # Rows are ordered by key, so sorting the items directly is enough.
            [(*key, count) for key, count in sorted(grouped_counter.items())],
            headers=[*key_col, "count"],
            tablefmt="github",
        )
    )

    click.echo("\nExamples Table\n")
    click.echo(
        tabulate(
            # key is a 1-tuple and value a 5-tuple, giving the six columns below.
            [(*key, *value) for key, value in sorted(examples.items())],
            headers=[
                "Study/Phase(s)",
                "NCT ID",
                "Title",
                "Condition",
                "Intervention",
                "Structure",
            ],
            tablefmt="github",
        )
    )


 # Invoke the click command only when this file is executed as a script.
 if __name__ == "__main__":
    main()
	# /// script
	# requires-python = ">=3.11"
	# dependencies = [
	# "clinicaltrials-downloader>=0.0.2",
	# "pyobo[grounding]",
	# "tabulate",
	# "pystow",
	# "click",
	# ]
	#
	# [tool.uv.sources]
	# pyobo = { git = "https://github.com/biopragmatics/pyobo" }
	# gilda = { git = "https://github.com/cthoyt/gilda", branch = "slim" }
	# ///

	"""A script that generates summaries over ClinicalTrials.gov."""

	import pickle
	import textwrap
	from collections import Counter

	import click
	import pystow
	from pyobo.gilda_utils import get_grounder
	from tabulate import tabulate

	from clinicaltrials_downloader import get_studies_slim


	@click.command()
	@click.option("--force", is_flag=True)
	def main(force: bool) -> None:
	    """Print Markdown summary tables over the studies from ClinicalTrials.gov.

	    Four GitHub-flavored tables are echoed: counts by study type and
	    allocation, counts by phase(s), counts by the full combination of study
	    type, allocation, phase(s) and presence of MeSH conditions/interventions,
	    and one example study per study/phase category.

	    :param force: When set, the pickled ChEBI grounder cache is ignored and
	        rewritten, and ``force=True`` is forwarded to ``get_grounder`` and
	        ``get_studies_slim``.
	    """
	    # Maps a 1-tuple category label to the five remaining columns of the
	    # examples table; a later study with the same label overwrites an earlier one.
	    examples: dict[tuple[str], tuple[str, str, str, str, str]] = {}
	    phase_counter: Counter = Counter()
	    study_type_counter: Counter = Counter()
	    grouped_counter: Counter = Counter()

	    # The grounder is cached as a pickle so it is only built on the first run
	    # (or when --force is given).
	    grounder_pkl_path = pystow.join("bio", "clinicaltrials", name="chebi_grounder.pkl")
	    if grounder_pkl_path.exists() and not force:
	        # NOTE: unpickling runs arbitrary code; this is only acceptable because
	        # the file is a local cache written by this same script below.
	        chebi_grounder = pickle.loads(grounder_pkl_path.read_bytes())  # noqa:S301
	    else:
	        chebi_grounder = get_grounder("chebi", force=force)
	        grounder_pkl_path.write_bytes(
	            pickle.dumps(chebi_grounder, protocol=pickle.HIGHEST_PROTOCOL)
	        )

	    for study in get_studies_slim(force=force):
	        protocol_section = study["protocolSection"]
	        design_module = protocol_section.get("designModule", {})
	        # e.g. "EXPANDED_ACCESS" -> "Expanded Access"
	        study_type = (design_module.get("studyType") or "").replace("_", " ").title()
	        allocation = design_module.get("designInfo", {}).get("allocation") or ""
	        if allocation == "NA":
	            # TODO is there a difference between N/A and missing?
	            allocation = ""
	        allocation = allocation.replace("_", "-").title()
	        study_type_counter[study_type, allocation] += 1

	        phases = design_module.get("phases", [])
	        # e.g. ["PHASE1", "PHASE2"] -> "1, 2" and ["EARLY_PHASE1"] -> "e1"
	        phases_key = ", ".join(
	            x.removeprefix("PHASE").replace("EARLY_PHASE1", "e1") for x in sorted(phases)
	        )
	        if phases_key == "NA":
	            # TODO is there a difference between N/A and missing?
	            phases_key = ""
	        phase_counter[phases_key] += 1

	        derived_section = study["derivedSection"]
	        conditions = derived_section.get("conditionBrowseModule", {}).get("meshes", [])
	        has_conditions = "true" if conditions else ""

	        interventions = derived_section.get("interventionBrowseModule", {}).get("meshes", [])
	        has_interventions = "true" if interventions else ""

	        grouped_counter[study_type, allocation, phases_key, has_conditions, has_interventions] += 1

	        identification_module = protocol_section["identificationModule"]
	        identifier = identification_module["nctId"]
	        # briefTitle is optional; fall back to an empty title so that
	        # textwrap.shorten() below is never handed None.
	        name = identification_module.get("briefTitle") or ""

	        # Only studies annotated with exactly one MeSH condition are used as examples.
	        if len(conditions) != 1:
	            continue

	        if allocation:
	            key = (f"{allocation} (Phase {phases_key})",)
	        elif phases_key:
	            key = (f"{study_type} (Phase {phases_key})",)
	        else:
	            key = (study_type,)
	        condition_text = (
	            f"[{conditions[0]['term']}](https://bioregistry.io/mesh:{conditions[0]['id']})"
	        )
	        identifier_text = f"[{identifier}](https://bioregistry.io/clinicaltrials:{identifier})"
	        name_text = textwrap.shorten(name, width=80)

	        if study_type == "Observational":
	            if not has_interventions:
	                # don't want example of observational trial w/ intervention
	                examples[key] = (identifier_text, name_text, condition_text, "", "")
	        elif (
	            # The cheap eligibility check comes first so the grounder is only
	            # called for studies that can actually end up in the examples table.
	            ((study_type == "Interventional" and phases_key and allocation)
	             or study_type == "Expanded Access")
	            and len(interventions) == 1
	            and (matches := chebi_grounder.ground(interventions[0]["term"]))
	        ):
	            # Use the first match returned by the grounder.
	            intervention_name = matches[0].term.entry_name
	            intervention_chebi_id = matches[0].term.id
	            intervention_text = (
	                f"[{intervention_name}](https://bioregistry.io/chebi:{intervention_chebi_id})"
	            )
	            intervention_img = (
	                f"![](https://bioregistry.io/chebi:{intervention_chebi_id}?provider=chebi-img)"
	            )
	            examples[key] = (
	                identifier_text,
	                name_text,
	                condition_text,
	                intervention_text,
	                intervention_img,
	            )

	    click.echo("\nStudy Types Table\n")
	    click.echo(
	        tabulate(
	            [
	                (study_type, allocation, count)
	                for (study_type, allocation), count in study_type_counter.most_common()
	            ],
	            headers=["Study Type", "Allocation", "Count"],
	            tablefmt="github",
	        )
	    )

	    click.echo("\nPhases Table\n")
	    click.echo(
	        tabulate(
	            phase_counter.most_common(),
	            headers=["Phase", "Count"],
	            tablefmt="github",
	        )
	    )

	    key_col_first = "Study Type", "Allocation", "Phase(s)"
	    key_col = (*key_col_first, "has_condition(s)", "has_intervention(s)")

	    click.echo("\nSuper Long Table\n")
	    click.echo(
	        tabulate(
	            # Rows are ordered by key, so sorting the items directly is enough.
	            [(*key, count) for key, count in sorted(grouped_counter.items())],
	            headers=[*key_col, "count"],
	            tablefmt="github",
	        )
	    )

	    click.echo("\nExamples Table\n")
	    click.echo(
	        tabulate(
	            # key is a 1-tuple and value a 5-tuple; both must be unpacked so
	            # each row has the six columns named in the headers below.
	            [(*key, *value) for key, value in sorted(examples.items())],
	            headers=[
	                "Study/Phase(s)",
	                "NCT ID",
	                "Title",
	                "Condition",
	                "Intervention",
	                "Structure",
	            ],
	            tablefmt="github",
	        )
	    )


	# Invoke the click command only when this file is executed as a script.
	if __name__ == "__main__":
	    main()