Skip to content

Instantly share code, notes, and snippets.

@jbusecke
Created April 23, 2025 23:47
Show Gist options
  • Save jbusecke/65d0e160ed1d90dac903c9c3f36cf9da to your computer and use it in GitHub Desktop.
Save jbusecke/65d0e160ed1d90dac903c9c3f36cf9da to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "d920b3e5",
"metadata": {},
"source": [
"# Testing how to programatically extract reference details from CMIP data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "82e0dd0e",
"metadata": {},
"outputs": [],
"source": [
"import aiohttp\n",
"from tqdm.asyncio import tqdm_asyncio\n",
"\n",
"async def handle_to_url(handle):\n",
" # Convert handle to URL\n",
" return f\"https://hdl.handle.net/api/handles/{handle.replace('hdl:', '')}\"\n",
"\n",
"async def get_json(session, url):\n",
" async with session.get(url) as response:\n",
" if response.status == 200:\n",
" return await response.json()\n",
" else:\n",
" raise ValueError(f\"Failed to retrieve data from {url}\")\n",
"\n",
"async def get_value(json_response, value_type):\n",
" # Return only the value index with type \"value_type\"\n",
" for value in json_response['values']:\n",
" if value['type'] == value_type:\n",
" return value['data']['value']\n",
" raise ValueError(f\"Value of type {value_type} not found in response\")\n",
"\n",
"async def get_doi_from_tracking_id(tracking_id: str) -> str:\n",
" \"\"\"\n",
" Get the DOI from a tracking ID attribute string\n",
" \"\"\"\n",
" tracking_ids = tracking_id.split('\\n')\n",
" async with aiohttp.ClientSession() as session:\n",
" # Check that all handles point to the same root handle\n",
" root_handles = await tqdm_asyncio.gather(\n",
" *[get_value(await get_json(session, await handle_to_url(handle)), \"IS_PART_OF\") for handle in tracking_ids]\n",
" )\n",
" # If not all root_handles are the same, throw an error\n",
" if len(set(root_handles)) > 1:\n",
" raise ValueError(\"Not all handles point to the same root handle\")\n",
" else:\n",
" root_handle = root_handles[0]\n",
" # Get the DOI of the root handle\n",
" doi = await get_value(await get_json(session, await handle_to_url(root_handle)), \"IS_PART_OF\")\n",
" # If root_doi does not start with \"doi:\", raise an error\n",
" if not doi.startswith(\"doi:\"):\n",
" raise ValueError(\"Root handle does not point to a DOI\")\n",
" return doi\n"
]
},
{
"cell_type": "markdown",
"id": "a040bc2f",
"metadata": {},
"source": [
"## Load the CMIP Catalog"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "78b86dc2",
"metadata": {},
"outputs": [],
"source": [
"import intake\n",
"import xarray as xr\n",
"\n",
"\n",
"# uncomment/comment lines to swap catalogs\n",
"url = \"https://storage.googleapis.com/cmip6/cmip6-pgf-ingestion-test/catalog/catalog.json\" # Only stores that pass current tests\n",
"col = intake.open_esm_datastore(url)"
]
},
{
"cell_type": "markdown",
"id": "da6c28e5",
"metadata": {},
"source": [
"## A single test"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cf22605c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 4/4 [00:00<00:00, 8439.24it/s]\n"
]
},
{
"data": {
"text/plain": [
"'doi:10.22033/ESGF/CMIP6.11762'"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"path = col.df['zstore'].tolist()[100]\n",
"\n",
"ds = xr.open_zarr(path, consolidated=True)\n",
"# await get_dois_from_tracking_ids(ds.attrs['tracking_id'])\n",
"await get_doi_from_tracking_id(ds.attrs['tracking_id'])"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "2752bb31",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'gs://cmip6/CMIP6/CMIP/NASA-GISS/GISS-E2-1-G-CC/historical/r1i1p1f1/Omon/fsitherm/gn/v20190815/'"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"path"
]
},
{
"cell_type": "markdown",
"id": "ea93a8f4",
"metadata": {},
"source": [
"Manually checking the facets and the doi, this seems to work 😁"
]
},
{
"cell_type": "markdown",
"id": "cfb08e7c",
"metadata": {},
"source": [
"## The data used in the pco2 testbed publication\n",
"\n",
"Got a list of partial instance_ids, and will get all available dois\n",
"\n",
">[!NOTE]\n",
"> I found that the results are highly redundant and generally there is only one DOI per simulation (e.g. source_id and experiment_id). I thus pruned things heavily below to reduce runtime\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7d88cc25",
"metadata": {},
"outputs": [],
"source": [
"partial_instance_ids = [\n",
" \"ACCESS-ESM1-5.gn.ssp245.Omon.r4i1p1f1\",\n",
" \"ACCESS-ESM1-5.gn.historical.Omon.r4i1p1f1\",\n",
" \"ACCESS-ESM1-5.gn.ssp245.Omon.r5i1p1f1\",\n",
" \"ACCESS-ESM1-5.gn.historical.Omon.r5i1p1f1\",\n",
" \"CESM2.gn.ssp245.Omon.r10i1p1f1\",\n",
" \"CESM2.gn.historical.Omon.r10i1p1f1\",\n",
" \"CESM2.gn.ssp245.Omon.r11i1p1f1\",\n",
" \"CESM2.gn.historical.Omon.r11i1p1f1\",\n",
" \"CESM2.gn.ssp245.Omon.r4i1p1f1\",\n",
" \"CESM2.gn.historical.Omon.r4i1p1f1\",\n",
" \"CESM2-WACCM.gn.ssp245.Omon.r1i1p1f1\",\n",
" \"CESM2-WACCM.gn.historical.Omon.r1i1p1f1\",\n",
" \"CESM2-WACCM.gr.ssp245.Omon.r2i1p1f1\",\n",
" \"CESM2-WACCM.gr.historical.Omon.r2i1p1f1\",\n",
" \"CESM2-WACCM.gr.ssp245.Omon.r3i1p1f1\",\n",
" \"CESM2-WACCM.gr.historical.Omon.r3i1p1f1\",\n",
" \"CMCC-ESM2.gn.ssp245.Omon.r1i1p1f1\",\n",
" \"CMCC-ESM2.gn.historical.Omon.r1i1p1f1\",\n",
" \"CanESM5.gn.ssp245.Omon.r10i1p2f1\",\n",
" \"CanESM5.gn.historical.Omon.r10i1p2f1\",\n",
" \"CanESM5.gn.ssp245.Omon.r1i1p1f1\",\n",
" \"CanESM5.gn.historical.Omon.r1i1p1f1\",\n",
" \"CanESM5.gn.ssp245.Omon.r1i1p2f1\",\n",
" \"CanESM5.gn.historical.Omon.r1i1p2f1\",\n",
" \"CanESM5.gn.ssp245.Omon.r2i1p1f1\",\n",
" \"CanESM5.gn.historical.Omon.r2i1p1f1\",\n",
" \"CanESM5.gn.ssp245.Omon.r2i1p2f1\",\n",
" \"CanESM5.gn.historical.Omon.r2i1p2f1\",\n",
" \"CanESM5.gn.ssp245.Omon.r3i1p1f1\",\n",
" \"CanESM5.gn.historical.Omon.r3i1p1f1\",\n",
" \"CanESM5.gn.ssp245.Omon.r3i1p2f1\",\n",
" \"CanESM5.gn.historical.Omon.r3i1p2f1\",\n",
" \"CanESM5.gn.ssp245.Omon.r4i1p1f1\",\n",
" \"CanESM5.gn.historical.Omon.r4i1p1f1\",\n",
" \"CanESM5.gn.ssp245.Omon.r4i1p2f1\",\n",
" \"CanESM5.gn.historical.Omon.r4i1p2f1\",\n",
" \"CanESM5.gn.ssp245.Omon.r5i1p1f1\",\n",
" \"CanESM5.gn.historical.Omon.r5i1p1f1\",\n",
" \"CanESM5.gn.ssp245.Omon.r5i1p2f1\",\n",
" \"CanESM5.gn.historical.Omon.r5i1p2f1\",\n",
" \"CanESM5.gn.ssp245.Omon.r6i1p1f1\",\n",
" \"CanESM5.gn.historical.Omon.r6i1p1f1\",\n",
" \"CanESM5.gn.ssp245.Omon.r6i1p2f1\",\n",
" \"CanESM5.gn.historical.Omon.r6i1p2f1\",\n",
" \"CanESM5.gn.ssp245.Omon.r7i1p1f1\",\n",
" \"CanESM5.gn.historical.Omon.r7i1p1f1\",\n",
" \"CanESM5.gn.ssp245.Omon.r7i1p2f1\",\n",
" \"CanESM5.gn.historical.Omon.r7i1p2f1\",\n",
" \"CanESM5.gn.ssp245.Omon.r8i1p1f1\",\n",
" \"CanESM5.gn.historical.Omon.r8i1p1f1\",\n",
" \"CanESM5.gn.ssp245.Omon.r8i1p2f1\",\n",
" \"CanESM5.gn.historical.Omon.r8i1p2f1\",\n",
" \"CanESM5.gn.ssp245.Omon.r9i1p2f1\",\n",
" \"CanESM5.gn.historical.Omon.r9i1p2f1\",\n",
" \"CanESM5-CanOE.gn.ssp245.Omon.r1i1p2f1\",\n",
" \"CanESM5-CanOE.gn.historical.Omon.r1i1p2f1\",\n",
" \"CanESM5-CanOE.gn.ssp245.Omon.r2i1p2f1\",\n",
" \"CanESM5-CanOE.gn.historical.Omon.r2i1p2f1\",\n",
" \"CanESM5-CanOE.gn.ssp245.Omon.r3i1p2f1\",\n",
" \"CanESM5-CanOE.gn.historical.Omon.r3i1p2f1\",\n",
" \"GFDL-ESM4.gr.ssp245.Omon.r1i1p1f1\",\n",
" \"GFDL-ESM4.gr.historical.Omon.r1i1p1f1\",\n",
" \"MPI-ESM1-2-LR.gn.ssp245.Omon.r11i1p1f1\",\n",
" \"MPI-ESM1-2-LR.gn.historical.Omon.r11i1p1f1\",\n",
" \"MPI-ESM1-2-LR.gn.ssp245.Omon.r12i1p1f1\",\n",
" \"MPI-ESM1-2-LR.gn.historical.Omon.r12i1p1f1\",\n",
" \"MPI-ESM1-2-LR.gn.ssp245.Omon.r14i1p1f1\",\n",
" \"MPI-ESM1-2-LR.gn.historical.Omon.r14i1p1f1\",\n",
" \"MPI-ESM1-2-LR.gn.ssp245.Omon.r15i1p1f1\",\n",
" \"MPI-ESM1-2-LR.gn.historical.Omon.r15i1p1f1\",\n",
" \"MPI-ESM1-2-LR.gn.ssp245.Omon.r16i1p1f1\",\n",
" \"MPI-ESM1-2-LR.gn.historical.Omon.r16i1p1f1\",\n",
" \"MPI-ESM1-2-LR.gn.ssp245.Omon.r22i1p1f1\",\n",
" \"MPI-ESM1-2-LR.gn.historical.Omon.r22i1p1f1\",\n",
" \"MPI-ESM1-2-LR.gn.ssp245.Omon.r23i1p1f1\",\n",
" \"MPI-ESM1-2-LR.gn.historical.Omon.r23i1p1f1\",\n",
" \"MPI-ESM1-2-LR.gn.ssp245.Omon.r26i1p1f1\",\n",
" \"MPI-ESM1-2-LR.gn.historical.Omon.r26i1p1f1\",\n",
" \"MPI-ESM1-2-LR.gn.ssp245.Omon.r27i1p1f1\",\n",
" \"MPI-ESM1-2-LR.gn.historical.Omon.r27i1p1f1\",\n",
" \"UKESM1-0-LL.gn.ssp245.Omon.r1i1p1f2\",\n",
" \"UKESM1-0-LL.gn.historical.Omon.r1i1p1f2\",\n",
" \"UKESM1-0-LL.gn.ssp245.Omon.r2i1p1f2\",\n",
" \"UKESM1-0-LL.gn.historical.Omon.r2i1p1f2\",\n",
" \"UKESM1-0-LL.gn.ssp245.Omon.r3i1p1f2\",\n",
" \"UKESM1-0-LL.gn.historical.Omon.r3i1p1f2\",\n",
" \"UKESM1-0-LL.gn.ssp245.Omon.r4i1p1f2\",\n",
" \"UKESM1-0-LL.gn.historical.Omon.r4i1p1f2\",\n",
" \"UKESM1-0-LL.gn.ssp245.Omon.r8i1p1f2\",\n",
" \"UKESM1-0-LL.gn.historical.Omon.r8i1p1f2\"\n",
"]\n",
"# Im gonna cheat here a bit. I am pretty sure that the dois are per simulation, so lets just search for the source_id and experiment_id\n",
"\n",
"partial_instance_ids_pruned = sorted(list(set(['.'.join(iid.split('.')[:-1]) for iid in partial_instance_ids])))\n",
"partial_instance_ids_pruned"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "8623fb75",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0748d44cf648430ba7a5fbd1c16ec6ca",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Instance ID Loop: 0%| | 0/20 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"ACCESS-ESM1-5.gn.historical.Omon\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6e3f99e4c13546b2b886327281720c57",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|██████████| 1/1 [00:00<00:00, 1758.62it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"ACCESS-ESM1-5.historical {'doi:10.22033/ESGF/CMIP6.4272'}\n",
"ACCESS-ESM1-5.gn.ssp245.Omon\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5d7072d711d24e6c9979eecd21b6ba83",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|██████████| 1/1 [00:00<00:00, 2123.70it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"ACCESS-ESM1-5.ssp245 {'doi:10.22033/ESGF/CMIP6.4322'}\n",
"CESM2-WACCM.gn.historical.Omon\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ce5f8f1299ba40e08f66f1beee04a567",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|██████████| 1/1 [00:00<00:00, 1831.57it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CESM2-WACCM.historical {'doi:10.22033/ESGF/CMIP6.10071'}\n",
"CESM2-WACCM.gn.ssp245.Omon\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4322ef65150649b495c4b3f7f25fe73a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|██████████| 2/2 [00:00<00:00, 4606.59it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CESM2-WACCM.ssp245 {'doi:10.22033/ESGF/CMIP6.10101'}\n",
"CESM2-WACCM.gr.historical.Omon\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d5e8b77862ee44d1b06135694fd40651",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|██████████| 1/1 [00:00<00:00, 1472.72it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CESM2-WACCM.historical {'doi:10.22033/ESGF/CMIP6.10071'}\n",
"CESM2-WACCM.gr.ssp245.Omon\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a8faf47c904b4618a4a5e28ce141bb5f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|██████████| 2/2 [00:00<00:00, 4202.71it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CESM2-WACCM.ssp245 {'doi:10.22033/ESGF/CMIP6.10101'}\n",
"CESM2.gn.historical.Omon\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9116eec45e6b496081d2b62ca9cc1686",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|██████████| 4/4 [00:00<00:00, 5635.61it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CESM2.historical {'doi:10.22033/ESGF/CMIP6.7627'}\n",
"CESM2.gn.ssp245.Omon\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d7a429bdc3fc45ab90c2b77940a436d6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|██████████| 2/2 [00:00<00:00, 3067.13it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CESM2.ssp245 {'doi:10.22033/ESGF/CMIP6.7748'}\n",
"CMCC-ESM2.gn.historical.Omon\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a8a4c1d03cc748c696cf0b7516e1d5ca",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|██████████| 1/1 [00:00<00:00, 1248.30it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CMCC-ESM2.historical {'doi:10.22033/ESGF/CMIP6.13195'}\n",
"CMCC-ESM2.gn.ssp245.Omon\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c91075a618534aeba89adc0974cdb468",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|██████████| 1/1 [00:00<00:00, 1492.63it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CMCC-ESM2.ssp245 {'doi:10.22033/ESGF/CMIP6.13252'}\n",
"CanESM5-CanOE.gn.historical.Omon\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "57c1cb85cea84ed49714f0f34c03004b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|██████████| 1/1 [00:00<00:00, 1838.80it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CanESM5-CanOE.historical {'doi:10.22033/ESGF/CMIP6.10260'}\n",
"CanESM5-CanOE.gn.ssp245.Omon\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e2c08ab2ca814afbbe5ed8edbda99d2e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|██████████| 1/1 [00:00<00:00, 1326.47it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CanESM5-CanOE.ssp245 {'doi:10.22033/ESGF/CMIP6.10270'}\n",
"CanESM5.gn.historical.Omon\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "90a389cfe9b649e9b68d1148be20a3cd",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|██████████| 1/1 [00:00<00:00, 3536.51it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CanESM5.historical {'doi:10.22033/ESGF/CMIP6.3610'}\n",
"CanESM5.gn.ssp245.Omon\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6270aec497674838a096b76ff191658e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|██████████| 1/1 [00:00<00:00, 1937.32it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CanESM5.ssp245 {'doi:10.22033/ESGF/CMIP6.3685'}\n",
"GFDL-ESM4.gr.historical.Omon\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "294a76ffb18944b49f041fd8e6a7a3ee",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|██████████| 9/9 [00:00<00:00, 10742.38it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"GFDL-ESM4.historical {'doi:10.22033/ESGF/CMIP6.8597'}\n",
"GFDL-ESM4.gr.ssp245.Omon\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "fef3851f50f844ec88410a06277f386a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|██████████| 5/5 [00:00<00:00, 4425.30it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"GFDL-ESM4.ssp245 {'doi:10.22033/ESGF/CMIP6.8686'}\n",
"MPI-ESM1-2-LR.gn.historical.Omon\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4f779eb7a575412c8112bcb3c2f55b28",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|██████████| 9/9 [00:00<00:00, 13414.62it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"MPI-ESM1-2-LR.historical {'doi:10.22033/ESGF/CMIP6.6595'}\n",
"MPI-ESM1-2-LR.gn.ssp245.Omon\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d2d085112576458895af2f7d07ab9772",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|██████████| 5/5 [00:00<00:00, 10082.46it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"MPI-ESM1-2-LR.ssp245 {'doi:10.22033/ESGF/CMIP6.6693'}\n",
"UKESM1-0-LL.gn.historical.Omon\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "28915813399a4f51b07d841699296d63",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|██████████| 2/2 [00:00<00:00, 4914.24it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"UKESM1-0-LL.historical {'doi:10.22033/ESGF/CMIP6.6113'}\n",
"UKESM1-0-LL.gn.ssp245.Omon\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c117a01d4deb4c92989c688e5f59a0b4",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"100%|██████████| 2/2 [00:00<00:00, 4017.53it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"UKESM1-0-LL.ssp245 {'doi:10.22033/ESGF/CMIP6.6339'}\n"
]
}
],
"source": [
"from tqdm.auto import tqdm\n",
"\n",
"doi_dict = {}\n",
"for iid in tqdm(partial_instance_ids_pruned, desc=\"Instance ID Loop\"):\n",
" print(iid)\n",
" source_id, grid_label, experiment_id, table_id = iid.split('.')\n",
" cat = col.search(\n",
" source_id=source_id,\n",
" grid_label=grid_label,\n",
" experiment_id=experiment_id,\n",
" table_id=table_id,\n",
" variable_id=['tos'], #TODO: I need to add the others that were used\n",
" )\n",
" # more thorough search but the results are redundant anyways and the above search is much faster\n",
"\n",
" # source_id, grid_label, experiment_id, table_id, member_id = iid.split('.')\n",
" # cat = col.search(\n",
" # source_id=source_id,\n",
" # grid_label=grid_label,\n",
" # experiment_id=experiment_id,\n",
" # table_id=table_id,\n",
" # member_id=member_id\n",
" # variable_id=['tos'], #TODO: I need to add the others that were used\n",
" # )\n",
" datasets = cat.to_dataset_dict(\n",
" aggregate=False,\n",
" skip_on_error=True,\n",
" xarray_open_kwargs={\n",
" 'consolidated':True,\n",
" 'decode_times':False\n",
" },\n",
" progressbar=False,\n",
" )\n",
" # some datasets have no tracking_id....seems to be a problem with some BGC variables? Needs further investigation\n",
" # should not be a deal breaker since we are looking for simulation level DOIs here. So if we find a single one per simulation we are good\n",
"\n",
"\n",
" # Im gonna cheat here a bit. I am pretty sure that the dois are per simulation, so lets just get the firste one that has the tracking_id attribute\n",
" datasets_pruned = {}\n",
" for name, ds in datasets.items():\n",
" if 'tracking_id' in ds.attrs:\n",
" datasets_pruned[name] = ds\n",
" break\n",
" \n",
" dois = [await get_doi_from_tracking_id(ds.attrs['tracking_id']) for ds in tqdm(datasets_pruned.values(), desc=\"Dataset Loop\") if 'tracking_id' in ds.attrs]\n",
"\n",
" # create new source_id key in dict or if source_id key already existst append to list\n",
" key = '.'.join([source_id, experiment_id])\n",
" print(key,set(dois))\n",
" if key not in doi_dict:\n",
" doi_dict[key] = set(dois)\n",
" else:\n",
" doi_dict[key] = doi_dict[key].union(set(dois))\n"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "1a1b17ab",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'ACCESS-ESM1-5.historical': {'doi:10.22033/ESGF/CMIP6.4272'},\n",
" 'ACCESS-ESM1-5.ssp245': {'doi:10.22033/ESGF/CMIP6.4322'},\n",
" 'CESM2-WACCM.historical': {'doi:10.22033/ESGF/CMIP6.10071'},\n",
" 'CESM2-WACCM.ssp245': {'doi:10.22033/ESGF/CMIP6.10101'},\n",
" 'CESM2.historical': {'doi:10.22033/ESGF/CMIP6.7627'},\n",
" 'CESM2.ssp245': {'doi:10.22033/ESGF/CMIP6.7748'},\n",
" 'CMCC-ESM2.historical': {'doi:10.22033/ESGF/CMIP6.13195'},\n",
" 'CMCC-ESM2.ssp245': {'doi:10.22033/ESGF/CMIP6.13252'},\n",
" 'CanESM5-CanOE.historical': {'doi:10.22033/ESGF/CMIP6.10260'},\n",
" 'CanESM5-CanOE.ssp245': {'doi:10.22033/ESGF/CMIP6.10270'},\n",
" 'CanESM5.historical': {'doi:10.22033/ESGF/CMIP6.3610'},\n",
" 'CanESM5.ssp245': {'doi:10.22033/ESGF/CMIP6.3685'},\n",
" 'GFDL-ESM4.historical': {'doi:10.22033/ESGF/CMIP6.8597'},\n",
" 'GFDL-ESM4.ssp245': {'doi:10.22033/ESGF/CMIP6.8686'},\n",
" 'MPI-ESM1-2-LR.historical': {'doi:10.22033/ESGF/CMIP6.6595'},\n",
" 'MPI-ESM1-2-LR.ssp245': {'doi:10.22033/ESGF/CMIP6.6693'},\n",
" 'UKESM1-0-LL.historical': {'doi:10.22033/ESGF/CMIP6.6113'},\n",
" 'UKESM1-0-LL.ssp245': {'doi:10.22033/ESGF/CMIP6.6339'}}"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"doi_dict"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "0c192c5a",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "376eec5b",
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame.from_dict(doi_dict, orient='index').reset_index().rename(columns={'index':'source_id_experiment_id', 0:'dois'})"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "1af84ab4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>source_id_experiment_id</th>\n",
" <th>dois</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ACCESS-ESM1-5.historical</td>\n",
" <td>doi:10.22033/ESGF/CMIP6.4272</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ACCESS-ESM1-5.ssp245</td>\n",
" <td>doi:10.22033/ESGF/CMIP6.4322</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>CESM2-WACCM.historical</td>\n",
" <td>doi:10.22033/ESGF/CMIP6.10071</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>CESM2-WACCM.ssp245</td>\n",
" <td>doi:10.22033/ESGF/CMIP6.10101</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>CESM2.historical</td>\n",
" <td>doi:10.22033/ESGF/CMIP6.7627</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>CESM2.ssp245</td>\n",
" <td>doi:10.22033/ESGF/CMIP6.7748</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>CMCC-ESM2.historical</td>\n",
" <td>doi:10.22033/ESGF/CMIP6.13195</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>CMCC-ESM2.ssp245</td>\n",
" <td>doi:10.22033/ESGF/CMIP6.13252</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>CanESM5-CanOE.historical</td>\n",
" <td>doi:10.22033/ESGF/CMIP6.10260</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>CanESM5-CanOE.ssp245</td>\n",
" <td>doi:10.22033/ESGF/CMIP6.10270</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>CanESM5.historical</td>\n",
" <td>doi:10.22033/ESGF/CMIP6.3610</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>CanESM5.ssp245</td>\n",
" <td>doi:10.22033/ESGF/CMIP6.3685</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>GFDL-ESM4.historical</td>\n",
" <td>doi:10.22033/ESGF/CMIP6.8597</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>GFDL-ESM4.ssp245</td>\n",
" <td>doi:10.22033/ESGF/CMIP6.8686</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>MPI-ESM1-2-LR.historical</td>\n",
" <td>doi:10.22033/ESGF/CMIP6.6595</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>MPI-ESM1-2-LR.ssp245</td>\n",
" <td>doi:10.22033/ESGF/CMIP6.6693</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>UKESM1-0-LL.historical</td>\n",
" <td>doi:10.22033/ESGF/CMIP6.6113</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>UKESM1-0-LL.ssp245</td>\n",
" <td>doi:10.22033/ESGF/CMIP6.6339</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" source_id_experiment_id dois\n",
"0 ACCESS-ESM1-5.historical doi:10.22033/ESGF/CMIP6.4272\n",
"1 ACCESS-ESM1-5.ssp245 doi:10.22033/ESGF/CMIP6.4322\n",
"2 CESM2-WACCM.historical doi:10.22033/ESGF/CMIP6.10071\n",
"3 CESM2-WACCM.ssp245 doi:10.22033/ESGF/CMIP6.10101\n",
"4 CESM2.historical doi:10.22033/ESGF/CMIP6.7627\n",
"5 CESM2.ssp245 doi:10.22033/ESGF/CMIP6.7748\n",
"6 CMCC-ESM2.historical doi:10.22033/ESGF/CMIP6.13195\n",
"7 CMCC-ESM2.ssp245 doi:10.22033/ESGF/CMIP6.13252\n",
"8 CanESM5-CanOE.historical doi:10.22033/ESGF/CMIP6.10260\n",
"9 CanESM5-CanOE.ssp245 doi:10.22033/ESGF/CMIP6.10270\n",
"10 CanESM5.historical doi:10.22033/ESGF/CMIP6.3610\n",
"11 CanESM5.ssp245 doi:10.22033/ESGF/CMIP6.3685\n",
"12 GFDL-ESM4.historical doi:10.22033/ESGF/CMIP6.8597\n",
"13 GFDL-ESM4.ssp245 doi:10.22033/ESGF/CMIP6.8686\n",
"14 MPI-ESM1-2-LR.historical doi:10.22033/ESGF/CMIP6.6595\n",
"15 MPI-ESM1-2-LR.ssp245 doi:10.22033/ESGF/CMIP6.6693\n",
"16 UKESM1-0-LL.historical doi:10.22033/ESGF/CMIP6.6113\n",
"17 UKESM1-0-LL.ssp245 doi:10.22033/ESGF/CMIP6.6339"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "7a418a28",
"metadata": {},
"outputs": [],
"source": [
"df.to_csv('doi_dict.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "52dcfb32",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "cmip6",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment