Created
April 23, 2025 23:47
-
-
Save jbusecke/65d0e160ed1d90dac903c9c3f36cf9da to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "d920b3e5", | |
"metadata": {}, | |
"source": [ | |
"# Testing how to programmatically extract reference details from CMIP data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "82e0dd0e", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import aiohttp\n", | |
"from tqdm.asyncio import tqdm_asyncio\n", | |
"\n", | |
"async def handle_to_url(handle):\n", | |
" # Convert handle to URL\n", | |
" return f\"https://hdl.handle.net/api/handles/{handle.replace('hdl:', '')}\"\n", | |
"\n", | |
"async def get_json(session, url):\n", | |
" async with session.get(url) as response:\n", | |
" if response.status == 200:\n", | |
" return await response.json()\n", | |
" else:\n", | |
" raise ValueError(f\"Failed to retrieve data from {url}\")\n", | |
"\n", | |
"async def get_value(json_response, value_type):\n", | |
" # Return the data value of the first entry in json_response['values'] whose type equals value_type\n", | |
" for value in json_response['values']:\n", | |
" if value['type'] == value_type:\n", | |
" return value['data']['value']\n", | |
" raise ValueError(f\"Value of type {value_type} not found in response\")\n", | |
"\n", | |
"async def get_doi_from_tracking_id(tracking_id: str) -> str:\n", | |
" \"\"\"\n", | |
" Get the DOI from a tracking ID attribute string\n", | |
" \"\"\"\n", | |
" tracking_ids = tracking_id.split('\\n')\n", | |
" async with aiohttp.ClientSession() as session:\n", | |
" # Check that all handles point to the same root handle\n", | |
" root_handles = await tqdm_asyncio.gather(\n", | |
" *[get_value(await get_json(session, await handle_to_url(handle)), \"IS_PART_OF\") for handle in tracking_ids]\n", | |
" )\n", | |
" # If not all root_handles are the same, throw an error\n", | |
" if len(set(root_handles)) > 1:\n", | |
" raise ValueError(\"Not all handles point to the same root handle\")\n", | |
" else:\n", | |
" root_handle = root_handles[0]\n", | |
" # Get the DOI of the root handle\n", | |
" doi = await get_value(await get_json(session, await handle_to_url(root_handle)), \"IS_PART_OF\")\n", | |
" # If the value resolved from the root handle does not start with \"doi:\", raise an error\n", | |
" if not doi.startswith(\"doi:\"):\n", | |
" raise ValueError(\"Root handle does not point to a DOI\")\n", | |
" return doi\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "a040bc2f", | |
"metadata": {}, | |
"source": [ | |
"## Load the CMIP Catalog" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "78b86dc2", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import intake\n", | |
"import xarray as xr\n", | |
"\n", | |
"\n", | |
"# uncomment/comment lines to swap catalogs\n", | |
"url = \"https://storage.googleapis.com/cmip6/cmip6-pgf-ingestion-test/catalog/catalog.json\" # Only stores that pass current tests\n", | |
"col = intake.open_esm_datastore(url)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "da6c28e5", | |
"metadata": {}, | |
"source": [ | |
"## A single test" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "cf22605c", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 4/4 [00:00<00:00, 8439.24it/s]\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"'doi:10.22033/ESGF/CMIP6.11762'" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"path = col.df['zstore'].tolist()[100]\n", | |
"\n", | |
"ds = xr.open_zarr(path, consolidated=True)\n", | |
"# await get_dois_from_tracking_ids(ds.attrs['tracking_id'])\n", | |
"await get_doi_from_tracking_id(ds.attrs['tracking_id'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "2752bb31", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'gs://cmip6/CMIP6/CMIP/NASA-GISS/GISS-E2-1-G-CC/historical/r1i1p1f1/Omon/fsitherm/gn/v20190815/'" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"path" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "ea93a8f4", | |
"metadata": {}, | |
"source": [ | |
"Manually checking the facets and the doi, this seems to work 😁" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "cfb08e7c", | |
"metadata": {}, | |
"source": [ | |
"## The data used in the pco2 testbed publication\n", | |
"\n", | |
"Starting from a list of partial instance_ids, look up all available DOIs.\n", | |
"\n", | |
">[!NOTE]\n", | |
"> I found that the results are highly redundant and that there is generally only one DOI per simulation (i.e. per combination of source_id and experiment_id). I thus pruned things heavily below to reduce runtime.\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "7d88cc25", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"partial_instance_ids = [\n", | |
" \"ACCESS-ESM1-5.gn.ssp245.Omon.r4i1p1f1\",\n", | |
" \"ACCESS-ESM1-5.gn.historical.Omon.r4i1p1f1\",\n", | |
" \"ACCESS-ESM1-5.gn.ssp245.Omon.r5i1p1f1\",\n", | |
" \"ACCESS-ESM1-5.gn.historical.Omon.r5i1p1f1\",\n", | |
" \"CESM2.gn.ssp245.Omon.r10i1p1f1\",\n", | |
" \"CESM2.gn.historical.Omon.r10i1p1f1\",\n", | |
" \"CESM2.gn.ssp245.Omon.r11i1p1f1\",\n", | |
" \"CESM2.gn.historical.Omon.r11i1p1f1\",\n", | |
" \"CESM2.gn.ssp245.Omon.r4i1p1f1\",\n", | |
" \"CESM2.gn.historical.Omon.r4i1p1f1\",\n", | |
" \"CESM2-WACCM.gn.ssp245.Omon.r1i1p1f1\",\n", | |
" \"CESM2-WACCM.gn.historical.Omon.r1i1p1f1\",\n", | |
" \"CESM2-WACCM.gr.ssp245.Omon.r2i1p1f1\",\n", | |
" \"CESM2-WACCM.gr.historical.Omon.r2i1p1f1\",\n", | |
" \"CESM2-WACCM.gr.ssp245.Omon.r3i1p1f1\",\n", | |
" \"CESM2-WACCM.gr.historical.Omon.r3i1p1f1\",\n", | |
" \"CMCC-ESM2.gn.ssp245.Omon.r1i1p1f1\",\n", | |
" \"CMCC-ESM2.gn.historical.Omon.r1i1p1f1\",\n", | |
" \"CanESM5.gn.ssp245.Omon.r10i1p2f1\",\n", | |
" \"CanESM5.gn.historical.Omon.r10i1p2f1\",\n", | |
" \"CanESM5.gn.ssp245.Omon.r1i1p1f1\",\n", | |
" \"CanESM5.gn.historical.Omon.r1i1p1f1\",\n", | |
" \"CanESM5.gn.ssp245.Omon.r1i1p2f1\",\n", | |
" \"CanESM5.gn.historical.Omon.r1i1p2f1\",\n", | |
" \"CanESM5.gn.ssp245.Omon.r2i1p1f1\",\n", | |
" \"CanESM5.gn.historical.Omon.r2i1p1f1\",\n", | |
" \"CanESM5.gn.ssp245.Omon.r2i1p2f1\",\n", | |
" \"CanESM5.gn.historical.Omon.r2i1p2f1\",\n", | |
" \"CanESM5.gn.ssp245.Omon.r3i1p1f1\",\n", | |
" \"CanESM5.gn.historical.Omon.r3i1p1f1\",\n", | |
" \"CanESM5.gn.ssp245.Omon.r3i1p2f1\",\n", | |
" \"CanESM5.gn.historical.Omon.r3i1p2f1\",\n", | |
" \"CanESM5.gn.ssp245.Omon.r4i1p1f1\",\n", | |
" \"CanESM5.gn.historical.Omon.r4i1p1f1\",\n", | |
" \"CanESM5.gn.ssp245.Omon.r4i1p2f1\",\n", | |
" \"CanESM5.gn.historical.Omon.r4i1p2f1\",\n", | |
" \"CanESM5.gn.ssp245.Omon.r5i1p1f1\",\n", | |
" \"CanESM5.gn.historical.Omon.r5i1p1f1\",\n", | |
" \"CanESM5.gn.ssp245.Omon.r5i1p2f1\",\n", | |
" \"CanESM5.gn.historical.Omon.r5i1p2f1\",\n", | |
" \"CanESM5.gn.ssp245.Omon.r6i1p1f1\",\n", | |
" \"CanESM5.gn.historical.Omon.r6i1p1f1\",\n", | |
" \"CanESM5.gn.ssp245.Omon.r6i1p2f1\",\n", | |
" \"CanESM5.gn.historical.Omon.r6i1p2f1\",\n", | |
" \"CanESM5.gn.ssp245.Omon.r7i1p1f1\",\n", | |
" \"CanESM5.gn.historical.Omon.r7i1p1f1\",\n", | |
" \"CanESM5.gn.ssp245.Omon.r7i1p2f1\",\n", | |
" \"CanESM5.gn.historical.Omon.r7i1p2f1\",\n", | |
" \"CanESM5.gn.ssp245.Omon.r8i1p1f1\",\n", | |
" \"CanESM5.gn.historical.Omon.r8i1p1f1\",\n", | |
" \"CanESM5.gn.ssp245.Omon.r8i1p2f1\",\n", | |
" \"CanESM5.gn.historical.Omon.r8i1p2f1\",\n", | |
" \"CanESM5.gn.ssp245.Omon.r9i1p2f1\",\n", | |
" \"CanESM5.gn.historical.Omon.r9i1p2f1\",\n", | |
" \"CanESM5-CanOE.gn.ssp245.Omon.r1i1p2f1\",\n", | |
" \"CanESM5-CanOE.gn.historical.Omon.r1i1p2f1\",\n", | |
" \"CanESM5-CanOE.gn.ssp245.Omon.r2i1p2f1\",\n", | |
" \"CanESM5-CanOE.gn.historical.Omon.r2i1p2f1\",\n", | |
" \"CanESM5-CanOE.gn.ssp245.Omon.r3i1p2f1\",\n", | |
" \"CanESM5-CanOE.gn.historical.Omon.r3i1p2f1\",\n", | |
" \"GFDL-ESM4.gr.ssp245.Omon.r1i1p1f1\",\n", | |
" \"GFDL-ESM4.gr.historical.Omon.r1i1p1f1\",\n", | |
" \"MPI-ESM1-2-LR.gn.ssp245.Omon.r11i1p1f1\",\n", | |
" \"MPI-ESM1-2-LR.gn.historical.Omon.r11i1p1f1\",\n", | |
" \"MPI-ESM1-2-LR.gn.ssp245.Omon.r12i1p1f1\",\n", | |
" \"MPI-ESM1-2-LR.gn.historical.Omon.r12i1p1f1\",\n", | |
" \"MPI-ESM1-2-LR.gn.ssp245.Omon.r14i1p1f1\",\n", | |
" \"MPI-ESM1-2-LR.gn.historical.Omon.r14i1p1f1\",\n", | |
" \"MPI-ESM1-2-LR.gn.ssp245.Omon.r15i1p1f1\",\n", | |
" \"MPI-ESM1-2-LR.gn.historical.Omon.r15i1p1f1\",\n", | |
" \"MPI-ESM1-2-LR.gn.ssp245.Omon.r16i1p1f1\",\n", | |
" \"MPI-ESM1-2-LR.gn.historical.Omon.r16i1p1f1\",\n", | |
" \"MPI-ESM1-2-LR.gn.ssp245.Omon.r22i1p1f1\",\n", | |
" \"MPI-ESM1-2-LR.gn.historical.Omon.r22i1p1f1\",\n", | |
" \"MPI-ESM1-2-LR.gn.ssp245.Omon.r23i1p1f1\",\n", | |
" \"MPI-ESM1-2-LR.gn.historical.Omon.r23i1p1f1\",\n", | |
" \"MPI-ESM1-2-LR.gn.ssp245.Omon.r26i1p1f1\",\n", | |
" \"MPI-ESM1-2-LR.gn.historical.Omon.r26i1p1f1\",\n", | |
" \"MPI-ESM1-2-LR.gn.ssp245.Omon.r27i1p1f1\",\n", | |
" \"MPI-ESM1-2-LR.gn.historical.Omon.r27i1p1f1\",\n", | |
" \"UKESM1-0-LL.gn.ssp245.Omon.r1i1p1f2\",\n", | |
" \"UKESM1-0-LL.gn.historical.Omon.r1i1p1f2\",\n", | |
" \"UKESM1-0-LL.gn.ssp245.Omon.r2i1p1f2\",\n", | |
" \"UKESM1-0-LL.gn.historical.Omon.r2i1p1f2\",\n", | |
" \"UKESM1-0-LL.gn.ssp245.Omon.r3i1p1f2\",\n", | |
" \"UKESM1-0-LL.gn.historical.Omon.r3i1p1f2\",\n", | |
" \"UKESM1-0-LL.gn.ssp245.Omon.r4i1p1f2\",\n", | |
" \"UKESM1-0-LL.gn.historical.Omon.r4i1p1f2\",\n", | |
" \"UKESM1-0-LL.gn.ssp245.Omon.r8i1p1f2\",\n", | |
" \"UKESM1-0-LL.gn.historical.Omon.r8i1p1f2\"\n", | |
"]\n", | |
"# I'm going to cheat here a bit. I am pretty sure that the DOIs are per simulation, so let's drop the trailing member_id and search only once per remaining (source_id, grid_label, experiment_id, table_id) combination\n", | |
"\n", | |
"partial_instance_ids_pruned = sorted(list(set(['.'.join(iid.split('.')[:-1]) for iid in partial_instance_ids])))\n", | |
"partial_instance_ids_pruned" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"id": "8623fb75", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "0748d44cf648430ba7a5fbd1c16ec6ca", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Instance ID Loop: 0%| | 0/20 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"ACCESS-ESM1-5.gn.historical.Omon\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "6e3f99e4c13546b2b886327281720c57", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"100%|██████████| 1/1 [00:00<00:00, 1758.62it/s]\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"ACCESS-ESM1-5.historical {'doi:10.22033/ESGF/CMIP6.4272'}\n", | |
"ACCESS-ESM1-5.gn.ssp245.Omon\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "5d7072d711d24e6c9979eecd21b6ba83", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"100%|██████████| 1/1 [00:00<00:00, 2123.70it/s]\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"ACCESS-ESM1-5.ssp245 {'doi:10.22033/ESGF/CMIP6.4322'}\n", | |
"CESM2-WACCM.gn.historical.Omon\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "ce5f8f1299ba40e08f66f1beee04a567", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"100%|██████████| 1/1 [00:00<00:00, 1831.57it/s]\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CESM2-WACCM.historical {'doi:10.22033/ESGF/CMIP6.10071'}\n", | |
"CESM2-WACCM.gn.ssp245.Omon\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "4322ef65150649b495c4b3f7f25fe73a", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"100%|██████████| 2/2 [00:00<00:00, 4606.59it/s]\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CESM2-WACCM.ssp245 {'doi:10.22033/ESGF/CMIP6.10101'}\n", | |
"CESM2-WACCM.gr.historical.Omon\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "d5e8b77862ee44d1b06135694fd40651", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"100%|██████████| 1/1 [00:00<00:00, 1472.72it/s]\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CESM2-WACCM.historical {'doi:10.22033/ESGF/CMIP6.10071'}\n", | |
"CESM2-WACCM.gr.ssp245.Omon\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "a8faf47c904b4618a4a5e28ce141bb5f", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"100%|██████████| 2/2 [00:00<00:00, 4202.71it/s]\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CESM2-WACCM.ssp245 {'doi:10.22033/ESGF/CMIP6.10101'}\n", | |
"CESM2.gn.historical.Omon\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "9116eec45e6b496081d2b62ca9cc1686", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"100%|██████████| 4/4 [00:00<00:00, 5635.61it/s]\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CESM2.historical {'doi:10.22033/ESGF/CMIP6.7627'}\n", | |
"CESM2.gn.ssp245.Omon\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "d7a429bdc3fc45ab90c2b77940a436d6", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"100%|██████████| 2/2 [00:00<00:00, 3067.13it/s]\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CESM2.ssp245 {'doi:10.22033/ESGF/CMIP6.7748'}\n", | |
"CMCC-ESM2.gn.historical.Omon\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "a8a4c1d03cc748c696cf0b7516e1d5ca", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"100%|██████████| 1/1 [00:00<00:00, 1248.30it/s]\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CMCC-ESM2.historical {'doi:10.22033/ESGF/CMIP6.13195'}\n", | |
"CMCC-ESM2.gn.ssp245.Omon\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "c91075a618534aeba89adc0974cdb468", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"100%|██████████| 1/1 [00:00<00:00, 1492.63it/s]\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CMCC-ESM2.ssp245 {'doi:10.22033/ESGF/CMIP6.13252'}\n", | |
"CanESM5-CanOE.gn.historical.Omon\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "57c1cb85cea84ed49714f0f34c03004b", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"100%|██████████| 1/1 [00:00<00:00, 1838.80it/s]\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CanESM5-CanOE.historical {'doi:10.22033/ESGF/CMIP6.10260'}\n", | |
"CanESM5-CanOE.gn.ssp245.Omon\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "e2c08ab2ca814afbbe5ed8edbda99d2e", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"100%|██████████| 1/1 [00:00<00:00, 1326.47it/s]\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CanESM5-CanOE.ssp245 {'doi:10.22033/ESGF/CMIP6.10270'}\n", | |
"CanESM5.gn.historical.Omon\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "90a389cfe9b649e9b68d1148be20a3cd", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"100%|██████████| 1/1 [00:00<00:00, 3536.51it/s]\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CanESM5.historical {'doi:10.22033/ESGF/CMIP6.3610'}\n", | |
"CanESM5.gn.ssp245.Omon\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "6270aec497674838a096b76ff191658e", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"100%|██████████| 1/1 [00:00<00:00, 1937.32it/s]\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CanESM5.ssp245 {'doi:10.22033/ESGF/CMIP6.3685'}\n", | |
"GFDL-ESM4.gr.historical.Omon\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "294a76ffb18944b49f041fd8e6a7a3ee", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"100%|██████████| 9/9 [00:00<00:00, 10742.38it/s]\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"GFDL-ESM4.historical {'doi:10.22033/ESGF/CMIP6.8597'}\n", | |
"GFDL-ESM4.gr.ssp245.Omon\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "fef3851f50f844ec88410a06277f386a", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"100%|██████████| 5/5 [00:00<00:00, 4425.30it/s]\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"GFDL-ESM4.ssp245 {'doi:10.22033/ESGF/CMIP6.8686'}\n", | |
"MPI-ESM1-2-LR.gn.historical.Omon\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "4f779eb7a575412c8112bcb3c2f55b28", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"100%|██████████| 9/9 [00:00<00:00, 13414.62it/s]\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"MPI-ESM1-2-LR.historical {'doi:10.22033/ESGF/CMIP6.6595'}\n", | |
"MPI-ESM1-2-LR.gn.ssp245.Omon\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "d2d085112576458895af2f7d07ab9772", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"100%|██████████| 5/5 [00:00<00:00, 10082.46it/s]\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"MPI-ESM1-2-LR.ssp245 {'doi:10.22033/ESGF/CMIP6.6693'}\n", | |
"UKESM1-0-LL.gn.historical.Omon\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "28915813399a4f51b07d841699296d63", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"100%|██████████| 2/2 [00:00<00:00, 4914.24it/s]\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"UKESM1-0-LL.historical {'doi:10.22033/ESGF/CMIP6.6113'}\n", | |
"UKESM1-0-LL.gn.ssp245.Omon\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "c117a01d4deb4c92989c688e5f59a0b4", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"Dataset Loop: 0%| | 0/1 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"100%|██████████| 2/2 [00:00<00:00, 4017.53it/s]\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"UKESM1-0-LL.ssp245 {'doi:10.22033/ESGF/CMIP6.6339'}\n" | |
] | |
} | |
], | |
"source": [ | |
"from tqdm.auto import tqdm\n", | |
"\n", | |
"doi_dict = {}\n", | |
"for iid in tqdm(partial_instance_ids_pruned, desc=\"Instance ID Loop\"):\n", | |
" print(iid)\n", | |
" source_id, grid_label, experiment_id, table_id = iid.split('.')\n", | |
" cat = col.search(\n", | |
" source_id=source_id,\n", | |
" grid_label=grid_label,\n", | |
" experiment_id=experiment_id,\n", | |
" table_id=table_id,\n", | |
" variable_id=['tos'], #TODO: I need to add the others that were used\n", | |
" )\n", | |
" # more thorough search but the results are redundant anyways and the above search is much faster\n", | |
"\n", | |
" # source_id, grid_label, experiment_id, table_id, member_id = iid.split('.')\n", | |
" # cat = col.search(\n", | |
" # source_id=source_id,\n", | |
" # grid_label=grid_label,\n", | |
" # experiment_id=experiment_id,\n", | |
" # table_id=table_id,\n", | |
" # member_id=member_id\n", | |
" # variable_id=['tos'], #TODO: I need to add the others that were used\n", | |
" # )\n", | |
" datasets = cat.to_dataset_dict(\n", | |
" aggregate=False,\n", | |
" skip_on_error=True,\n", | |
" xarray_open_kwargs={\n", | |
" 'consolidated':True,\n", | |
" 'decode_times':False\n", | |
" },\n", | |
" progressbar=False,\n", | |
" )\n", | |
" # Some datasets have no tracking_id. This seems to be a problem with some BGC variables (needs further investigation).\n", | |
" # This should not be a deal breaker since we are looking for simulation-level DOIs here: finding a single one per simulation is enough.\n", | |
"\n", | |
"\n", | |
" # I'm going to cheat here a bit. I am pretty sure that the DOIs are per simulation, so let's just take the first dataset that has the tracking_id attribute\n", | |
" datasets_pruned = {}\n", | |
" for name, ds in datasets.items():\n", | |
" if 'tracking_id' in ds.attrs:\n", | |
" datasets_pruned[name] = ds\n", | |
" break\n", | |
" \n", | |
" dois = [await get_doi_from_tracking_id(ds.attrs['tracking_id']) for ds in tqdm(datasets_pruned.values(), desc=\"Dataset Loop\") if 'tracking_id' in ds.attrs]\n", | |
"\n", | |
" # Create a new 'source_id.experiment_id' key in the dict, or, if that key already exists, merge the new DOIs into its existing set\n", | |
" key = '.'.join([source_id, experiment_id])\n", | |
" print(key,set(dois))\n", | |
" if key not in doi_dict:\n", | |
" doi_dict[key] = set(dois)\n", | |
" else:\n", | |
" doi_dict[key] = doi_dict[key].union(set(dois))\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"id": "1a1b17ab", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'ACCESS-ESM1-5.historical': {'doi:10.22033/ESGF/CMIP6.4272'},\n", | |
" 'ACCESS-ESM1-5.ssp245': {'doi:10.22033/ESGF/CMIP6.4322'},\n", | |
" 'CESM2-WACCM.historical': {'doi:10.22033/ESGF/CMIP6.10071'},\n", | |
" 'CESM2-WACCM.ssp245': {'doi:10.22033/ESGF/CMIP6.10101'},\n", | |
" 'CESM2.historical': {'doi:10.22033/ESGF/CMIP6.7627'},\n", | |
" 'CESM2.ssp245': {'doi:10.22033/ESGF/CMIP6.7748'},\n", | |
" 'CMCC-ESM2.historical': {'doi:10.22033/ESGF/CMIP6.13195'},\n", | |
" 'CMCC-ESM2.ssp245': {'doi:10.22033/ESGF/CMIP6.13252'},\n", | |
" 'CanESM5-CanOE.historical': {'doi:10.22033/ESGF/CMIP6.10260'},\n", | |
" 'CanESM5-CanOE.ssp245': {'doi:10.22033/ESGF/CMIP6.10270'},\n", | |
" 'CanESM5.historical': {'doi:10.22033/ESGF/CMIP6.3610'},\n", | |
" 'CanESM5.ssp245': {'doi:10.22033/ESGF/CMIP6.3685'},\n", | |
" 'GFDL-ESM4.historical': {'doi:10.22033/ESGF/CMIP6.8597'},\n", | |
" 'GFDL-ESM4.ssp245': {'doi:10.22033/ESGF/CMIP6.8686'},\n", | |
" 'MPI-ESM1-2-LR.historical': {'doi:10.22033/ESGF/CMIP6.6595'},\n", | |
" 'MPI-ESM1-2-LR.ssp245': {'doi:10.22033/ESGF/CMIP6.6693'},\n", | |
" 'UKESM1-0-LL.historical': {'doi:10.22033/ESGF/CMIP6.6113'},\n", | |
" 'UKESM1-0-LL.ssp245': {'doi:10.22033/ESGF/CMIP6.6339'}}" | |
] | |
}, | |
"execution_count": 38, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"doi_dict" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"id": "0c192c5a", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"id": "376eec5b", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df = pd.DataFrame.from_dict(doi_dict, orient='index').reset_index().rename(columns={'index':'source_id_experiment_id', 0:'dois'})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 43, | |
"id": "1af84ab4", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>source_id_experiment_id</th>\n", | |
" <th>dois</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>ACCESS-ESM1-5.historical</td>\n", | |
" <td>doi:10.22033/ESGF/CMIP6.4272</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>ACCESS-ESM1-5.ssp245</td>\n", | |
" <td>doi:10.22033/ESGF/CMIP6.4322</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>CESM2-WACCM.historical</td>\n", | |
" <td>doi:10.22033/ESGF/CMIP6.10071</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>CESM2-WACCM.ssp245</td>\n", | |
" <td>doi:10.22033/ESGF/CMIP6.10101</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>CESM2.historical</td>\n", | |
" <td>doi:10.22033/ESGF/CMIP6.7627</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>CESM2.ssp245</td>\n", | |
" <td>doi:10.22033/ESGF/CMIP6.7748</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>CMCC-ESM2.historical</td>\n", | |
" <td>doi:10.22033/ESGF/CMIP6.13195</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>CMCC-ESM2.ssp245</td>\n", | |
" <td>doi:10.22033/ESGF/CMIP6.13252</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>CanESM5-CanOE.historical</td>\n", | |
" <td>doi:10.22033/ESGF/CMIP6.10260</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>CanESM5-CanOE.ssp245</td>\n", | |
" <td>doi:10.22033/ESGF/CMIP6.10270</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>CanESM5.historical</td>\n", | |
" <td>doi:10.22033/ESGF/CMIP6.3610</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>CanESM5.ssp245</td>\n", | |
" <td>doi:10.22033/ESGF/CMIP6.3685</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>GFDL-ESM4.historical</td>\n", | |
" <td>doi:10.22033/ESGF/CMIP6.8597</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>GFDL-ESM4.ssp245</td>\n", | |
" <td>doi:10.22033/ESGF/CMIP6.8686</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>MPI-ESM1-2-LR.historical</td>\n", | |
" <td>doi:10.22033/ESGF/CMIP6.6595</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>MPI-ESM1-2-LR.ssp245</td>\n", | |
" <td>doi:10.22033/ESGF/CMIP6.6693</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>UKESM1-0-LL.historical</td>\n", | |
" <td>doi:10.22033/ESGF/CMIP6.6113</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>UKESM1-0-LL.ssp245</td>\n", | |
" <td>doi:10.22033/ESGF/CMIP6.6339</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" source_id_experiment_id dois\n", | |
"0 ACCESS-ESM1-5.historical doi:10.22033/ESGF/CMIP6.4272\n", | |
"1 ACCESS-ESM1-5.ssp245 doi:10.22033/ESGF/CMIP6.4322\n", | |
"2 CESM2-WACCM.historical doi:10.22033/ESGF/CMIP6.10071\n", | |
"3 CESM2-WACCM.ssp245 doi:10.22033/ESGF/CMIP6.10101\n", | |
"4 CESM2.historical doi:10.22033/ESGF/CMIP6.7627\n", | |
"5 CESM2.ssp245 doi:10.22033/ESGF/CMIP6.7748\n", | |
"6 CMCC-ESM2.historical doi:10.22033/ESGF/CMIP6.13195\n", | |
"7 CMCC-ESM2.ssp245 doi:10.22033/ESGF/CMIP6.13252\n", | |
"8 CanESM5-CanOE.historical doi:10.22033/ESGF/CMIP6.10260\n", | |
"9 CanESM5-CanOE.ssp245 doi:10.22033/ESGF/CMIP6.10270\n", | |
"10 CanESM5.historical doi:10.22033/ESGF/CMIP6.3610\n", | |
"11 CanESM5.ssp245 doi:10.22033/ESGF/CMIP6.3685\n", | |
"12 GFDL-ESM4.historical doi:10.22033/ESGF/CMIP6.8597\n", | |
"13 GFDL-ESM4.ssp245 doi:10.22033/ESGF/CMIP6.8686\n", | |
"14 MPI-ESM1-2-LR.historical doi:10.22033/ESGF/CMIP6.6595\n", | |
"15 MPI-ESM1-2-LR.ssp245 doi:10.22033/ESGF/CMIP6.6693\n", | |
"16 UKESM1-0-LL.historical doi:10.22033/ESGF/CMIP6.6113\n", | |
"17 UKESM1-0-LL.ssp245 doi:10.22033/ESGF/CMIP6.6339" | |
] | |
}, | |
"execution_count": 43, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 44, | |
"id": "7a418a28", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df.to_csv('doi_dict.csv', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "52dcfb32", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "cmip6", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.13.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment