Skip to content

Instantly share code, notes, and snippets.

@andersy005
Last active March 27, 2024 18:44
Show Gist options
  • Save andersy005/b83592c90eb12f8d18ab3405ef562cd1 to your computer and use it in GitHub Desktop.
Save andersy005/b83592c90eb12f8d18ab3405ef562cd1 to your computer and use it in GitHub Desktop.
[
{
"title": "CMIP6 Downscaled products",
"description": "dataset_description.",
"maintainers": [
{
"name": "Oriana Chegwidden",
"github": "orianac"
}
],
"provenance": {
"providers": [
{
"name": "carbonplan",
"description": "carbonplan_test_dataset",
"roles": [
"producer",
"licensor"
],
"url": "https://carbonplan.org/"
}
],
"license": "CC-BY-NC v.4.0",
"license_link": null
},
"thumbnail": null,
"tags": [
"zarr",
"climate"
],
"links": null,
"stores": [
{
"name": "ScenarioMIP.CCCma.CanESM5.ssp245.r1i1p1f1.annual.GARD-SV.pr.zarr",
"url": "s3://carbonplan-data-viewer/demo/ncview-2.0/ScenarioMIP.CCCma.CanESM5.ssp245.r1i1p1f1.annual.GARD-SV.pr.zarr"
},
{
"name": "ScenarioMIP.CCCma.CanESM5.ssp245.r1i1p1f1.annual.GARD-SV.tasmax.zarr",
"url": "s3://carbonplan-data-viewer/demo/ncview-2.0/ScenarioMIP.CCCma.CanESM5.ssp245.r1i1p1f1.annual.GARD-SV.tasmax.zarr"
}
],
"doi_citation": null,
"demo": true
},
{
"title": "Dry Spell Corn",
"description": "dataset_description.",
"maintainers": [
{
"name": "Oriana Chegwidden",
"github": "orianac"
}
],
"provenance": {
"providers": [
{
"name": "carbonplan",
"description": "carbonplan_test_dataset",
"roles": [
"producer",
"licensor"
],
"url": "https://carbonplan.org/"
}
],
"license": "CC-BY-NC v.4.0",
"license_link": null
},
"thumbnail": null,
"tags": [
"zarr",
"climate"
],
"links": null,
"stores": [
{
"name": "CanESM5-ssp370-full-time-extent.zarr",
"url": "s3://carbonplan-data-viewer/demo/ncview-2.0/dryspells_corn/CanESM5-ssp370-full-time-extent.zarr"
}
],
"doi_citation": null,
"demo": true
},
{
"title": "Sample Australia Cordex Data",
"description": "dataset_description.",
"maintainers": [
{
"name": "Anderson Banihirwe",
"github": "andersy005"
}
],
"provenance": {
"providers": [
{
"name": "carbonplan",
"description": "carbonplan_test_dataset",
"roles": [
"producer",
"licensor"
],
"url": "https://carbonplan.org/"
}
],
"license": "CC-BY-NC v.4.0",
"license_link": null
},
"thumbnail": null,
"tags": [
"zarr",
"climate"
],
"links": null,
"stores": [
{
"name": "sample_australia_cordex_data.zarr",
"url": "s3://carbonplan-data-viewer/demo/ncview-2.0/single_timestep/sample_australia_cordex_data.zarr"
}
],
"doi_citation": null,
"demo": true
},
{
"title": "dataset_1",
"description": "dataset_description.",
"maintainers": [
{
"name": "Raphael Hagen",
"github": "norlandrhagen"
}
],
"provenance": {
"providers": [
{
"name": "carbonplan",
"description": "carbonplan_test_dataset",
"roles": [
"producer",
"licensor"
],
"url": "https://dapds00.nci.org.au/thredds/catalogs/zv2/catalog.html"
}
],
"license": "CC-BY-NC v.4.0",
"license_link": null
},
"thumbnail": null,
"tags": [
"zarr",
"climate"
],
"links": null,
"stores": [
{
"name": "test_dataset1.zarr",
"url": "s3://carbonplan-data-viewer/demo/ncview-2.0/test_dataset1.zarr"
}
],
"doi_citation": null,
"demo": true
},
{
"title": "dataset_2",
"description": "dataset_description.",
"maintainers": [
{
"name": "Raphael Hagen",
"github": "norlandrhagen"
}
],
"provenance": {
"providers": [
{
"name": "carbonplan",
"description": "carbonplan_test_dataset",
"roles": [
"producer",
"licensor"
],
"url": "https://dapds00.nci.org.au/thredds/catalogs/zv2/catalog.html"
}
],
"license": "CC-BY-NC v.4.0",
"license_link": null
},
"thumbnail": null,
"tags": [
"zarr",
"climate"
],
"links": null,
"stores": [
{
"name": "test_dataset2.zarr",
"url": "s3://carbonplan-data-viewer/demo/ncview-2.0/test_dataset2.zarr"
}
],
"doi_citation": null,
"demo": true
},
{
"title": "dataset_3",
"description": "dataset_description.",
"maintainers": [
{
"name": "Raphael Hagen",
"github": "norlandrhagen"
}
],
"provenance": {
"providers": [
{
"name": "carbonplan",
"description": "carbonplan_test_dataset",
"roles": [
"producer",
"licensor"
],
"url": "https://dapds00.nci.org.au/thredds/catalogs/zv2/catalog.html"
}
],
"license": "CC-BY-NC v.4.0",
"license_link": null
},
"thumbnail": null,
"tags": [
"zarr",
"climate"
],
"links": null,
"stores": [
{
"name": "test_dataset3.zarr",
"url": "s3://carbonplan-data-viewer/demo/ncview-2.0/test_dataset3.zarr"
}
],
"doi_citation": null,
"demo": true
}
]
title: "Dry Spell Corn"
description: "dataset_description."
recipes:
  - id: recipe
    object: "recipe:recipe"
provenance:
  providers:
    - name: "carbonplan"
      description: "carbonplan_test_dataset"
      roles:
        - producer
        - licensor
      url: https://carbonplan.org/
  license: "CC-BY-NC v.4.0"
maintainers:
  - name: "Oriana Chegwidden"
    github: orianac
title: "CMIP6 Downscaled products"
description: "dataset_description."
recipes:
  - id: recipe
    object: "recipe:recipe"
provenance:
  providers:
    - name: "carbonplan"
      description: "carbonplan_test_dataset"
      roles:
        - producer
        - licensor
      url: https://carbonplan.org/
  license: "CC-BY-NC v.4.0"
maintainers:
  - name: "Oriana Chegwidden"
    github: orianac
title: "Sample Australia Cordex Data"
description: "dataset_description."
recipes:
  - id: recipe
    object: "recipe:recipe"
provenance:
  providers:
    - name: "carbonplan"
      description: "carbonplan_test_dataset"
      roles:
        - producer
        - licensor
      url: https://carbonplan.org/
  license: "CC-BY-NC v.4.0"
maintainers:
  - name: "Anderson Banihirwe"
    github: andersy005
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 11,
"id": "90275cba-9286-4cca-8dcf-62dbae62da09",
"metadata": {},
"outputs": [],
"source": [
"import fsspec\n",
"import xarray as xr\n",
"import upath\n",
"import yaml\n",
"from collections import defaultdict\n",
"import json"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b9e1934e-3f3c-4dee-985c-e94f82282ca0",
"metadata": {},
"outputs": [],
"source": [
"bucket = 's3://carbonplan-data-viewer/demo/ncview-2.0'\n",
"fs = fsspec.filesystem('s3')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "cf60cff6-a71d-430d-9d66-80f29f1e50e4",
"metadata": {},
"outputs": [],
"source": [
"def find_zarr_stores(fs, bucket):\n",
"    \"\"\"Return 's3://' URLs of directories whose path contains 'zarr'.\n",
"\n",
"    Only the first and second directory levels below ``bucket`` are searched.\n",
"    A first-level directory that matches is recorded and not descended into.\n",
"    \"\"\"\n",
"    stores = []\n",
"\n",
"    # Process the first level\n",
"    for p in fs.ls(bucket, detail=True):\n",
"        if p['type'] == 'directory' and 'zarr' in p['name']:\n",
"            stores.append('s3://' + p['name'])\n",
"        elif p['type'] == 'directory':\n",
"            # Process the second level\n",
"            for d in fs.ls(p['name'], detail=True):\n",
"                if d['type'] == 'directory' and 'zarr' in d['name']:\n",
"                    stores.append('s3://' + d['name'])\n",
"\n",
"    return stores"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "f6f73778-1f6d-4bbc-8bba-bae7a482920f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['s3://carbonplan-data-viewer/demo/ncview-2.0/ScenarioMIP.CCCma.CanESM5.ssp245.r1i1p1f1.annual.GARD-SV.pr.zarr',\n",
" 's3://carbonplan-data-viewer/demo/ncview-2.0/ScenarioMIP.CCCma.CanESM5.ssp245.r1i1p1f1.annual.GARD-SV.tasmax.zarr',\n",
" 's3://carbonplan-data-viewer/demo/ncview-2.0/dryspells_corn/CanESM5-ssp370-full-time-extent.zarr',\n",
" 's3://carbonplan-data-viewer/demo/ncview-2.0/single_timestep/sample_australia_cordex_data.zarr',\n",
" 's3://carbonplan-data-viewer/demo/ncview-2.0/test_dataset1.zarr',\n",
" 's3://carbonplan-data-viewer/demo/ncview-2.0/test_dataset2.zarr',\n",
" 's3://carbonplan-data-viewer/demo/ncview-2.0/test_dataset3.zarr']"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stores = find_zarr_stores(fs, bucket)\n",
"stores"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "924e572b-d1ec-4440-be3c-5bd7111af94d",
"metadata": {},
"outputs": [],
"source": [
"import pydantic\n",
"\n",
"\n",
"class Store(pydantic.BaseModel):\n",
"    \"\"\"A named data store and its URL.\"\"\"\n",
"\n",
"    name: str = pydantic.Field(..., description='Name of the store')\n",
"    url: str = pydantic.Field(..., description='URL of the store')\n",
"\n",
"\n",
"class Link(pydantic.BaseModel):\n",
"    \"\"\"A labelled link.\"\"\"\n",
"\n",
"    label: str = pydantic.Field(..., description='Label of the link')\n",
"    url: str = pydantic.Field(..., description='URL of the link')\n",
"\n",
"\n",
"class LicenseLink(pydantic.BaseModel):\n",
"    \"\"\"A license name with an optional URL.\"\"\"\n",
"\n",
"    title: str = pydantic.Field(..., description='Name of the license')\n",
"    url: str | None = pydantic.Field(None, description='URL of the license')\n",
"\n",
"\n",
"class Maintainer(pydantic.BaseModel):\n",
"    \"\"\"A maintainer, optionally with a GitHub username.\"\"\"\n",
"\n",
"    name: str = pydantic.Field(..., description='Name of the maintainer')\n",
"    github: str | None = pydantic.Field(\n",
"        None, description='GitHub username of the maintainer'\n",
"    )\n",
"\n",
"\n",
"class Provider(pydantic.BaseModel):\n",
"    \"\"\"A data provider with optional roles and URL.\"\"\"\n",
"\n",
"    name: str = pydantic.Field(..., description='Name of the provider')\n",
"    description: str = pydantic.Field(..., description='Description of the provider')\n",
"    roles: list[str] | None = pydantic.Field(None, description='Roles of the provider')\n",
"    url: str | None = pydantic.Field(None, description='URL of the provider')\n",
"\n",
"\n",
"class Provenance(pydantic.BaseModel):\n",
"    \"\"\"Providers and license information.\"\"\"\n",
"\n",
"    providers: list[Provider]\n",
"    license: str\n",
"    license_link: LicenseLink | None = None\n",
"\n",
"\n",
"class Feedstock(pydantic.BaseModel, validate_assignment=True):\n",
"    \"\"\"Feedstock metadata record; fields are re-validated on assignment.\"\"\"\n",
"\n",
"    title: str = pydantic.Field(..., description='Title of the feedstock')\n",
"    description: str = pydantic.Field(..., description='Description of the feedstock')\n",
"    maintainers: list[Maintainer]\n",
"    provenance: Provenance\n",
"    thumbnail: pydantic.HttpUrl | None = pydantic.Field(\n",
"        None, description='Thumbnail of the feedstock'\n",
"    )\n",
"    tags: list[str] | None = pydantic.Field(None, description='Tags of the dataset')\n",
"    links: list[Link] | None = None\n",
"    stores: list[Store] | None = None\n",
"    doi_citation: pydantic.HttpUrl | None = None\n",
"    demo: bool = pydantic.Field(False, description='Whether the dataset is a demo dataset')\n",
"\n",
"\n",
"def convert_to_raw_github_url(github_url):\n",
"    \"\"\"Turn a github.com '/blob/' file URL into its raw.githubusercontent.com form.\n",
"\n",
"    URLs that already point at raw.githubusercontent.com are returned unchanged.\n",
"    \"\"\"\n",
"    # Check if the URL is already a raw URL\n",
"    if 'raw.githubusercontent.com' in github_url:\n",
"        return github_url\n",
"\n",
"    # Replace the domain (first occurrence only)\n",
"    raw_url = github_url.replace('github.com', 'raw.githubusercontent.com', 1)\n",
"\n",
"    # Drop only the first '/blob/' path segment, so that names which merely\n",
"    # start with 'blob' (e.g. '/blobstore/') are left intact.\n",
"    raw_url = raw_url.replace('/blob/', '/', 1)\n",
"\n",
"    return raw_url\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "fad6f60e-e3fd-4c99-a6ff-ba114c6cb1e9",
"metadata": {},
"outputs": [],
"source": [
"catalog = []\n",
"for store in stores:\n",
"    # The store's attributes carry the URL of the feedstock meta.yaml.\n",
"    ds = xr.open_dataset(store, engine='zarr', chunks={})\n",
"    meta_url = convert_to_raw_github_url(ds.attrs['pangeo-forge:meta_yaml_url'])\n",
"    # NOTE(review): this YAML is fetched from a remote URL; consider yaml.safe_load\n",
"    # if meta.yaml only ever contains plain mappings, lists and scalars.\n",
"    meta = yaml.load(upath.UPath(meta_url).read_text(), Loader=yaml.FullLoader)\n",
"    data = Feedstock.model_validate(meta)\n",
"    # Feedstock has validate_assignment=True, so this dict is validated as a Store.\n",
"    data.stores = [{'name': store.split('/')[-1], 'url': store}]\n",
"    catalog.append(data)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "020495b0-2a68-4485-94b8-ad0c5f9cec6d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Feedstock(title='CMIP6 Downscaled products', description='dataset_description.', maintainers=[Maintainer(name='Oriana Chegwidden', github='orianac')], provenance=Provenance(providers=[Provider(name='carbonplan', description='carbonplan_test_dataset', roles=['producer', 'licensor'], url='https://carbonplan.org/')], license='CC-BY-NC v.4.0', license_link=None), thumbnail=None, tags=None, links=None, stores=[Store(name='ScenarioMIP.CCCma.CanESM5.ssp245.r1i1p1f1.annual.GARD-SV.pr.zarr', url='s3://carbonplan-data-viewer/demo/ncview-2.0/ScenarioMIP.CCCma.CanESM5.ssp245.r1i1p1f1.annual.GARD-SV.pr.zarr')], doi_citation=None, demo=False),\n",
" Feedstock(title='CMIP6 Downscaled products', description='dataset_description.', maintainers=[Maintainer(name='Oriana Chegwidden', github='orianac')], provenance=Provenance(providers=[Provider(name='carbonplan', description='carbonplan_test_dataset', roles=['producer', 'licensor'], url='https://carbonplan.org/')], license='CC-BY-NC v.4.0', license_link=None), thumbnail=None, tags=None, links=None, stores=[Store(name='ScenarioMIP.CCCma.CanESM5.ssp245.r1i1p1f1.annual.GARD-SV.tasmax.zarr', url='s3://carbonplan-data-viewer/demo/ncview-2.0/ScenarioMIP.CCCma.CanESM5.ssp245.r1i1p1f1.annual.GARD-SV.tasmax.zarr')], doi_citation=None, demo=False),\n",
" Feedstock(title='Dry Spell Corn', description='dataset_description.', maintainers=[Maintainer(name='Oriana Chegwidden', github='orianac')], provenance=Provenance(providers=[Provider(name='carbonplan', description='carbonplan_test_dataset', roles=['producer', 'licensor'], url='https://carbonplan.org/')], license='CC-BY-NC v.4.0', license_link=None), thumbnail=None, tags=None, links=None, stores=[Store(name='CanESM5-ssp370-full-time-extent.zarr', url='s3://carbonplan-data-viewer/demo/ncview-2.0/dryspells_corn/CanESM5-ssp370-full-time-extent.zarr')], doi_citation=None, demo=False),\n",
" Feedstock(title='Sample Australia Cordex Data', description='dataset_description.', maintainers=[Maintainer(name='Anderson Banihirwe', github='andersy005')], provenance=Provenance(providers=[Provider(name='carbonplan', description='carbonplan_test_dataset', roles=['producer', 'licensor'], url='https://carbonplan.org/')], license='CC-BY-NC v.4.0', license_link=None), thumbnail=None, tags=None, links=None, stores=[Store(name='sample_australia_cordex_data.zarr', url='s3://carbonplan-data-viewer/demo/ncview-2.0/single_timestep/sample_australia_cordex_data.zarr')], doi_citation=None, demo=False),\n",
" Feedstock(title='dataset_1', description='dataset_description.', maintainers=[Maintainer(name='Raphael Hagen', github='norlandrhagen')], provenance=Provenance(providers=[Provider(name='carbonplan', description='carbonplan_test_dataset', roles=['producer', 'licensor'], url='https://dapds00.nci.org.au/thredds/catalogs/zv2/catalog.html')], license='CC-BY-NC v.4.0', license_link=None), thumbnail=None, tags=None, links=None, stores=[Store(name='test_dataset1.zarr', url='s3://carbonplan-data-viewer/demo/ncview-2.0/test_dataset1.zarr')], doi_citation=None, demo=False),\n",
" Feedstock(title='dataset_2', description='dataset_description.', maintainers=[Maintainer(name='Raphael Hagen', github='norlandrhagen')], provenance=Provenance(providers=[Provider(name='carbonplan', description='carbonplan_test_dataset', roles=['producer', 'licensor'], url='https://dapds00.nci.org.au/thredds/catalogs/zv2/catalog.html')], license='CC-BY-NC v.4.0', license_link=None), thumbnail=None, tags=None, links=None, stores=[Store(name='test_dataset2.zarr', url='s3://carbonplan-data-viewer/demo/ncview-2.0/test_dataset2.zarr')], doi_citation=None, demo=False),\n",
" Feedstock(title='dataset_3', description='dataset_description.', maintainers=[Maintainer(name='Raphael Hagen', github='norlandrhagen')], provenance=Provenance(providers=[Provider(name='carbonplan', description='carbonplan_test_dataset', roles=['producer', 'licensor'], url='https://dapds00.nci.org.au/thredds/catalogs/zv2/catalog.html')], license='CC-BY-NC v.4.0', license_link=None), thumbnail=None, tags=None, links=None, stores=[Store(name='test_dataset3.zarr', url='s3://carbonplan-data-viewer/demo/ncview-2.0/test_dataset3.zarr')], doi_citation=None, demo=False)]"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"catalog"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "7436cadd-bc26-4017-a4dd-5337c85ade57",
"metadata": {},
"outputs": [],
"source": [
"# Merge catalog entries that share a title into one entry listing all their stores.\n",
"consolidated_data = {}\n",
"for item in catalog:\n",
"    title = item.title\n",
"    if title not in consolidated_data:\n",
"        # First time this title is seen: keep a deep copy so that extending\n",
"        # its 'stores' below does not also mutate the entry held in `catalog`.\n",
"        consolidated_data[title] = item.model_copy(deep=True)\n",
"    else:\n",
"        # Title already seen: append this item's stores to the existing entry.\n",
"        consolidated_data[title].stores.extend(item.stores or [])\n",
"consolidated_web_catalog = list(consolidated_data.values())"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "3061afa0-9848-4909-bf51-18dfb032abe2",
"metadata": {},
"outputs": [],
"source": [
"# write catalog to JSON file for use in the website\n",
"with open('consolidated-web-catalog.json', 'w') as f:\n",
"    # model_dump(mode='json') converts each Feedstock into JSON-serializable\n",
"    # builtins, so no custom `default=` encoder is needed.\n",
"    serializable = [entry.model_dump(mode='json') for entry in consolidated_web_catalog]\n",
"    json.dump(serializable, f, indent=2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8e9bdfd8-51e7-47e6-b170-c3595cd734c6",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment