Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save rsignell-usgs/5971951d348496229ce121b52a2fb750 to your computer and use it in GitHub Desktop.
Save rsignell-usgs/5971951d348496229ce121b52a2fb750 to your computer and use it in GitHub Desktop.
coawst_open_data_create_refs-Copy1.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "9e609a1f-6b8c-4fab-9ad5-848c6ecc30fc",
"metadata": {},
"source": [
"# Explore kerchunk issue with single value int32 variables"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "outside-mayor",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import fsspec\n",
"import xarray as xr\n",
"\n",
"import kerchunk\n",
"from kerchunk.hdf import SingleHdf5ToZarr\n",
"\n",
"from pathlib import Path\n",
"import numpy as np\n",
"import ujson"
]
},
{
"cell_type": "markdown",
"id": "7a037051-26d7-4c2a-b5ed-6041c57df508",
"metadata": {},
"source": [
"It is unclear why the latest kerchunk release from conda-forge reports its version as `0.0.0`:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "3a843653-81b8-4953-ad9c-a746015873c9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0.0\n"
]
}
],
"source": [
"print(kerchunk.__version__) "
]
},
{
"cell_type": "markdown",
"id": "0f6e8365-aa8a-4d06-8661-129855593fda",
"metadata": {},
"source": [
"We can read from AWS Open Data using `anon=True`:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "endangered-therapist",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"fs_read = fsspec.filesystem('s3', anon=True, skip_instance_cache=True, use_listings_cache=False )"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "10ec51d5-9a31-4215-8b68-b52147749895",
"metadata": {},
"outputs": [],
"source": [
"fs_local = fsspec.filesystem('file')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "packed-lightning",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"flist = fs_read.glob('s3://usgs-coawst/useast-archive/*.nc')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "destroyed-abortion",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Add the 's3://' prefix only where it is missing, so re-running this cell\n",
"# does not produce 's3://s3://...' paths.\n",
"flist = [f if f.startswith('s3://') else f's3://{f}' for f in flist]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "2faf3423-1051-4f67-b038-de107ab0e43f",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"ds_netcdf = xr.open_dataset(fs_read.open(flist[0]), chunks={})"
]
},
{
"cell_type": "markdown",
"id": "3cdec8e4-b4c2-46cc-b156-b0938d55ff9b",
"metadata": {},
"source": [
"#### When we open with NetCDF, the single value variable \"spherical\" is type `int32`:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "3e59777e-1219-4a9a-8c07-12a75f650ced",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dtype('int32')"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ds_netcdf.spherical.dtype"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "separated-timber",
"metadata": {},
"outputs": [],
"source": [
"so = dict(mode='rb', anon=True, skip_instance_cache=True)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "07ca55e9-a12c-4eb5-bed8-31a3c065b4d0",
"metadata": {},
"outputs": [],
"source": [
"def gen_json_local(u):\n",
"    \"\"\"Generate kerchunk references for one remote file and save them as local JSON.\n",
"\n",
"    Opens `u` through `fs_read` (using the options in `so`), translates it with\n",
"    `SingleHdf5ToZarr`, and writes the resulting references with `fs_local` to\n",
"    the relative path `<stem of u>.json`.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    u : str\n",
"        URL of the remote file, e.g. an entry of `flist`.\n",
"\n",
"    Returns\n",
"    -------\n",
"    str\n",
"        Name of the JSON reference file that was written.\n",
"    \"\"\"\n",
"    outf = f'{Path(u).stem}.json'\n",
"    with fs_read.open(u, **so) as infile:\n",
"        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)\n",
"        # translate() is called inside this block so `infile` is still open\n",
"        refs = h5chunks.translate()\n",
"    with fs_local.open(outf, 'wb') as f:\n",
"        f.write(ujson.dumps(refs).encode())\n",
"    return outf"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "c5512b0a-d6f6-4b68-a678-28f0a4f61d3b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 6.83 s, sys: 1.99 s, total: 8.82 s\n",
"Wall time: 1min 17s\n"
]
},
{
"data": {
"text/plain": [
"'coawst_2009-08-21_0000.json'"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"gen_json_local(flist[0])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "4eee9c1a-d285-4e3b-afef-844ab10e2389",
"metadata": {},
"outputs": [],
"source": [
"single_json = 'coawst_2009-08-21_0000.json'"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "3dc9cce7-f13f-4630-bd92-f70c0e7a99e2",
"metadata": {},
"outputs": [],
"source": [
"ds_kerchunk = xr.open_dataset(single_json, engine=\"kerchunk\", chunks={},\n",
" storage_options=dict(remote_protocol='s3', remote_options=dict(anon=True)))"
]
},
{
"cell_type": "markdown",
"id": "d0807485-b950-4488-b02e-85f6eeab4419",
"metadata": {},
"source": [
"#### When we open with Kerchunk, the single-value variable \"spherical\" is type `float64` instead of `int32`:"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "fec7c92a-3808-4ebb-bbf3-7b819080e4ef",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"float64\n"
]
}
],
"source": [
"print(ds_kerchunk.spherical.dtype)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "pangeo:Python",
"language": "python",
"name": "conda-env-pangeo-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"state": {},
"version_major": 2,
"version_minor": 0
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment