Created
February 20, 2019 20:15
-
-
Save scottyhq/790bf19c7811b5c6243ce37aae252ca1 to your computer and use it in GitHub Desktop.
experimenting with remote HDF5 files in xarray
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Load HDF5 data w/ xarray\n", | |
"\n", | |
"A lot of NASA data is stored in HDF5 format. One example is data from the Sentinel-1 radar mission, which will be similar to the upcoming NISAR mission.\n", | |
"\n", | |
"https://aria.jpl.nasa.gov/node/97\n", | |
"\n", | |
"This notebook illustrates loading the data into xarray for analysis" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import xarray as xr\n", | |
"import h5netcdf\n", | |
"import rasterio\n", | |
"import h5py\n", | |
"import gcsfs\n", | |
"import os" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0.11.1+64.g612d390f.dirty\n", | |
"0.6.2\n", | |
"1.0.18\n", | |
"2.9.0\n", | |
"0.2.0\n" | |
] | |
} | |
], | |
"source": [ | |
"print(xr.__version__)\n", | |
"print(h5netcdf.__version__)\n", | |
"print(rasterio.__version__)\n", | |
"print(h5py.__version__)\n", | |
"print(gcsfs.__version__)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Copied data from ASF to Google Storage bucket\n", | |
"fs = gcsfs.GCSFileSystem()\n", | |
"images = fs.ls('pangeo-data/grfn-v2/137/')\n", | |
"fileObj = fs.open('pangeo-data/grfn-v2/137/S1-GUNW-A-R-137-tops-20181129_20181123-020010-43220N_41518N-PP-e2c7-v2_0_0.nc')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Option 1) Copy file locally and read w/ xarray\n", | |
"gsPath = 'pangeo-data/grfn-v2/137/S1-GUNW-A-R-137-tops-20181129_20181123-020010-43220N_41518N-PP-e2c7-v2_0_0.nc'\n", | |
"localPath = os.path.basename(gsPath)\n", | |
"#fs.get(gsPath, localPath)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Data arrays are stored in \"science group\"\n", | |
"#da = xr.open_dataset(localPath, group='/science/grids/data', engine='h5netcdf')\n", | |
"#da " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Seems that h5py >2.9.0 can handle file-like-objects:\n", | |
"# https://github.com/h5py/h5py/pull/1105\n", | |
"h5 = h5py.File(fileObj, 'r')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<KeysViewHDF5 ['product_type', 'Conventions', 'title', 'version', 'author', 'institution', 'source', 'references', 'ogr_geometry_field', 'ogr_layer_name', 'ogr_layer_type']>\n", | |
"ItemsViewHDF5(<HDF5 group \"/science/grids/data\" (7 members)>)\n", | |
"(682, 1386)\n" | |
] | |
} | |
], | |
"source": [ | |
"# Works but slow from home wifi (likely due to issues w/ number of network requests required)\n", | |
"# http://matthewrocklin.com/blog/work/2018/02/06/hdf-in-the-cloud\n", | |
"print(h5.attrs.keys())\n", | |
"ds = h5['science/grids/data']\n", | |
"print(ds.items())\n", | |
"print(ds['coherence'].chunks)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<xarray.Dataset>\n", | |
"Dimensions: (latitude: 2045, longitude: 4158)\n", | |
"Coordinates:\n", | |
" * longitude (longitude) float64 -123.1 -123.1 ... -119.6 -119.6\n", | |
" * latitude (latitude) float64 43.22 43.22 43.22 ... 41.52 41.52\n", | |
"Data variables:\n", | |
" crs int32 ...\n", | |
" unwrappedPhase (latitude, longitude) float32 ...\n", | |
" coherence (latitude, longitude) float32 ...\n", | |
" connectedComponents (latitude, longitude) float32 ...\n", | |
" amplitude (latitude, longitude) float32 ..." | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# but, can we open this w/ xarray anyway? Yes! with modifications to xarray and h5netcdf\n", | |
"ds = xr.open_dataset(fileObj, group='/science/grids/data', engine='h5netcdf')\n", | |
"ds " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<xarray.Dataset>\n", | |
"Dimensions: (latitude: 2045, longitude: 4158)\n", | |
"Coordinates:\n", | |
" * longitude (longitude) float64 -123.1 -123.1 ... -119.6 -119.6\n", | |
" * latitude (latitude) float64 43.22 43.22 43.22 ... 41.52 41.52\n", | |
"Data variables:\n", | |
" crs int32 ...\n", | |
" unwrappedPhase (latitude, longitude) float32 dask.array<shape=(2045, 4158), chunksize=(682, 1386)>\n", | |
" coherence (latitude, longitude) float32 dask.array<shape=(2045, 4158), chunksize=(682, 1386)>\n", | |
" connectedComponents (latitude, longitude) float32 dask.array<shape=(2045, 4158), chunksize=(682, 1386)>\n", | |
" amplitude (latitude, longitude) float32 dask.array<shape=(2045, 4158), chunksize=(682, 1386)>" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Try as Dask array\n", | |
"ds = xr.open_dataset(fileObj, group='/science/grids/data', engine='h5netcdf',\n", | |
" chunks=dict(latitude=682, longitude=1386))\n", | |
"ds " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<matplotlib.image.AxesImage at 0x111a44390>" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ds['coherence'].plot.imshow()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.7" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment