Created
April 30, 2018 21:27
-
-
Save JiaweiZhuang/f80eade9cca9709b4cd549f38247ed97 to your computer and use it in GitHub Desktop.
I/O comparison of NetCDF and Zarr, using NASA-NEX data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# NetCDF vs Zarr performance" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Dependencies:\n", | |
"```python\n", | |
"conda install -c conda-forge jupyterlab xarray dask netcdf4 zarr\n", | |
"pip install awscli\n", | |
"```" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"import shutil\n", | |
"\n", | |
"import xarray as xr\n", | |
"from dask.diagnostics import ProgressBar" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"download: s3://nasanex/NEX-GDDP/BCSD/rcp85/day/atmos/tasmax/r1i1p1/v1.0/tasmax_day_BCSD_rcp85_r1i1p1_inmcm4_2100.nc to ./tasmax_day_BCSD_rcp85_r1i1p1_inmcm4_2100.nc\n" | |
] | |
} | |
], | |
"source": [ | |
"%%bash\n", | |
"if [ -e tasmax_day_BCSD_rcp85_r1i1p1_inmcm4_2100.nc ]; then\n", | |
" echo \"File exists\"\n", | |
"else \n", | |
" aws s3 cp s3://nasanex/NEX-GDDP/BCSD/rcp85/day/atmos/tasmax/r1i1p1/v1.0/tasmax_day_BCSD_rcp85_r1i1p1_inmcm4_2100.nc ./\n", | |
"fi " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ds_origin = xr.open_dataset('./tasmax_day_BCSD_rcp85_r1i1p1_inmcm4_2100.nc', chunks={'time': 20})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1.51373956" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ds_origin.nbytes / 1e9 # GB" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[########################################] | 100% Completed | 11.7s\n", | |
"CPU times: user 10.5 s, sys: 1.75 s, total: 12.3 s\n", | |
"Wall time: 12 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"with ProgressBar():\n", | |
" ds_origin.load() # throughput = 1.5G/12s = 125 MB/s, haven't hit SSD limit" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 2.34 s, sys: 848 ms, total: 3.18 s\n", | |
"Wall time: 2.26 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# convert NetCDF to zarr\n", | |
"file_zarr = './tasmax.zarr'\n", | |
"\n", | |
"# a zarr \"file\" is simply a directory with many files.\n", | |
"# remove the folder if it is already exists\n", | |
"if os.path.isdir(file_zarr):\n", | |
" shutil.rmtree(file_zarr)\n", | |
" \n", | |
"ds_origin.to_zarr('./tasmax.zarr') # will throw an error if the file exists" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ds_zarr = xr.open_zarr('./tasmax.zarr')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[########################################] | 100% Completed | 1.7s\n", | |
"CPU times: user 4.68 s, sys: 2.37 s, total: 7.05 s\n", | |
"Wall time: 2.51 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"with ProgressBar():\n", | |
" ds_zarr.load() # throughput = 1.5G/2.5s = 600 MB/s, already hit SSD limit" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"From https://aws.amazon.com/ebs/details/, the maximum throughput of EBS SSD volume is 500 MB/s, so it is impossible to go faster than that." | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment