Created July 10, 2015 03:22
Save darothen/0ef1d87c67de69409d19 to your computer and use it in GitHub Desktop.
xray + dask random data test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook re-implements the [xray + dask out-of-core calculation example](http://eng.climate.com/2015/06/11/xray-dask-out-of-core-labeled-arrays-in-python/), substituting randomly generated data for ERA reanalysis 2m temperatures."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current conda install:\n",
      "\n",
      " platform : linux-64\n",
      " conda version : 3.14.1\n",
      " conda-build version : not installed\n",
      " python version : 2.7.10.final.0\n",
      " requests version : 2.7.0\n",
      " root environment : /home/darothen/miniconda (writable)\n",
      " default environment : /home/darothen/miniconda/envs/xray_dask_test\n",
      " envs directories : /home/darothen/miniconda/envs\n",
      " package cache : /home/darothen/miniconda/pkgs\n",
      " channel URLs : https://conda.anaconda.org/r/linux-64/\n",
      " https://conda.anaconda.org/r/noarch/\n",
      " https://repo.continuum.io/pkgs/free/linux-64/\n",
      " https://repo.continuum.io/pkgs/free/noarch/\n",
      " https://repo.continuum.io/pkgs/pro/linux-64/\n",
      " https://repo.continuum.io/pkgs/pro/noarch/\n",
      " config file : /home/darothen/.condarc\n",
      " is foreign system : False\n",
      "\n",
      "# conda environments:\n",
      "#\n",
      "xray_dask_test * /home/darothen/miniconda/envs/xray_dask_test\n",
      "root /home/darothen/miniconda\n",
      "\n",
      "\n",
      "numpy: 1.9.2\n",
      "pandas: 0.16.2\n",
      "xray: 0.5.1\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import xray\n",
    "\n",
    "import os\n",
    "from collections import OrderedDict\n",
    "\n",
    "SAVE_DIR = \"/storage02/darothen/ts_data\"\n",
    "\n",
    "!conda info\n",
    "!conda info -e\n",
    "\n",
    "print \n",
    "print \"numpy:\", np.version.full_version\n",
    "print \"pandas:\", pd.version.version\n",
    "print \"xray:\", xray.version.version"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total samples to generate: 1913651200\n",
      " (time samples: 14600)\n"
     ]
    }
   ],
   "source": [
    "# Shape of data to generate 10 years of 6-hourly data\n",
    "n_files = 10\n",
    "data_shape = (365*4, 256, 512)\n",
    "date_func = lambda i: pd.date_range(\"%4d-01-01\" % (1960+i), periods=365*4, freq=\"6H\")\n",
    "total_samples = reduce(lambda a, b: a*b, data_shape) * n_files\n",
    "\n",
    "print \"Total samples to generate: %d\" % total_samples\n",
    "print \" (time samples: %d)\" % (data_shape[0]*n_files, )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Convenience function for creating the random data. By default, will save netCDF4 files where each is essentially a copy of the first one."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "if not os.path.exists(SAVE_DIR):\n",
    "    os.mkdir(SAVE_DIR)\n",
    "else:\n",
    "    print \"Cleaning save directory...\"\n",
    "    !rm -f {SAVE_DIR}/*.nc\n",
    "\n",
    "def create_datafiles(n_files=n_files, data_shape=data_shape,\n",
    "                     dates=True, repeat_data=True, format='netcdf4'):\n",
    "    print \"Creating %d files with data of\" \\\n",
    "          \" shape %r\" % (n_files, data_shape)\n",
    "    print \"Generating initial data...\",\n",
    "    data_base = np.random.randint(-10., 120., data_shape)\n",
    "    print \" done.\"\n",
    "\n",
    "    coords = OrderedDict()\n",
    "    coord_labels = ['time', 'x', 'y']\n",
    "    for key, n in zip(coord_labels, data_shape):\n",
    "        coords[key] = ([key, ], np.arange(n))\n",
    "\n",
    "    if repeat_data:\n",
    "        data = data_base + np.random.randn(*data_shape)\n",
    "\n",
    "    for i in xrange(1, n_files+1):\n",
    "        print \"File %03d:\" % i\n",
    "        print \" creating data\"\n",
    "        if not repeat_data:\n",
    "            data = data_base + np.random.randn(*data_shape)\n",
    "\n",
    "        # Increment the timesteps\n",
    "        time = coords['time'][1]\n",
    "        time += len(time) * (i - 1)\n",
    "        if dates:\n",
    "            time = date_func(i)\n",
    "        coords['time'] = (['time', ], time)\n",
    "\n",
    "        print \" constructing DataSet\"\n",
    "        ds = xray.Dataset({'temperature': (coord_labels, data)},\n",
    "                          coords)\n",
    "\n",
    "        fn = \"ts_%03d.nc\" % i\n",
    "        print \" writing to file -> %s...\" % fn,\n",
    "        ds.to_netcdf(os.path.join(SAVE_DIR, fn), 'w',\n",
    "                     format=format)\n",
    "        print \"done.\"\n",
    "\n",
    "    file_size = ds.nbytes * (2 ** -30)\n",
    "    total_size = file_size * n_files\n",
    "\n",
    "    print \"\\n... finished!\"\n",
    "    print \"Created a %d-file dataset of size\" % n_files + \\\n",
    "          \" %3.1fGB/file (%3.1f GB total)\" % (file_size, total_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Call the data creation routine.\n",
    "#create_datafiles()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "\n",
    "Open the dataset we just generated"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " n_chunks: 10\n",
      "chunk_size: 1460\n",
      " ts/file: 1460\n",
      "samples per core: 191365120\n",
      "\n",
      "chunked array shape: ( 1460 256 512 )\n"
     ]
    }
   ],
   "source": [
    "# Compute the chunk size, given the number of cores\n",
    "n_chunks = n_files\n",
    "time_samples = n_files*data_shape[0]\n",
    "chunk_size = int(1 * time_samples / n_chunks)\n",
    "print \" n_chunks:\", n_chunks\n",
    "print \"chunk_size:\", chunk_size\n",
    "print \" ts/file:\", data_shape[0]\n",
    "print \"samples per core:\", chunk_size*data_shape[1]*data_shape[2]\n",
    "print\n",
    "print \"chunked array shape: (\", chunk_size, data_shape[1], data_shape[2], \")\" "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<xray.Dataset>\n",
      "Dimensions: (time: 13140, x: 256, y: 512)\n",
      "Coordinates:\n",
      " * x (x) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ...\n",
      " * y (y) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ...\n",
      " * time (time) datetime64[ns] 1961-01-01 1961-01-01T06:00:00 ...\n",
      "Data variables:\n",
      " temperature (time, x, y) float64 35.81 18.5 55.61 112.3 91.38 -5.46 ...\n",
      "Frozen(SortedKeysDict({u'y': (512,), u'x': (256,), u'time': (1460, 1460, 1460, 1460, 1460, 1460, 1460, 1460, 1460)}))\n"
     ]
    }
   ],
   "source": [
    "ds = xray.open_mfdataset(os.path.join(SAVE_DIR, \"ts_00*.nc\"), engine='h5netcdf')\n",
    "print repr(ds)\n",
    "print ds.chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 25.1 s, sys: 26.4 s, total: 51.5 s\n",
      "Wall time: 18.2 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "54.502895280093526"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Timing of mean over all data\n",
    "%time float(ds.temperature.mean())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment