xray + dask random data test
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook re-implements the [xray + dask out-of-core calculation example](http://eng.climate.com/2015/06/11/xray-dask-out-of-core-labeled-arrays-in-python/), substituting randomly generated data for ERA reanalysis 2m temperatures."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Current conda install:\n",
"\n",
" platform : linux-64\n",
" conda version : 3.14.1\n",
" conda-build version : not installed\n",
" python version : 2.7.10.final.0\n",
" requests version : 2.7.0\n",
" root environment : /home/darothen/miniconda (writable)\n",
" default environment : /home/darothen/miniconda/envs/xray_dask_test\n",
" envs directories : /home/darothen/miniconda/envs\n",
" package cache : /home/darothen/miniconda/pkgs\n",
" channel URLs : https://conda.anaconda.org/r/linux-64/\n",
" https://conda.anaconda.org/r/noarch/\n",
" https://repo.continuum.io/pkgs/free/linux-64/\n",
" https://repo.continuum.io/pkgs/free/noarch/\n",
" https://repo.continuum.io/pkgs/pro/linux-64/\n",
" https://repo.continuum.io/pkgs/pro/noarch/\n",
" config file : /home/darothen/.condarc\n",
" is foreign system : False\n",
"\n",
"# conda environments:\n",
"#\n",
"xray_dask_test * /home/darothen/miniconda/envs/xray_dask_test\n",
"root /home/darothen/miniconda\n",
"\n",
"\n",
"numpy: 1.9.2\n",
"pandas: 0.16.2\n",
"xray: 0.5.1\n"
]
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import xray\n",
"\n",
"import os\n",
"from collections import OrderedDict\n",
"\n",
"SAVE_DIR = \"/storage02/darothen/ts_data\"\n",
"\n",
"!conda info\n",
"!conda info -e\n",
"\n",
"print \n",
"print \"numpy:\", np.version.full_version\n",
"print \"pandas:\", pd.version.version\n",
"print \"xray:\", xray.version.version"
]
},
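{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reproducibility: an environment comparable to the one reported above could hypothetically be recreated along the following lines. The install route and channel availability are assumptions, not a record of how this environment was actually built."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Hypothetical recipe for a comparable environment; the version pins\n",
"# mirror the report above, but the install route is an assumption\n",
"# !conda create -n xray_dask_test python=2.7 numpy=1.9.2 pandas=0.16.2\n",
"# !pip install xray==0.5.1 dask h5netcdf"
]
},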
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total samples to generate: 1913651200\n",
" (time samples: 14600)\n"
]
}
],
"source": [
"# Shape of data to generate 32 years of 6-hourly data\n",
"n_files = 10\n",
"data_shape = (365*4, 256, 512)\n",
"date_func = lambda i: pd.date_range(\"%4d-01-01\" % (1960+i), periods=365*4, freq=\"6H\")\n",
"total_samples = reduce(lambda a, b: a*b, data_shape) * n_files\n",
"\n",
"print \"Total samples to generate: %d\" % total_samples\n",
"print \" (time samples: %d)\" % (data_shape[0]*n_files, )"
]
},
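{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick back-of-the-envelope check on the volume this implies, assuming 8 bytes per sample (`float64`, which is what the generator below produces); on-disk netCDF sizes will differ slightly because of headers and coordinate variables."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Estimated raw volume, assuming float64 (8 bytes) per sample\n",
"bytes_per_sample = 8\n",
"est_gb = total_samples * bytes_per_sample * (2. ** -30)\n",
"print \"Estimated raw volume: %.1f GB total (%.1f GB/file)\" % \\\n",
"    (est_gb, est_gb / n_files)"
]
},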
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Convenienece function for creating the random data. By default, will save netCDF4 files where each is essentially a copy of the first one."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"if not os.path.exists(SAVE_DIR):\n",
" os.mkdir(SAVE_DIR)\n",
"else:\n",
" print \"Cleaning save directory...\"\n",
" !rm -f {SAVE_DIR}/*.nc\n",
"\n",
"def create_datafiles(n_files=n_files, data_shape=data_shape, \n",
" dates=True, repeat_data=True, format='netcdf4'):\n",
" print \"Creating %d files with data of\" \\\n",
" \" shape %r\" % (n_files, data_shape)\n",
" print \"Generating initial data...\",\n",
" data_base = np.random.randint(-10., 120., data_shape)\n",
" print \" done.\"\n",
" \n",
" coords = OrderedDict()\n",
" coord_labels = ['time', 'x', 'y']\n",
" for key, n in zip(coord_labels, data_shape):\n",
" coords[key] = ([key, ], np.arange(n))\n",
" \n",
" if repeat_data:\n",
" data = data_base + np.random.randn(*data_shape)\n",
"\n",
" for i in xrange(1, n_files+1):\n",
" print \"File %03d:\" % i\n",
" print \" creating data\"\n",
" if not repeat_data:\n",
" data = data_base + np.random.randn(*data_shape)\n",
" \n",
" # Increment the timesteps\n",
" time = coords['time'][1]\n",
" time += len(time) * (i - 1)\n",
" if dates:\n",
" time = date_func(i)\n",
" coords['time'] = (['time', ], time)\n",
" \n",
" print \" constructing DataSet\"\n",
" ds = xray.Dataset({'temperature': (coord_labels, data)}, \n",
" coords)\n",
" \n",
" fn = \"ts_%03d.nc\" % i\n",
" print \" writing to file -> %s...\" % fn,\n",
" ds.to_netcdf(os.path.join(SAVE_DIR, fn), 'w',\n",
" format=format)\n",
" print \"done.\"\n",
" \n",
" file_size = ds.nbytes * (2 ** -30)\n",
" total_size = file_size * n_files\n",
" \n",
" print \"\\n... finished!\"\n",
" print \"Created a %d-file dataset of size\" % n_files + \\\n",
" \" %3.1fGB/file (%3.1f GB total)\" % (file_size, total_size)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Call the data creation routine.\n",
"#create_datafiles()"
]
},
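{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the keyword arguments defined above allow a few variations; these are left commented, like the call above, since regenerating ~14 GB of files takes a while."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Unique random data in every file instead of repeated copies\n",
"# create_datafiles(repeat_data=False)\n",
"\n",
"# Integer time coordinate instead of datetime64 timestamps\n",
"# create_datafiles(dates=False)"
]
},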
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"\n",
"Open the dataset we just generated"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" n_chunks: 10\n",
"chunk_size: 1460\n",
" ts/file: 1460\n",
"samples per core: 191365120\n",
"\n",
"chunked array shape: ( 1460 256 512 )\n"
]
}
],
"source": [
"# Compute the chunk size, given the number of cores\n",
"n_chunks = n_files\n",
"time_samples = n_files*data_shape[0]\n",
"chunk_size = int(1 * time_samples / n_chunks)\n",
"print \" n_chunks:\", n_chunks\n",
"print \"chunk_size:\", chunk_size\n",
"print \" ts/file:\", data_shape[0]\n",
"print \"samples per core:\", chunk_size*data_shape[1]*data_shape[2]\n",
"print\n",
"print \"chunked array shape: (\", chunk_size, data_shape[1], data_shape[2], \")\" "
]
},
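{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a cross-check, the chunked layout we expect can be sketched directly with `dask.array` (which must be importable here anyway for `open_mfdataset` below to work). `da.ones` builds a lazy array, so nothing is materialized by this cell."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Sketch of the expected chunked-array layout, built directly with dask;\n",
"# no data is allocated until the array is actually computed\n",
"import dask.array as da\n",
"arr = da.ones((time_samples, data_shape[1], data_shape[2]),\n",
"              chunks=(chunk_size, data_shape[1], data_shape[2]))\n",
"print arr.chunks"
]
},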
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<xray.Dataset>\n",
"Dimensions: (time: 13140, x: 256, y: 512)\n",
"Coordinates:\n",
" * x (x) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ...\n",
" * y (y) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ...\n",
" * time (time) datetime64[ns] 1961-01-01 1961-01-01T06:00:00 ...\n",
"Data variables:\n",
" temperature (time, x, y) float64 35.81 18.5 55.61 112.3 91.38 -5.46 ...\n",
"Frozen(SortedKeysDict({u'y': (512,), u'x': (256,), u'time': (1460, 1460, 1460, 1460, 1460, 1460, 1460, 1460, 1460)}))\n"
]
}
],
"source": [
"ds = xray.open_mfdataset(os.path.join(SAVE_DIR, \"ts_00*.nc\"), engine='h5netcdf')\n",
"print repr(ds)\n",
"print ds.chunks"
]
},
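{
"cell_type": "markdown",
"metadata": {},
"source": [
"Nothing has been read from disk yet: `open_mfdataset` wraps each file in a dask array, and operations only extend the task graph until values are requested. A sketch of some deferred operations; `isel` is standard xray, while `Dataset.chunk` is assumed here from the dask support added in xray 0.5."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# These only build dask graphs; no data is loaded until values are\n",
"# requested (e.g. via float(), .values, or .load())\n",
"point_mean = ds.temperature.isel(x=0, y=0).mean()\n",
"\n",
"# Re-block along time (assumes Dataset.chunk from xray >= 0.5)\n",
"ds2 = ds.chunk({'time': chunk_size})\n",
"print ds2.chunks"
]
},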
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 25.1 s, sys: 26.4 s, total: 51.5 s\n",
"Wall time: 18.2 s\n"
]
},
{
"data": {
"text/plain": [
"54.502895280093526"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Timing of mean over all data\n",
"%time float(ds.temperature.mean())"
]
}
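,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For a more interesting out-of-core reduction than a global mean, a monthly climatology via `groupby` would look like the sketch below. This was not timed as part of this test, and it assumes grouped reductions stream through dask in xray 0.5 as the release notes suggest."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Monthly climatology of the random 'temperature' field; the groupby is\n",
"# lazy, and .values triggers the actual out-of-core computation\n",
"clim = ds.temperature.groupby('time.month').mean('time')\n",
"%time clim_values = clim.values"
]
}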
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 0
}