Created July 10, 2015 03:22
Save darothen/0ef1d87c67de69409d19 to your computer and use it in GitHub Desktop.
xray + dask random data test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook re-implements the [xray + dask out-of-core calculation example](http://eng.climate.com/2015/06/11/xray-dask-out-of-core-labeled-arrays-in-python/), substituting randomly generated data for ERA reanalysis 2m temperatures."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current conda install:\n",
      "\n",
      " platform : linux-64\n",
      " conda version : 3.14.1\n",
      " conda-build version : not installed\n",
      " python version : 2.7.10.final.0\n",
      " requests version : 2.7.0\n",
      " root environment : /home/darothen/miniconda (writable)\n",
      " default environment : /home/darothen/miniconda/envs/xray_dask_test\n",
      " envs directories : /home/darothen/miniconda/envs\n",
      " package cache : /home/darothen/miniconda/pkgs\n",
      " channel URLs : https://conda.anaconda.org/r/linux-64/\n",
      " https://conda.anaconda.org/r/noarch/\n",
      " https://repo.continuum.io/pkgs/free/linux-64/\n",
      " https://repo.continuum.io/pkgs/free/noarch/\n",
      " https://repo.continuum.io/pkgs/pro/linux-64/\n",
      " https://repo.continuum.io/pkgs/pro/noarch/\n",
      " config file : /home/darothen/.condarc\n",
      " is foreign system : False\n",
      "\n",
      "# conda environments:\n",
      "#\n",
      "xray_dask_test * /home/darothen/miniconda/envs/xray_dask_test\n",
      "root /home/darothen/miniconda\n",
      "\n",
      "\n",
      "numpy: 1.9.2\n",
      "pandas: 0.16.2\n",
      "xray: 0.5.1\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import xray\n",
    "\n",
    "import os\n",
    "from collections import OrderedDict\n",
    "\n",
    "SAVE_DIR = \"/storage02/darothen/ts_data\"\n",
    "\n",
    "!conda info\n",
    "!conda info -e\n",
    "\n",
    "print \n",
    "print \"numpy:\", np.version.full_version\n",
    "print \"pandas:\", pd.version.version\n",
    "print \"xray:\", xray.version.version"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total samples to generate: 1913651200\n",
      " (time samples: 14600)\n"
     ]
    }
   ],
   "source": [
    "# Shape of data to generate 10 years of 6-hourly data\n",
    "n_files = 10\n",
    "data_shape = (365*4, 256, 512)\n",
    "date_func = lambda i: pd.date_range(\"%4d-01-01\" % (1960+i), periods=365*4, freq=\"6H\")\n",
    "total_samples = reduce(lambda a, b: a*b, data_shape) * n_files\n",
    "\n",
    "print \"Total samples to generate: %d\" % total_samples\n",
    "print \" (time samples: %d)\" % (data_shape[0]*n_files, )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Convenience function for creating the random data. By default, will save netCDF4 files where each is essentially a copy of the first one."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "if not os.path.exists(SAVE_DIR):\n",
    "    os.mkdir(SAVE_DIR)\n",
    "else:\n",
    "    print \"Cleaning save directory...\"\n",
    "    !rm -f {SAVE_DIR}/*.nc\n",
    "\n",
    "def create_datafiles(n_files=n_files, data_shape=data_shape,\n",
    "                     dates=True, repeat_data=True, format='netcdf4'):\n",
    "    print \"Creating %d files with data of\" \\\n",
    "          \" shape %r\" % (n_files, data_shape)\n",
    "    print \"Generating initial data...\",\n",
    "    data_base = np.random.randint(-10., 120., data_shape)\n",
    "    print \" done.\"\n",
    "\n",
    "    coords = OrderedDict()\n",
    "    coord_labels = ['time', 'x', 'y']\n",
    "    for key, n in zip(coord_labels, data_shape):\n",
    "        coords[key] = ([key, ], np.arange(n))\n",
    "\n",
    "    if repeat_data:\n",
    "        data = data_base + np.random.randn(*data_shape)\n",
    "\n",
    "    for i in xrange(1, n_files+1):\n",
    "        print \"File %03d:\" % i\n",
    "        print \" creating data\"\n",
    "        if not repeat_data:\n",
    "            data = data_base + np.random.randn(*data_shape)\n",
    "\n",
    "        # Increment the timesteps\n",
    "        time = coords['time'][1]\n",
    "        time += len(time) * (i - 1)\n",
    "        if dates:\n",
    "            time = date_func(i)\n",
    "        coords['time'] = (['time', ], time)\n",
    "\n",
    "        print \" constructing DataSet\"\n",
    "        ds = xray.Dataset({'temperature': (coord_labels, data)},\n",
    "                          coords)\n",
    "\n",
    "        fn = \"ts_%03d.nc\" % i\n",
    "        print \" writing to file -> %s...\" % fn,\n",
    "        ds.to_netcdf(os.path.join(SAVE_DIR, fn), 'w',\n",
    "                     format=format)\n",
    "        print \"done.\"\n",
    "\n",
    "    file_size = ds.nbytes * (2 ** -30)\n",
    "    total_size = file_size * n_files\n",
    "\n",
    "    print \"\\n... finished!\"\n",
    "    print \"Created a %d-file dataset of size\" % n_files + \\\n",
    "          \" %3.1fGB/file (%3.1f GB total)\" % (file_size, total_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Call the data creation routine.\n",
    "#create_datafiles()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "\n",
    "Open the dataset we just generated"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " n_chunks: 10\n",
      "chunk_size: 1460\n",
      " ts/file: 1460\n",
      "samples per core: 191365120\n",
      "\n",
      "chunked array shape: ( 1460 256 512 )\n"
     ]
    }
   ],
   "source": [
    "# Compute the chunk size, given the number of cores\n",
    "n_chunks = n_files\n",
    "time_samples = n_files*data_shape[0]\n",
    "chunk_size = int(1 * time_samples / n_chunks)\n",
    "print \" n_chunks:\", n_chunks\n",
    "print \"chunk_size:\", chunk_size\n",
    "print \" ts/file:\", data_shape[0]\n",
    "print \"samples per core:\", chunk_size*data_shape[1]*data_shape[2]\n",
    "print\n",
    "print \"chunked array shape: (\", chunk_size, data_shape[1], data_shape[2], \")\" "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<xray.Dataset>\n",
      "Dimensions: (time: 13140, x: 256, y: 512)\n",
      "Coordinates:\n",
      " * x (x) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ...\n",
      " * y (y) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ...\n",
      " * time (time) datetime64[ns] 1961-01-01 1961-01-01T06:00:00 ...\n",
      "Data variables:\n",
      " temperature (time, x, y) float64 35.81 18.5 55.61 112.3 91.38 -5.46 ...\n",
      "Frozen(SortedKeysDict({u'y': (512,), u'x': (256,), u'time': (1460, 1460, 1460, 1460, 1460, 1460, 1460, 1460, 1460)}))\n"
     ]
    }
   ],
   "source": [
    "ds = xray.open_mfdataset(os.path.join(SAVE_DIR, \"ts_00*.nc\"), engine='h5netcdf')\n",
    "print repr(ds)\n",
    "print ds.chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 25.1 s, sys: 26.4 s, total: 51.5 s\n",
      "Wall time: 18.2 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "54.502895280093526"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Timing of mean over all data\n",
    "%time float(ds.temperature.mean())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment