stuarteberg · February 13, 2018 20:03
diff --git a/benchmark.ipynb b/benchmark.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Quick HDF5 benchmark"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Comparing read access in a big contiguous uncompressed array between h5py and memmap, using the same HDF5 file."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Imports:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import h5py\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "np.random.seed(2016)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We'll use this function to bypass the slow h5py data access with a faster memory mapping (only works on uncompressed contiguous datasets):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def _mmap_h5(path, h5path):\n",
    "    with h5py.File(path) as f:\n",
    "        ds = f[h5path]\n",
    "        # We get the dataset address in the HDF5 fiel.\n",
    "        offset = ds.id.get_offset()\n",
    "        # We ensure we have a non-compressed contiguous array.\n",
    "        assert ds.chunks is None\n",
    "        assert ds.compression is None\n",
    "        assert offset > 0\n",
    "        dtype = ds.dtype\n",
    "        shape = ds.shape\n",
    "    arr = np.memmap(path, mode='r', shape=shape, offset=offset, dtype=dtype)\n",
    "    return arr"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Number of lines in our test array:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "n = 100000"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We generate a random array:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "arr = np.random.rand(n, 1000)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We write it to a file:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "with h5py.File('test.h5', 'w') as f:\n",
    "    f['/test'] = arr"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## With h5py"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Fancy indexing:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "ind = np.arange(0, 10000, 10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In-memory access:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 loops, best of 1: 2.63 ms per loop\n"
     ]
    }
   ],
   "source": [
    "%%timeit -r1 -n1\n",
    "arr[ind, :]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "With h5py:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 loops, best of 1: 92.2 ms per loop\n"
     ]
    }
   ],
   "source": [
    "%%timeit -r1 -n1 f = h5py.File('test.h5', 'r'); dset = f['/test']\n",
    "copy = dset[ind, :]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## With memmap"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here we do the same, but we use `np.memmap()` on the data buffer, bypassing HDF5 completely:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 loops, best of 1: 6.44 ms per loop\n"
     ]
    }
   ],
   "source": [
    "%%timeit -r1 -n1 a = _mmap_h5('test.h5', '/test')\n",
    "copy = np.array(a[ind, :])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'numpy.core.memmap.memmap'>\n",
      "<type 'numpy.ndarray'>\n"
     ]
    }
   ],
   "source": [
    "a = _mmap_h5('test.h5', '/test')\n",
    "copy = np.array(a[ind, :])\n",
    "print type(a)\n",
    "print type(copy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "14.316770186335404"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "92.2/6.44"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## With fancy indexing of *columns*, not rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "ind2 = np.arange(0, 1000, 3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "HDF5 is twice as fast in this case..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 loops, best of 1: 550 ms per loop\n"
     ]
    }
   ],
   "source": [
    "%%timeit -r1 -n1 f = h5py.File('test.h5', 'r'); dset = f['/test']\n",
    "copy = dset[:, ind2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 loops, best of 1: 1.07 s per loop\n"
     ]
    }
   ],
   "source": [
    "%%timeit -r1 -n1 a = _mmap_h5('test.h5', '/test')\n",
    "copy = np.array(a[:, ind2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Quick HDF5 benchmark"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Comparing read access in a big contiguous uncompressed array between h5py and memmap, using the same HDF5 file."
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Imports:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import h5py\n",
	"import numpy as np"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"np.random.seed(2016)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"We'll use this function to bypass the slow h5py data access with a faster memory mapping (only works on uncompressed contiguous datasets):"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def _mmap_h5(path, h5path):\n",
	" with h5py.File(path) as f:\n",
	" ds = f[h5path]\n",
	" # We get the dataset address in the HDF5 fiel.\n",
	" offset = ds.id.get_offset()\n",
	" # We ensure we have a non-compressed contiguous array.\n",
	" assert ds.chunks is None\n",
	" assert ds.compression is None\n",
	" assert offset > 0\n",
	" dtype = ds.dtype\n",
	" shape = ds.shape\n",
	" arr = np.memmap(path, mode='r', shape=shape, offset=offset, dtype=dtype)\n",
	" return arr"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Number of lines in our test array:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"n = 100000"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"We generate a random array:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"arr = np.random.rand(n, 1000)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"We write it to a file:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"with h5py.File('test.h5', 'w') as f:\n",
	" f['/test'] = arr"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## With h5py"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Fancy indexing:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"ind = np.arange(0, 10000, 10)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"In-memory access:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {
	"collapsed": false,
	"scrolled": true
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"1 loops, best of 1: 2.63 ms per loop\n"
	]
	}
	],
	"source": [
	"%%timeit -r1 -n1\n",
	"arr[ind, :]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"With h5py:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {
	"collapsed": false,
	"scrolled": true
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"1 loops, best of 1: 92.2 ms per loop\n"
	]
	}
	],
	"source": [
	"%%timeit -r1 -n1 f = h5py.File('test.h5', 'r'); dset = f['/test']\n",
	"copy = dset[ind, :]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## With memmap"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Here we do the same, but we use `np.memmap()` on the data buffer, bypassing HDF5 completely:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {
	"collapsed": false,
	"scrolled": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"1 loops, best of 1: 6.44 ms per loop\n"
	]
	}
	],
	"source": [
	"%%timeit -r1 -n1 a = _mmap_h5('test.h5', '/test')\n",
	"copy = np.array(a[ind, :])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 26,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"<class 'numpy.core.memmap.memmap'>\n",
	"<type 'numpy.ndarray'>\n"
	]
	}
	],
	"source": [
	"a = _mmap_h5('test.h5', '/test')\n",
	"copy = np.array(a[ind, :])\n",
	"print type(a)\n",
	"print type(copy)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"14.316770186335404"
	]
	},
	"execution_count": 27,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"92.2/6.44"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## With fancy indexing of columns, not rows"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 29,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"ind2 = np.arange(0, 1000, 3)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"HDF5 is twice as fast in this case..."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 32,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"1 loops, best of 1: 550 ms per loop\n"
	]
	}
	],
	"source": [
	"%%timeit -r1 -n1 f = h5py.File('test.h5', 'r'); dset = f['/test']\n",
	"copy = dset[:, ind2]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 33,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"1 loops, best of 1: 1.07 s per loop\n"
	]
	}
	],
	"source": [
	"%%timeit -r1 -n1 a = _mmap_h5('test.h5', '/test')\n",
	"copy = np.array(a[:, ind2])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.11"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}