Last active
February 13, 2018 20:03
-
-
Save stuarteberg/57386ac8c257ddb22e29 to your computer and use it in GitHub Desktop.
Quick HDF5 Benchmark, modified from rossant/benchmark.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Quick HDF5 benchmark" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Comparing read access in a big contiguous uncompressed array between h5py and memmap, using the same HDF5 file." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Imports:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import h5py\n", | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"np.random.seed(2016)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"We'll use this function to bypass the slow h5py data access with a faster memory mapping (only works on uncompressed contiguous datasets):" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def _mmap_h5(path, h5path):\n", | |
" with h5py.File(path) as f:\n", | |
" ds = f[h5path]\n", | |
" # We get the dataset address in the HDF5 fiel.\n", | |
" offset = ds.id.get_offset()\n", | |
" # We ensure we have a non-compressed contiguous array.\n", | |
" assert ds.chunks is None\n", | |
" assert ds.compression is None\n", | |
" assert offset > 0\n", | |
" dtype = ds.dtype\n", | |
" shape = ds.shape\n", | |
" arr = np.memmap(path, mode='r', shape=shape, offset=offset, dtype=dtype)\n", | |
" return arr" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Number of lines in our test array:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"n = 100000" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"We generate a random array:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"arr = np.random.rand(n, 1000)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"We write it to a file:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"with h5py.File('test.h5', 'w') as f:\n", | |
" f['/test'] = arr" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## With h5py" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Fancy indexing:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"ind = np.arange(0, 10000, 10)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"In-memory access:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 loops, best of 1: 2.63 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit -r1 -n1\n", | |
"arr[ind, :]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"With h5py:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 loops, best of 1: 92.2 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit -r1 -n1 f = h5py.File('test.h5', 'r'); dset = f['/test']\n", | |
"copy = dset[ind, :]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## With memmap" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Here we do the same, but we use `np.memmap()` on the data buffer, bypassing HDF5 completely:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 loops, best of 1: 6.44 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit -r1 -n1 a = _mmap_h5('test.h5', '/test')\n", | |
"copy = np.array(a[ind, :])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<class 'numpy.core.memmap.memmap'>\n", | |
"<type 'numpy.ndarray'>\n" | |
] | |
} | |
], | |
"source": [ | |
"a = _mmap_h5('test.h5', '/test')\n", | |
"copy = np.array(a[ind, :])\n", | |
"print type(a)\n", | |
"print type(copy)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"14.316770186335404" | |
] | |
}, | |
"execution_count": 27, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"92.2/6.44" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## With fancy indexing of *columns*, not rows" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"ind2 = np.arange(0, 1000, 3)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"HDF5 is twice as fast in this case..." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 loops, best of 1: 550 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit -r1 -n1 f = h5py.File('test.h5', 'r'); dset = f['/test']\n", | |
"copy = dset[:, ind2]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 loops, best of 1: 1.07 s per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit -r1 -n1 a = _mmap_h5('test.h5', '/test')\n", | |
"copy = np.array(a[:, ind2])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.11" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment