Memory mapping HDF5 contiguous data
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For continuous arrays there is no need to use the hdf5 library, once the offset, shape and dtype is know, we can close the hdf5 file, open it ourselves and mmap the data. This gives zero overhead in reading the dat." | |
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import h5py\n",
"path = '/Users/users/breddels/.vaex/data/helmi-dezeeuw-2000-10p.hdf5'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"h5file = h5py.File(path, 'r')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"data <HDF5 group \"/data\" (11 members)> group\n",
"data/E <HDF5 dataset \"E\": shape (330000,), type \"<f8\"> dataset\n",
"data/FeH <HDF5 dataset \"FeH\": shape (330000,), type \"<f8\"> dataset\n",
"data/L <HDF5 dataset \"L\": shape (330000,), type \"<f8\"> dataset\n",
"data/Lz <HDF5 dataset \"Lz\": shape (330000,), type \"<f8\"> dataset\n",
"data/random_index <HDF5 dataset \"random_index\": shape (330000,), type \"<i8\"> dataset\n",
"data/vx <HDF5 dataset \"vx\": shape (330000,), type \"<f8\"> dataset\n",
"data/vy <HDF5 dataset \"vy\": shape (330000,), type \"<f8\"> dataset\n",
"data/vz <HDF5 dataset \"vz\": shape (330000,), type \"<f8\"> dataset\n",
"data/x <HDF5 dataset \"x\": shape (330000,), type \"<f8\"> dataset\n",
"data/y <HDF5 dataset \"y\": shape (330000,), type \"<f8\"> dataset\n",
"data/z <HDF5 dataset \"z\": shape (330000,), type \"<f8\"> dataset\n"
]
}
],
"source": [
"# maps from hdf5 path to metadata\n", | |
"arrays_metadata = {}\n", | |
"def f(name, item):\n", | |
" is_dataset = isinstance(item, h5py.Dataset)\n", | |
" print(name, item, 'dataset' if is_dataset else 'group')\n", | |
" if is_dataset:\n", | |
" offset = item.id.get_offset()\n", | |
" if offset is not None:\n", | |
" arrays_metadata[name] = dict(offset=offset, shape=item.shape, dtype=item.dtype)\n", | |
" else:\n", | |
" print('could not get offset, probably not a continuous array')\n", | |
"h5file.visititems(f)" | |
]
},
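{
"cell_type": "markdown",
"metadata": {},
"source": [
"Only datasets stored with HDF5's contiguous layout have a single file offset; for chunked or compressed datasets `get_offset()` returns `None` and they cannot be mapped this way. A minimal sketch, using h5py's public `chunks` and `compression` attributes, to check the layout up front:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Sketch: report which datasets use the contiguous layout and can be mmapped.\n",
"# Chunked or compressed datasets have no single offset and are skipped above.\n",
"for name, item in h5file['data'].items():\n",
"    if isinstance(item, h5py.Dataset):\n",
"        contiguous = item.chunks is None and item.compression is None\n",
"        print(name, 'contiguous' if contiguous else 'chunked/compressed')"
]
},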
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# no need to keep this open\n",
"h5file.close()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import mmap\n",
"import numpy as np\n",
"file = open(path, \"rb\")\n",
"fileno = file.fileno()\n",
"mapping = mmap.mmap(fileno, 0, access=mmap.ACCESS_READ)\n",
"def to_array(metadata):\n",
"    shape = metadata['shape']\n",
"    dtype = metadata['dtype']\n",
"    offset = metadata['offset']\n",
"    length = np.prod(shape)\n",
"    return np.frombuffer(mapping, dtype=dtype, count=length, offset=offset).reshape(shape)\n",
"# map the metadata to a numpy array\n",
"arrays = {name:to_array(metadata) for name, metadata in arrays_metadata.items()}"
]
},
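{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a sanity check, the memory-mapped views should be identical to what h5py itself reads. The same zero-copy idea can also be expressed with numpy's `np.memmap`, reusing the offset, shape, and dtype recorded above; a minimal sketch of both, using the `data/x` column:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Sanity check (sketch): reopen the file with h5py and compare one column\n",
"# against the zero-copy memory-mapped view; they should match exactly.\n",
"with h5py.File(path, 'r') as check:\n",
"    assert np.array_equal(check['data/x'][:], arrays['data/x'])\n",
"\n",
"# Alternative sketch: np.memmap expresses the same idea directly,\n",
"# using the offset/shape/dtype recorded above for the 'data/x' column.\n",
"meta = arrays_metadata['data/x']\n",
"x_mm = np.memmap(path, dtype=meta['dtype'], mode='r', offset=meta['offset'], shape=meta['shape'])\n",
"assert np.array_equal(x_mm, arrays['data/x'])"
]
},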
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'data/E': array([-121238.171875 , -100819.9140625, -100559.9609375, ...,\n",
" -112580.359375 , -74862.90625 , -95361.765625 ]),\n",
" 'data/FeH': array([-2.30922761, -1.78873549, -0.7618109 , ..., -1.93062276,\n",
" -1.22501982, -2.56896369]),\n",
" 'data/L': array([ 831.07995605, 1435.18395996, 1039.2989502 , ..., 1182.4362793 ,\n",
" 1324.59265137, 351.09555054]),\n",
" 'data/Lz': array([ -336.42651367, -828.7567749 , 920.80249023, ..., 115.58557892,\n",
" 1057.01733398, -309.81439209]),\n",
" 'data/random_index': array([1511648, 2728665, 1202632, ..., 374845, 425745, 289364]),\n",
" 'data/vx': array([ 53.276722 , 252.810791 , 96.276474 , ..., 8.46711349,\n",
" 110.221558 , -2.10541415]),\n",
" 'data/vy': array([ 288.386047 , -69.9498444, 226.440201 , ..., -38.2765236,\n",
" -31.3925591, -27.6108856]),\n",
" 'data/vz': array([ -95.2649078 , -56.3121033 , -34.7527161 , ..., -127.541473 ,\n",
" 86.2726822 , 3.80799961]),\n",
" 'data/x': array([ -0.77747077, 3.77427316, 1.3757627 , ..., -1.14041007,\n",
" -14.2985935 , 10.5450506 ]),\n",
" 'data/y': array([ 2.10626292, 2.23387194, -6.3283844 , ..., -8.4957695 ,\n",
" -5.51750422, -8.86106777]),\n",
" 'data/z': array([ 1.93743467, 3.76209331, 2.63250017, ..., 2.25749826,\n",
" -8.65472317, -4.65835428])}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"arrays"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}