Skip to content

Instantly share code, notes, and snippets.

@pelson
Last active December 23, 2015 17:29
Show Gist options
  • Save pelson/6669402 to your computer and use it in GitHub Desktop.
Save pelson/6669402 to your computer and use it in GitHub Desktop.
Measures the speed of reading an entire 2d array then indexing, vs reading a single value after seeking.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "Data indexing v2"
},
"nbformat": 3,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"input": [
"import numpy as np\n",
"import numpy.random as rand \n",
"\n",
"# Fix the seed so that every run draws exactly the same random numbers.\n",
"rand.seed(1)\n",
"\n",
"# A 400x200 array of standard-normal samples, scaled up by a factor of 1000.\n",
"data = 1000 * rand.randn(400, 200)\n",
"dtype = data.dtype"
],
"language": "python",
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"input": [
"print dtype\n",
"print 'Data at [100, 20]', data[100, 20]"
],
"language": "python",
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"float64\n",
"Data at [100, 20] -97.7272376673\n"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"input": [
"# Dump the numpy array's raw data buffer to 'data.dat' (values only, no shape or dtype header).\n",
"with open('data.dat', 'wb') as fh:\n",
" fh.write(data.data)"
],
"language": "python",
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"input": [
"def indexed_read(index, shape, dtype):\n",
"    \"\"\"\n",
"    Read a single value from 'data.dat' by seeking straight to its offset.\n",
"\n",
"    index: tuple of integer indices, one per dimension of the stored array.\n",
"    shape: shape of the full array held in the file.\n",
"    dtype: numpy dtype of the stored values; its itemsize gives the bytes per element.\n",
"\n",
"    Returns the single value found at ``index``.\n",
"    \"\"\"\n",
"    with open('data.dat', 'rb') as fh:\n",
"        # Flat element offset of the requested index (ravel_multi_index defaults to C order).\n",
"        ind_position = np.ravel_multi_index(index, shape)\n",
"        # Convert the element offset to a byte offset before seeking.\n",
"        fh.seek(ind_position * dtype.itemsize)\n",
"        return np.fromfile(fh, dtype=dtype, count=1)[0]"
],
"language": "python",
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"input": [
"print indexed_read((100, 20), data.shape, dtype)"
],
"language": "python",
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"-97.7272376673\n"
]
}
],
"prompt_number": 5
},
{
"cell_type": "markdown",
"source": [
"Measure the time taken to seek to the specified location and read just a single float from there."
]
},
{
"cell_type": "code",
"input": [
"%timeit indexed_read((100, 20), data.shape, dtype)"
],
"language": "python",
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"1000 loops, best of 3: 337 us per loop\n"
]
}
],
"prompt_number": 6
},
{
"cell_type": "markdown",
"source": [
"Time it a second time, just in case there was some disk caching, or some other such file system magic."
]
},
{
"cell_type": "code",
"input": [
"%timeit indexed_read((100, 20), data.shape, dtype)"
],
"language": "python",
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"1000 loops, best of 3: 311 us per loop\n"
]
}
],
"prompt_number": 7
},
{
"cell_type": "markdown",
"source": [
"Now measure the time taken to load the whole array and then index it down to a single value."
]
},
{
"cell_type": "code",
"input": [
"def read_first_then_index(index, shape, dtype):\n",
"    \"\"\"\n",
"    Load the whole of 'data.dat' into memory, then index the resulting array.\n",
"\n",
"    index: any valid numpy index for an array of the given shape.\n",
"    shape: shape of the full array held in the file.\n",
"    dtype: numpy dtype of the stored values.\n",
"\n",
"    Returns the result of indexing the fully loaded array with ``index``.\n",
"    \"\"\"\n",
"    with open('data.dat', 'rb') as fh:\n",
"        # count=-1 reads every item in the file; the flat result is then reshaped.\n",
"        return np.fromfile(fh, dtype=dtype, count=-1).reshape(shape)[index]"
],
"language": "python",
"outputs": [],
"prompt_number": 8
},
{
"cell_type": "code",
"input": [
"print read_first_then_index((100, 20), data.shape, dtype)"
],
"language": "python",
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"-97.7272376673\n"
]
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"input": [
"%timeit read_first_then_index((100, 20), data.shape, dtype)"
],
"language": "python",
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"1000 loops, best of 3: 430 us per loop\n"
]
}
],
"prompt_number": 10
},
{
"cell_type": "code",
"input": [
"%timeit read_first_then_index((100, 20), data.shape, dtype)"
],
"language": "python",
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"1000 loops, best of 3: 422 us per loop\n"
]
}
],
"prompt_number": 11
},
{
"cell_type": "markdown",
"source": [
"--------------------------"
]
},
{
"cell_type": "markdown",
"source": [
"Let's try to demonstrate the performance of loading a slice with 37**2 data points from an array of 2000x2300."
]
},
{
"cell_type": "code",
"input": [
"import numpy as np\n",
"import numpy.random as rand \n",
"\n",
"# Fix the seed so that every run draws exactly the same random numbers.\n",
"rand.seed(1)\n",
"\n",
"# A 2000x2300 array of standard-normal samples, scaled up by a factor of 1000.\n",
"data = 1000 * rand.randn(2000, 2300)\n",
"dtype = data.dtype"
],
"language": "python",
"outputs": [],
"prompt_number": 23
},
{
"cell_type": "code",
"input": [
"# Dump the numpy array's raw data buffer to 'data.dat' (values only, no shape or dtype header).\n",
"with open('data.dat', 'wb') as fh:\n",
" fh.write(data.data)"
],
"language": "python",
"outputs": [],
"prompt_number": 13
},
{
"cell_type": "code",
"input": [
"def indexed_read(index, shape, dtype):\n",
"    \"\"\"\n",
"    Load a 2D slice of the array stored in 'data.dat' without reading the whole file.\n",
"\n",
"    A dirty implementation of sliced loading. This is nowhere near production quality code.\n",
"\n",
"    index: pair of slices (rows, columns). The row slice may use any step;\n",
"        the column slice must leave its step unset.\n",
"    shape: 2D shape of the full array held in the file.\n",
"    dtype: numpy dtype of the stored values.\n",
"\n",
"    Returns a new array holding the requested slice.\n",
"    Raises AssertionError for non-2D shapes or a column slice with a step.\n",
"    \"\"\"\n",
"    assert len(shape) == 2, 'Only handles 2D data.'\n",
"    assert index[1].step is None\n",
"\n",
"    # Note: Assumes C order.\n",
"    # slice.indices() resolves None, negative and out-of-range bounds, so the\n",
"    # output size always agrees with the rows actually visited below.\n",
"    rows = range(*index[0].indices(shape[0]))\n",
"    col_start, col_stop, _ = index[1].indices(shape[1])\n",
"    n_cols = max(col_stop - col_start, 0)\n",
"\n",
"    array = np.empty([len(rows), n_cols], dtype=dtype)\n",
"    if array.size == 0:\n",
"        # Nothing was requested, so there is no need to touch the file.\n",
"        return array\n",
"\n",
"    with open('data.dat', 'rb') as fh:\n",
"        for i, row in enumerate(rows):\n",
"            # Seek to the first requested value of this row, then read the\n",
"            # contiguous run of requested columns in one go.\n",
"            ind_position = np.ravel_multi_index([row, col_start], shape)\n",
"            fh.seek(ind_position * dtype.itemsize)\n",
"            array[i, :] = np.fromfile(fh, dtype=dtype, count=n_cols)\n",
"    return array"
],
"language": "python",
"outputs": [],
"prompt_number": 14
},
{
"cell_type": "code",
"input": [
"print indexed_read((slice(100, 137), slice(20, 57)), data.shape, dtype)"
],
"language": "python",
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[[ 36.45765261 584.03104586 672.81583883 ..., -1135.05114042\n",
" 1519.42669281 -15.01216673]\n",
" [ -858.37403831 -17.46844211 -408.61175401 ..., -143.70874343\n",
" 1087.78456648 -2257.91913272]\n",
" [ -493.43205609 506.66145279 1658.73365532 ..., 703.13735591\n",
" -778.14278185 683.13502235]\n",
" ..., \n",
" [ -263.04030576 -407.3809142 -738.90982064 ..., 46.90285342\n",
" 1427.72710339 1026.40209575]\n",
" [-1883.60966548 1339.98293698 -537.81004422 ..., -528.52062211\n",
" -1185.61622574 -370.07687284]\n",
" [ 1591.09510803 -2322.59168283 -208.95143324 ..., 782.05643564\n",
" -312.11496452 -2822.87503771]]\n"
]
}
],
"prompt_number": 15
},
{
"cell_type": "code",
"input": [
"%timeit indexed_read((slice(100, 137), slice(20, 57)), data.shape, dtype)"
],
"language": "python",
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"1000 loops, best of 3: 1.11 ms per loop\n"
]
}
],
"prompt_number": 22
},
{
"cell_type": "code",
"input": [
"%timeit indexed_read((slice(100, 137), slice(20, 57)), data.shape, dtype)"
],
"language": "python",
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"1000 loops, best of 3: 1.18 ms per loop\n"
]
}
],
"prompt_number": 17
},
{
"cell_type": "code",
"input": [
"def read_first_then_index(index, shape, dtype):\n",
"    \"\"\"\n",
"    Load the whole of 'data.dat' into memory, then index the resulting array.\n",
"\n",
"    index: any valid numpy index for an array of the given shape.\n",
"    shape: shape of the full array held in the file.\n",
"    dtype: numpy dtype of the stored values.\n",
"\n",
"    Returns the result of indexing the fully loaded array with ``index``.\n",
"    \"\"\"\n",
"    with open('data.dat', 'rb') as fh:\n",
"        # count=-1 reads every item in the file; the flat result is then reshaped.\n",
"        return np.fromfile(fh, dtype=dtype, count=-1).reshape(shape)[index]"
],
"language": "python",
"outputs": [],
"prompt_number": 18
},
{
"cell_type": "code",
"input": [
"print read_first_then_index((slice(100, 137), slice(20, 57)), data.shape, dtype)"
],
"language": "python",
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[[ 36.45765261 584.03104586 672.81583883 ..., -1135.05114042\n",
" 1519.42669281 -15.01216673]\n",
" [ -858.37403831 -17.46844211 -408.61175401 ..., -143.70874343\n",
" 1087.78456648 -2257.91913272]\n",
" [ -493.43205609 506.66145279 1658.73365532 ..., 703.13735591\n",
" -778.14278185 683.13502235]\n",
" ..., \n",
" [ -263.04030576 -407.3809142 -738.90982064 ..., 46.90285342\n",
" 1427.72710339 1026.40209575]\n",
" [-1883.60966548 1339.98293698 -537.81004422 ..., -528.52062211\n",
" -1185.61622574 -370.07687284]\n",
" [ 1591.09510803 -2322.59168283 -208.95143324 ..., 782.05643564\n",
" -312.11496452 -2822.87503771]]\n"
]
}
],
"prompt_number": 19
},
{
"cell_type": "code",
"input": [
"# Time the same 37x37 slice as the seek-based indexed_read timing, so the two are comparable.\n",
"%timeit read_first_then_index((slice(100, 137), slice(20, 57)), data.shape, dtype)"
],
"language": "python",
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"100 loops, best of 3: 13.3 ms per loop\n"
]
}
],
"prompt_number": 20
},
{
"cell_type": "code",
"input": [
"# Repeat the timing of the same 37x37 slice as the seek-based indexed_read timing.\n",
"%timeit read_first_then_index((slice(100, 137), slice(20, 57)), data.shape, dtype)"
],
"language": "python",
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"100 loops, best of 3: 13.2 ms per loop\n"
]
}
],
"prompt_number": 21
},
{
"cell_type": "code",
"input": [],
"language": "python",
"outputs": [],
"prompt_number": 21
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment