Last active
December 23, 2015 17:29
-
-
Save pelson/6669402 to your computer and use it in GitHub Desktop.
Measures the speed of reading an entire 2d array then indexing, vs reading a single value after seeking.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "metadata": { | |
| "name": "Data indexing v2" | |
| }, | |
| "nbformat": 3, | |
| "worksheets": [ | |
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "input": [ | |
| "import numpy as np\n", | |
| "import numpy.random as rand \n", | |
| "\n", | |
| "# Set the seed, so that we always get the same random numbers.\n", | |
| "rand.seed(1)\n", | |
| "\n", | |
| "data = rand.randn(400, 200) * 1000\n", | |
| "dtype = data.dtype" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 1 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "input": [ | |
| "print dtype\n", | |
| "print 'Data at [100, 20]', data[100, 20]" | |
| ], | |
| "language": "python", | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "float64\n", | |
| "Data at [100, 20] -97.7272376673\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 2 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "input": [ | |
| "# Write the numpy array's data buffer.\n", | |
| "with open('data.dat', 'wb') as fh:\n", | |
| " fh.write(data.data)" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 3 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "input": [ | |
| "def indexed_read(index, shape, dtype):\n", | |
| " with open('data.dat', 'rb') as fh:\n", | |
| " ind_position = np.ravel_multi_index(index, shape) \n", | |
| " fh.seek(ind_position * dtype.itemsize)\n", | |
| " return np.fromfile(fh, dtype=dtype, count=1)[0]" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 4 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "input": [ | |
| "print indexed_read((100, 20), data.shape, dtype)" | |
| ], | |
| "language": "python", | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "-97.7272376673\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 5 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Measure the time taken to seek to the specified location and read just a single float from there." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "input": [ | |
| "%timeit indexed_read((100, 20), data.shape, dtype)" | |
| ], | |
| "language": "python", | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "1000 loops, best of 3: 337 us per loop\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 6 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Repeat the timing, just in case the first run was affected by disk caching or some other such file system magic." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "input": [ | |
| "%timeit indexed_read((100, 20), data.shape, dtype)" | |
| ], | |
| "language": "python", | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "1000 loops, best of 3: 311 us per loop\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 7 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Now measure the time taken to load the whole array and then index out the single value." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "input": [ | |
| "def read_first_then_index(index, shape, dtype):\n", | |
| " with open('data.dat', 'rb') as fh:\n", | |
| " return np.fromfile(fh, dtype=dtype, count=-1).reshape(shape)[index]" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 8 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "input": [ | |
| "print read_first_then_index((100, 20), data.shape, dtype)" | |
| ], | |
| "language": "python", | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "-97.7272376673\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 9 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "input": [ | |
| "%timeit read_first_then_index((100, 20), data.shape, dtype)" | |
| ], | |
| "language": "python", | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "1000 loops, best of 3: 430 us per loop\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 10 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "input": [ | |
| "%timeit read_first_then_index((100, 20), data.shape, dtype)" | |
| ], | |
| "language": "python", | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "1000 loops, best of 3: 422 us per loop\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 11 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "--------------------------" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Let's try to demonstrate the performance of loading a slice with 37**2 data points from an array of 2000x2300." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "input": [ | |
| "import numpy as np\n", | |
| "import numpy.random as rand \n", | |
| "\n", | |
| "# Set the seed, so that we always get the same random numbers.\n", | |
| "rand.seed(1)\n", | |
| "\n", | |
| "data = rand.randn(2000, 2300) * 1000\n", | |
| "dtype = data.dtype" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 23 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "input": [ | |
| "# Write the numpy array's data buffer.\n", | |
| "with open('data.dat', 'wb') as fh:\n", | |
| " fh.write(data.data)" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 13 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "input": [ | |
| "def indexed_read(index, shape, dtype):\n", | |
| " \"\"\"\n", | |
| " A dirty implementation of sliced loading. This is nowhere near production quality code.\n", | |
| " \"\"\"\n", | |
| " assert len(shape) == 2, 'Only handles 2D data.'\n", | |
| " assert index[1].step is None\n", | |
| " \n", | |
| " # Note: Assumes C order.\n", | |
| " row_start = index[1].start or 0\n", | |
| " row_len = (index[1].stop or shape[1]) - row_start\n", | |
| " \n", | |
| " col_len = (index[0].stop or shape[0]) - (index[0].start or 0)\n", | |
| " array = np.empty([col_len, row_len], dtype=dtype)\n", | |
| " \n", | |
| " with open('data.dat', 'rb') as fh:\n", | |
| " for i, column in enumerate(range(*index[0].indices(shape[0]))):\n", | |
| " this_index = [column, row_start]\n", | |
| " ind_position = np.ravel_multi_index(this_index, shape)\n", | |
| " fh.seek(ind_position * dtype.itemsize)\n", | |
| " array[i, :] = np.fromfile(fh, dtype=dtype, count=row_len)\n", | |
| " return array" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 14 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "input": [ | |
| "print indexed_read((slice(100, 137), slice(20, 57)), data.shape, dtype)" | |
| ], | |
| "language": "python", | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "[[ 36.45765261 584.03104586 672.81583883 ..., -1135.05114042\n", | |
| " 1519.42669281 -15.01216673]\n", | |
| " [ -858.37403831 -17.46844211 -408.61175401 ..., -143.70874343\n", | |
| " 1087.78456648 -2257.91913272]\n", | |
| " [ -493.43205609 506.66145279 1658.73365532 ..., 703.13735591\n", | |
| " -778.14278185 683.13502235]\n", | |
| " ..., \n", | |
| " [ -263.04030576 -407.3809142 -738.90982064 ..., 46.90285342\n", | |
| " 1427.72710339 1026.40209575]\n", | |
| " [-1883.60966548 1339.98293698 -537.81004422 ..., -528.52062211\n", | |
| " -1185.61622574 -370.07687284]\n", | |
| " [ 1591.09510803 -2322.59168283 -208.95143324 ..., 782.05643564\n", | |
| " -312.11496452 -2822.87503771]]\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 15 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "input": [ | |
| "%timeit indexed_read((slice(100, 137), slice(20, 57)), data.shape, dtype)" | |
| ], | |
| "language": "python", | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "1000 loops, best of 3: 1.11 ms per loop\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 22 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "input": [ | |
| "%timeit indexed_read((slice(100, 137), slice(20, 57)), data.shape, dtype)" | |
| ], | |
| "language": "python", | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "1000 loops, best of 3: 1.18 ms per loop\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 17 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "input": [ | |
| "def read_first_then_index(index, shape, dtype):\n", | |
| " with open('data.dat', 'rb') as fh:\n", | |
| " return np.fromfile(fh, dtype=dtype, count=-1).reshape(shape)[index]" | |
| ], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 18 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "input": [ | |
| "print read_first_then_index((slice(100, 137), slice(20, 57)), data.shape, dtype)" | |
| ], | |
| "language": "python", | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "[[ 36.45765261 584.03104586 672.81583883 ..., -1135.05114042\n", | |
| " 1519.42669281 -15.01216673]\n", | |
| " [ -858.37403831 -17.46844211 -408.61175401 ..., -143.70874343\n", | |
| " 1087.78456648 -2257.91913272]\n", | |
| " [ -493.43205609 506.66145279 1658.73365532 ..., 703.13735591\n", | |
| " -778.14278185 683.13502235]\n", | |
| " ..., \n", | |
| " [ -263.04030576 -407.3809142 -738.90982064 ..., 46.90285342\n", | |
| " 1427.72710339 1026.40209575]\n", | |
| " [-1883.60966548 1339.98293698 -537.81004422 ..., -528.52062211\n", | |
| " -1185.61622574 -370.07687284]\n", | |
| " [ 1591.09510803 -2322.59168283 -208.95143324 ..., 782.05643564\n", | |
| " -312.11496452 -2822.87503771]]\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 19 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "input": [ | |
| "%timeit read_first_then_index((100, 20), data.shape, dtype)" | |
| ], | |
| "language": "python", | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "100 loops, best of 3: 13.3 ms per loop\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 20 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "input": [ | |
| "%timeit read_first_then_index((100, 20), data.shape, dtype)" | |
| ], | |
| "language": "python", | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "100 loops, best of 3: 13.2 ms per loop\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 21 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "input": [], | |
| "language": "python", | |
| "outputs": [], | |
| "prompt_number": 21 | |
| } | |
| ] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment