pelson · December 23, 2015 17:29
diff --git a/speed_of_loading.ipynb b/speed_of_loading.ipynb
 {
 "metadata": {
  "name": "Data indexing v2"
 },
 "nbformat": 3,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "input": [
      "import numpy as np\n",
      "import numpy.random as rand \n",
      "\n",
      "# Set the seed, so that we always get the same random numbers.\n",
      "rand.seed(1)\n",
      "\n",
      "data = rand.randn(400, 200) * 1000\n",
      "dtype = data.dtype"
     ],
     "language": "python",
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "input": [
      "print dtype\n",
      "print 'Data at [100, 20]', data[100, 20]"
     ],
     "language": "python",
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "float64\n",
        "Data at [100, 20] -97.7272376673\n"
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "input": [
      "# Write the numpy array's data buffer.\n",
      "with open('data.dat', 'wb') as fh:\n",
      "    fh.write(data.data)"
     ],
     "language": "python",
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "input": [
      "def indexed_read(index, shape, dtype):\n",
      "    with open('data.dat', 'rb') as fh:\n",
      "        ind_position = np.ravel_multi_index(index, shape)        \n",
      "        fh.seek(ind_position * dtype.itemsize)\n",
      "        return np.fromfile(fh, dtype=dtype, count=1)[0]"
     ],
     "language": "python",
     "outputs": [],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "input": [
      "print indexed_read((100, 20), data.shape, dtype)"
     ],
     "language": "python",
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "-97.7272376673\n"
       ]
      }
     ],
     "prompt_number": 5
    },
    {
     "cell_type": "markdown",
     "source": [
      "Measure the time taken to seek to the specified location and read just a single float from there."
     ]
    },
    {
     "cell_type": "code",
     "input": [
      "%timeit indexed_read((100, 20), data.shape, dtype)"
     ],
     "language": "python",
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "1000 loops, best of 3: 337 us per loop\n"
       ]
      }
     ],
     "prompt_number": 6
    },
    {
     "cell_type": "markdown",
     "source": [
      "Time it a second time, just in case there was some disk caching, or some other such file system magic, affecting the first measurement."
     ]
    },
    {
     "cell_type": "code",
     "input": [
      "%timeit indexed_read((100, 20), data.shape, dtype)"
     ],
     "language": "python",
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "1000 loops, best of 3: 311 us per loop\n"
       ]
      }
     ],
     "prompt_number": 7
    },
    {
     "cell_type": "markdown",
     "source": [
      "Now measure the time taken to load the whole array and then index into it."
     ]
    },
    {
     "cell_type": "code",
     "input": [
      "def read_first_then_index(index, shape, dtype):\n",
      "    with open('data.dat', 'rb') as fh:\n",
      "        return np.fromfile(fh, dtype=dtype, count=-1).reshape(shape)[index]"
     ],
     "language": "python",
     "outputs": [],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "input": [
      "print read_first_then_index((100, 20), data.shape, dtype)"
     ],
     "language": "python",
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "-97.7272376673\n"
       ]
      }
     ],
     "prompt_number": 9
    },
    {
     "cell_type": "code",
     "input": [
      "%timeit read_first_then_index((100, 20), data.shape, dtype)"
     ],
     "language": "python",
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "1000 loops, best of 3: 430 us per loop\n"
       ]
      }
     ],
     "prompt_number": 10
    },
    {
     "cell_type": "code",
     "input": [
      "%timeit read_first_then_index((100, 20), data.shape, dtype)"
     ],
     "language": "python",
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "1000 loops, best of 3: 422 us per loop\n"
       ]
      }
     ],
     "prompt_number": 11
    },
    {
     "cell_type": "markdown",
     "source": [
      "--------------------------"
     ]
    },
    {
     "cell_type": "markdown",
     "source": [
      "Let's try to demonstrate the performance of loading a slice with 37**2 data points from an array of 2000x2300."
     ]
    },
    {
     "cell_type": "code",
     "input": [
      "import numpy as np\n",
      "import numpy.random as rand \n",
      "\n",
      "# Set the seed, so that we always get the same random numbers.\n",
      "rand.seed(1)\n",
      "\n",
      "data = rand.randn(2000, 2300) * 1000\n",
      "dtype = data.dtype"
     ],
     "language": "python",
     "outputs": [],
     "prompt_number": 23
    },
    {
     "cell_type": "code",
     "input": [
      "# Write the numpy array's data buffer.\n",
      "with open('data.dat', 'wb') as fh:\n",
      "    fh.write(data.data)"
     ],
     "language": "python",
     "outputs": [],
     "prompt_number": 13
    },
    {
     "cell_type": "code",
     "input": [
      "def indexed_read(index, shape, dtype):\n",
      "    \"\"\"\n",
      "    A dirty implementation of sliced loading. This is nowhere near production quality code.\n",
      "    \"\"\"\n",
      "    assert len(shape) == 2, 'Only handles 2D data.'\n",
      "    assert index[1].step is None\n",
      "    \n",
      "    # Note: Assumes C order.\n",
      "    row_start = index[1].start or 0\n",
      "    row_len = (index[1].stop or shape[1]) - row_start\n",
      "    \n",
      "    col_len = (index[0].stop or shape[0]) - (index[0].start or 0)\n",
      "    array = np.empty([col_len, row_len], dtype=dtype)\n",
      "    \n",
      "    with open('data.dat', 'rb') as fh:\n",
      "        for i, column in enumerate(range(*index[0].indices(shape[0]))):\n",
      "            this_index = [column, row_start]\n",
      "            ind_position = np.ravel_multi_index(this_index, shape)\n",
      "            fh.seek(ind_position * dtype.itemsize)\n",
      "            array[i, :] = np.fromfile(fh, dtype=dtype, count=row_len)\n",
      "    return array"
     ],
     "language": "python",
     "outputs": [],
     "prompt_number": 14
    },
    {
     "cell_type": "code",
     "input": [
      "print indexed_read((slice(100, 137), slice(20, 57)), data.shape, dtype)"
     ],
     "language": "python",
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[[   36.45765261   584.03104586   672.81583883 ..., -1135.05114042\n",
        "   1519.42669281   -15.01216673]\n",
        " [ -858.37403831   -17.46844211  -408.61175401 ...,  -143.70874343\n",
        "   1087.78456648 -2257.91913272]\n",
        " [ -493.43205609   506.66145279  1658.73365532 ...,   703.13735591\n",
        "   -778.14278185   683.13502235]\n",
        " ..., \n",
        " [ -263.04030576  -407.3809142   -738.90982064 ...,    46.90285342\n",
        "   1427.72710339  1026.40209575]\n",
        " [-1883.60966548  1339.98293698  -537.81004422 ...,  -528.52062211\n",
        "  -1185.61622574  -370.07687284]\n",
        " [ 1591.09510803 -2322.59168283  -208.95143324 ...,   782.05643564\n",
        "   -312.11496452 -2822.87503771]]\n"
       ]
      }
     ],
     "prompt_number": 15
    },
    {
     "cell_type": "code",
     "input": [
      "%timeit indexed_read((slice(100, 137), slice(20, 57)), data.shape, dtype)"
     ],
     "language": "python",
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "1000 loops, best of 3: 1.11 ms per loop\n"
       ]
      }
     ],
     "prompt_number": 22
    },
    {
     "cell_type": "code",
     "input": [
      "%timeit indexed_read((slice(100, 137), slice(20, 57)), data.shape, dtype)"
     ],
     "language": "python",
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "1000 loops, best of 3: 1.18 ms per loop\n"
       ]
      }
     ],
     "prompt_number": 17
    },
    {
     "cell_type": "code",
     "input": [
      "def read_first_then_index(index, shape, dtype):\n",
      "    with open('data.dat', 'rb') as fh:\n",
      "        return np.fromfile(fh, dtype=dtype, count=-1).reshape(shape)[index]"
     ],
     "language": "python",
     "outputs": [],
     "prompt_number": 18
    },
    {
     "cell_type": "code",
     "input": [
      "print read_first_then_index((slice(100, 137), slice(20, 57)), data.shape, dtype)"
     ],
     "language": "python",
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[[   36.45765261   584.03104586   672.81583883 ..., -1135.05114042\n",
        "   1519.42669281   -15.01216673]\n",
        " [ -858.37403831   -17.46844211  -408.61175401 ...,  -143.70874343\n",
        "   1087.78456648 -2257.91913272]\n",
        " [ -493.43205609   506.66145279  1658.73365532 ...,   703.13735591\n",
        "   -778.14278185   683.13502235]\n",
        " ..., \n",
        " [ -263.04030576  -407.3809142   -738.90982064 ...,    46.90285342\n",
        "   1427.72710339  1026.40209575]\n",
        " [-1883.60966548  1339.98293698  -537.81004422 ...,  -528.52062211\n",
        "  -1185.61622574  -370.07687284]\n",
        " [ 1591.09510803 -2322.59168283  -208.95143324 ...,   782.05643564\n",
        "   -312.11496452 -2822.87503771]]\n"
       ]
      }
     ],
     "prompt_number": 19
    },
    {
     "cell_type": "code",
     "input": [
      "%timeit read_first_then_index((100, 20), data.shape, dtype)"
     ],
     "language": "python",
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "100 loops, best of 3: 13.3 ms per loop\n"
       ]
      }
     ],
     "prompt_number": 20
    },
    {
     "cell_type": "code",
     "input": [
      "%timeit read_first_then_index((100, 20), data.shape, dtype)"
     ],
     "language": "python",
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "100 loops, best of 3: 13.2 ms per loop\n"
       ]
      }
     ],
     "prompt_number": 21
    },
    {
     "cell_type": "code",
     "input": [],
     "language": "python",
     "outputs": [],
     "prompt_number": 21
    }
   ]
  }
 ]
 }
	{
	"metadata": {
	"name": "Data indexing v2"
	},
	"nbformat": 3,
	"worksheets": [
	{
	"cells": [
	{
	"cell_type": "code",
	"input": [
	"import numpy as np\n",
	"import numpy.random as rand \n",
	"\n",
	"# Set the seed, so that we always get the same random numbers.\n",
	"rand.seed(1)\n",
	"\n",
	"data = rand.randn(400, 200) * 1000\n",
	"dtype = data.dtype"
	],
	"language": "python",
	"outputs": [],
	"prompt_number": 1
	},
	{
	"cell_type": "code",
	"input": [
	"print dtype\n",
	"print 'Data at [100, 20]', data[100, 20]"
	],
	"language": "python",
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"float64\n",
	"Data at [100, 20] -97.7272376673\n"
	]
	}
	],
	"prompt_number": 2
	},
	{
	"cell_type": "code",
	"input": [
	"# Write the numpy array's data buffer.\n",
	"with open('data.dat', 'wb') as fh:\n",
	" fh.write(data.data)"
	],
	"language": "python",
	"outputs": [],
	"prompt_number": 3
	},
	{
	"cell_type": "code",
	"input": [
	"def indexed_read(index, shape, dtype):\n",
	" with open('data.dat', 'rb') as fh:\n",
	" ind_position = np.ravel_multi_index(index, shape) \n",
	" fh.seek(ind_position * dtype.itemsize)\n",
	" return np.fromfile(fh, dtype=dtype, count=1)[0]"
	],
	"language": "python",
	"outputs": [],
	"prompt_number": 4
	},
	{
	"cell_type": "code",
	"input": [
	"print indexed_read((100, 20), data.shape, dtype)"
	],
	"language": "python",
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"-97.7272376673\n"
	]
	}
	],
	"prompt_number": 5
	},
	{
	"cell_type": "markdown",
	"source": [
	"Measure the time taken to seek to the specified location and read just a single float from there."
	]
	},
	{
	"cell_type": "code",
	"input": [
	"%timeit indexed_read((100, 20), data.shape, dtype)"
	],
	"language": "python",
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"1000 loops, best of 3: 337 us per loop\n"
	]
	}
	],
	"prompt_number": 6
	},
	{
	"cell_type": "markdown",
	"source": [
	"Time it a second time, just in case there was some disk caching, or some other such file system magic, affecting the first measurement."
	]
	},
	{
	"cell_type": "code",
	"input": [
	"%timeit indexed_read((100, 20), data.shape, dtype)"
	],
	"language": "python",
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"1000 loops, best of 3: 311 us per loop\n"
	]
	}
	],
	"prompt_number": 7
	},
	{
	"cell_type": "markdown",
	"source": [
	"Now measure the time taken to load the whole array and then index into it."
	]
	},
	{
	"cell_type": "code",
	"input": [
	"def read_first_then_index(index, shape, dtype):\n",
	" with open('data.dat', 'rb') as fh:\n",
	" return np.fromfile(fh, dtype=dtype, count=-1).reshape(shape)[index]"
	],
	"language": "python",
	"outputs": [],
	"prompt_number": 8
	},
	{
	"cell_type": "code",
	"input": [
	"print read_first_then_index((100, 20), data.shape, dtype)"
	],
	"language": "python",
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"-97.7272376673\n"
	]
	}
	],
	"prompt_number": 9
	},
	{
	"cell_type": "code",
	"input": [
	"%timeit read_first_then_index((100, 20), data.shape, dtype)"
	],
	"language": "python",
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"1000 loops, best of 3: 430 us per loop\n"
	]
	}
	],
	"prompt_number": 10
	},
	{
	"cell_type": "code",
	"input": [
	"%timeit read_first_then_index((100, 20), data.shape, dtype)"
	],
	"language": "python",
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"1000 loops, best of 3: 422 us per loop\n"
	]
	}
	],
	"prompt_number": 11
	},
	{
	"cell_type": "markdown",
	"source": [
	"--------------------------"
	]
	},
	{
	"cell_type": "markdown",
	"source": [
	"Let's try to demonstrate the performance of loading a slice with 37**2 data points from an array of 2000x2300."
	]
	},
	{
	"cell_type": "code",
	"input": [
	"import numpy as np\n",
	"import numpy.random as rand \n",
	"\n",
	"# Set the seed, so that we always get the same random numbers.\n",
	"rand.seed(1)\n",
	"\n",
	"data = rand.randn(2000, 2300) * 1000\n",
	"dtype = data.dtype"
	],
	"language": "python",
	"outputs": [],
	"prompt_number": 23
	},
	{
	"cell_type": "code",
	"input": [
	"# Write the numpy array's data buffer.\n",
	"with open('data.dat', 'wb') as fh:\n",
	" fh.write(data.data)"
	],
	"language": "python",
	"outputs": [],
	"prompt_number": 13
	},
	{
	"cell_type": "code",
	"input": [
	"def indexed_read(index, shape, dtype):\n",
	" \"\"\"\n",
	" A dirty implementation of sliced loading. This is nowhere near production quality code.\n",
	" \"\"\"\n",
	" assert len(shape) == 2, 'Only handles 2D data.'\n",
	" assert index[1].step is None\n",
	" \n",
	" # Note: Assumes C order.\n",
	" row_start = index[1].start or 0\n",
	" row_len = (index[1].stop or shape[1]) - row_start\n",
	" \n",
	" col_len = (index[0].stop or shape[0]) - (index[0].start or 0)\n",
	" array = np.empty([col_len, row_len], dtype=dtype)\n",
	" \n",
	" with open('data.dat', 'rb') as fh:\n",
	" for i, column in enumerate(range(*index[0].indices(shape[0]))):\n",
	" this_index = [column, row_start]\n",
	" ind_position = np.ravel_multi_index(this_index, shape)\n",
	" fh.seek(ind_position * dtype.itemsize)\n",
	" array[i, :] = np.fromfile(fh, dtype=dtype, count=row_len)\n",
	" return array"
	],
	"language": "python",
	"outputs": [],
	"prompt_number": 14
	},
	{
	"cell_type": "code",
	"input": [
	"print indexed_read((slice(100, 137), slice(20, 57)), data.shape, dtype)"
	],
	"language": "python",
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"[[ 36.45765261 584.03104586 672.81583883 ..., -1135.05114042\n",
	" 1519.42669281 -15.01216673]\n",
	" [ -858.37403831 -17.46844211 -408.61175401 ..., -143.70874343\n",
	" 1087.78456648 -2257.91913272]\n",
	" [ -493.43205609 506.66145279 1658.73365532 ..., 703.13735591\n",
	" -778.14278185 683.13502235]\n",
	" ..., \n",
	" [ -263.04030576 -407.3809142 -738.90982064 ..., 46.90285342\n",
	" 1427.72710339 1026.40209575]\n",
	" [-1883.60966548 1339.98293698 -537.81004422 ..., -528.52062211\n",
	" -1185.61622574 -370.07687284]\n",
	" [ 1591.09510803 -2322.59168283 -208.95143324 ..., 782.05643564\n",
	" -312.11496452 -2822.87503771]]\n"
	]
	}
	],
	"prompt_number": 15
	},
	{
	"cell_type": "code",
	"input": [
	"%timeit indexed_read((slice(100, 137), slice(20, 57)), data.shape, dtype)"
	],
	"language": "python",
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"1000 loops, best of 3: 1.11 ms per loop\n"
	]
	}
	],
	"prompt_number": 22
	},
	{
	"cell_type": "code",
	"input": [
	"%timeit indexed_read((slice(100, 137), slice(20, 57)), data.shape, dtype)"
	],
	"language": "python",
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"1000 loops, best of 3: 1.18 ms per loop\n"
	]
	}
	],
	"prompt_number": 17
	},
	{
	"cell_type": "code",
	"input": [
	"def read_first_then_index(index, shape, dtype):\n",
	" with open('data.dat', 'rb') as fh:\n",
	" return np.fromfile(fh, dtype=dtype, count=-1).reshape(shape)[index]"
	],
	"language": "python",
	"outputs": [],
	"prompt_number": 18
	},
	{
	"cell_type": "code",
	"input": [
	"print read_first_then_index((slice(100, 137), slice(20, 57)), data.shape, dtype)"
	],
	"language": "python",
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"[[ 36.45765261 584.03104586 672.81583883 ..., -1135.05114042\n",
	" 1519.42669281 -15.01216673]\n",
	" [ -858.37403831 -17.46844211 -408.61175401 ..., -143.70874343\n",
	" 1087.78456648 -2257.91913272]\n",
	" [ -493.43205609 506.66145279 1658.73365532 ..., 703.13735591\n",
	" -778.14278185 683.13502235]\n",
	" ..., \n",
	" [ -263.04030576 -407.3809142 -738.90982064 ..., 46.90285342\n",
	" 1427.72710339 1026.40209575]\n",
	" [-1883.60966548 1339.98293698 -537.81004422 ..., -528.52062211\n",
	" -1185.61622574 -370.07687284]\n",
	" [ 1591.09510803 -2322.59168283 -208.95143324 ..., 782.05643564\n",
	" -312.11496452 -2822.87503771]]\n"
	]
	}
	],
	"prompt_number": 19
	},
	{
	"cell_type": "code",
	"input": [
	"%timeit read_first_then_index((100, 20), data.shape, dtype)"
	],
	"language": "python",
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"100 loops, best of 3: 13.3 ms per loop\n"
	]
	}
	],
	"prompt_number": 20
	},
	{
	"cell_type": "code",
	"input": [
	"%timeit read_first_then_index((100, 20), data.shape, dtype)"
	],
	"language": "python",
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"100 loops, best of 3: 13.2 ms per loop\n"
	]
	}
	],
	"prompt_number": 21
	},
	{
	"cell_type": "code",
	"input": [],
	"language": "python",
	"outputs": [],
	"prompt_number": 21
	}
	]
	}
	]
	}