rossant · August 29, 2015 13:57
diff --git a/h5py_indices.ipynb b/h5py_indices.ipynb
diff --git a/h5py_slice.ipynb b/h5py_slice.ipynb
diff --git a/pytables_slice.ipynb b/pytables_slice.ipynb
 {
 "metadata": {
  "name": "",
  "signature": "sha256:be2cab0cf304efb4c7e20ad93755c71ee5ec6f147a9d8a250536dc54ed4f6b3a"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Slicing huge arrays in PyTables"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Let's import PyTables and NumPy."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import tables as tb\n",
      "import numpy as np\n",
      "import os"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Chunked array"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "We create one `(1M, 100)` extendable dataset with chunks `(1000, 100)`."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "n, k = 1000000, 100"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "if not os.path.exists('test_tb'):\n",
      "    # We create the file.\n",
      "    with tb.openFile(\"test_tb\", \"w\") as f:\n",
      "        a = f.createEArray('/', 'test', tb.Float32Atom(),\n",
      "                           (0, k), chunkshape=(1000, k))\n",
      "        # We fill the array progressively.\n",
      "        for i in range(10):\n",
      "            print i,\n",
      "            a.append(np.random.rand((n//10), k))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "0 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "1 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "2 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "3 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "4 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "5 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "6 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "7 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "8 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "9\n"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Now, let's do some benchmarks."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "with tb.openFile(\"test_tb\", \"r\") as f:\n",
      "    a = f.root.test\n",
      "    \n",
      "    # First, we load everything in RAM.\n",
      "    %timeit -r1 -n1 a[:,...]\n",
      "    \n",
      "    # Now, we load only 10% of the array.\n",
      "    %timeit -r1 -n1 a[::10,...]\n",
      "    \n",
      "    # Now, we load all in RAM, and then we select 10%.\n",
      "    %timeit -r1 -n1 a[:,...][::10,...]\n",
      "    \n",
      "    assert np.allclose(a[::10,...], a[:,...][::10,...])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "1 loops, best of 1: 257 ms per loop\n",
        "1 loops, best of 1: 13.6 s per loop"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "1 loops, best of 1: 232 ms per loop"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Contiguous array"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Let's try with a contiguous array."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "if not os.path.exists('test_tb_contiguous'):\n",
      "    # We create the file.\n",
      "    with tb.openFile(\"test_tb_contiguous\", \"w\") as f:\n",
      "        a = f.createArray('/', 'test', atom=tb.Float32Atom(),\n",
      "                           shape=(n, k))\n",
      "        # We fill the array progressively.\n",
      "        for i in range(10):\n",
      "            print i,\n",
      "            a[i*(n//10):(i+1)*(n//10)] = np.random.rand((n//10), k)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "0 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "1 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "2 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "3 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "4 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "5 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "6 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "7 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "8 "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "9\n"
       ]
      }
     ],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "with tb.openFile(\"test_tb_contiguous\", \"r\") as f:\n",
      "    a = f.root.test\n",
      "    \n",
      "    # First, we load everything in RAM.\n",
      "    %timeit -r1 -n1 a[:,...]\n",
      "    \n",
      "    # Now, we load only 10% of the array.\n",
      "    %timeit -r1 -n1 a[::10,...]\n",
      "    \n",
      "    # Now, we load all in RAM, and then we select 10%.\n",
      "    %timeit -r1 -n1 a[:,...][::10,...]\n",
      "    \n",
      "    assert np.allclose(a[::10,...], a[:,...][::10,...])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "1 loops, best of 1: 204 ms per loop\n",
        "1 loops, best of 1: 112 ms per loop"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "1 loops, best of 1: 204 ms per loop"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n"
       ]
      }
     ],
     "prompt_number": 9
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "All benchmarks performed on a desktop PC with 8GB RAM, Windows 8.1 64-bit, Python 2.7.5, PyTables 3.1.0."
     ]
    }
   ],
   "metadata": {}
  }
 ]
 }
	{
	"metadata": {
	"name": "",
	"signature": "sha256:be2cab0cf304efb4c7e20ad93755c71ee5ec6f147a9d8a250536dc54ed4f6b3a"
	},
	"nbformat": 3,
	"nbformat_minor": 0,
	"worksheets": [
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Slicing huge arrays in PyTables"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Let's import PyTables and NumPy."
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"import tables as tb\n",
	"import numpy as np\n",
	"import os"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 1
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Chunked array"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"We create one `(1M, 100)` extendable dataset with chunks `(1000, 100)`."
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"n, k = 1000000, 100"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 2
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"if not os.path.exists('test_tb'):\n",
	"    # We create the file.\n",
	"    with tb.openFile(\"test_tb\", \"w\") as f:\n",
	"        a = f.createEArray('/', 'test', tb.Float32Atom(),\n",
	"                           (0, k), chunkshape=(1000, k))\n",
	"        # We fill the array progressively.\n",
	"        for i in range(10):\n",
	"            print i,\n",
	"            a.append(np.random.rand((n//10), k))"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"0 "
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"1 "
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"2 "
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"3 "
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"4 "
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"5 "
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"6 "
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"7 "
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"8 "
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"9\n"
	]
	}
	],
	"prompt_number": 3
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Now, let's do some benchmarks."
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"with tb.openFile(\"test_tb\", \"r\") as f:\n",
	"    a = f.root.test\n",
	"    \n",
	"    # First, we load everything in RAM.\n",
	"    %timeit -r1 -n1 a[:,...]\n",
	"    \n",
	"    # Now, we load only 10% of the array.\n",
	"    %timeit -r1 -n1 a[::10,...]\n",
	"    \n",
	"    # Now, we load all in RAM, and then we select 10%.\n",
	"    %timeit -r1 -n1 a[:,...][::10,...]\n",
	"    \n",
	"    assert np.allclose(a[::10,...], a[:,...][::10,...])"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"1 loops, best of 1: 257 ms per loop\n",
	"1 loops, best of 1: 13.6 s per loop"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"\n",
	"1 loops, best of 1: 232 ms per loop"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"\n"
	]
	}
	],
	"prompt_number": 4
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Contiguous array"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Let's try with a contiguous array."
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"if not os.path.exists('test_tb_contiguous'):\n",
	"    # We create the file.\n",
	"    with tb.openFile(\"test_tb_contiguous\", \"w\") as f:\n",
	"        a = f.createArray('/', 'test', atom=tb.Float32Atom(),\n",
	"                           shape=(n, k))\n",
	"        # We fill the array progressively.\n",
	"        for i in range(10):\n",
	"            print i,\n",
	"            a[i*(n//10):(i+1)*(n//10)] = np.random.rand((n//10), k)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"0 "
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"1 "
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"2 "
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"3 "
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"4 "
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"5 "
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"6 "
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"7 "
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"8 "
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"9\n"
	]
	}
	],
	"prompt_number": 8
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"with tb.openFile(\"test_tb_contiguous\", \"r\") as f:\n",
	"    a = f.root.test\n",
	"    \n",
	"    # First, we load everything in RAM.\n",
	"    %timeit -r1 -n1 a[:,...]\n",
	"    \n",
	"    # Now, we load only 10% of the array.\n",
	"    %timeit -r1 -n1 a[::10,...]\n",
	"    \n",
	"    # Now, we load all in RAM, and then we select 10%.\n",
	"    %timeit -r1 -n1 a[:,...][::10,...]\n",
	"    \n",
	"    assert np.allclose(a[::10,...], a[:,...][::10,...])"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"1 loops, best of 1: 204 ms per loop\n",
	"1 loops, best of 1: 112 ms per loop"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"\n",
	"1 loops, best of 1: 204 ms per loop"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"\n"
	]
	}
	],
	"prompt_number": 9
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"All benchmarks performed on a desktop PC with 8GB RAM, Windows 8.1 64-bit, Python 2.7.5, PyTables 3.1.0."
	]
	}
	],
	"metadata": {}
	}
	]
	}
No results found