Last active
August 29, 2015 13:57
-
-
Save rossant/9467085 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "metadata": { | |
| "name": "", | |
| "signature": "sha256:be2cab0cf304efb4c7e20ad93755c71ee5ec6f147a9d8a250536dc54ed4f6b3a" | |
| }, | |
| "nbformat": 3, | |
| "nbformat_minor": 0, | |
| "worksheets": [ | |
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Slicing huge arrays in PyTables" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Let's import PyTables and NumPy." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "import tables as tb\n", | |
| "import numpy as np\n", | |
| "import os" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 1 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Chunked array" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "We create one `(1M, 100)` extendable dataset (an `EArray`) with chunks of shape `(1000, 100)`." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "n, k = 1000000, 100" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 2 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "if not os.path.exists('test_tb'):\n", | |
| " # We create the file.\n", | |
| " with tb.openFile(\"test_tb\", \"w\") as f:\n", | |
| " a = f.createEArray('/', 'test', tb.Float32Atom(),\n", | |
| " (0, k), chunkshape=(1000, k))\n", | |
| " # We fill the array progressively.\n", | |
| " for i in range(10):\n", | |
| " print i,\n", | |
| " a.append(np.random.rand((n//10), k))" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "0 " | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "1 " | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "2 " | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "3 " | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "4 " | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "5 " | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "6 " | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "7 " | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "8 " | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "9\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 3 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Now, let's do some benchmarks." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "with tb.openFile(\"test_tb\", \"r\") as f:\n", | |
| " a = f.root.test\n", | |
| " \n", | |
| " # First, we load everything in RAM.\n", | |
| " %timeit -r1 -n1 a[:,...]\n", | |
| " \n", | |
| " # Now, we load only 10% of the array.\n", | |
| " %timeit -r1 -n1 a[::10,...]\n", | |
| " \n", | |
| " # Now, we load all in RAM, and then we select 10%.\n", | |
| " %timeit -r1 -n1 a[:,...][::10,...]\n", | |
| " \n", | |
| " assert np.allclose(a[::10,...], a[:,...][::10,...])" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "1 loops, best of 1: 257 ms per loop\n", | |
| "1 loops, best of 1: 13.6 s per loop" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1 loops, best of 1: 232 ms per loop" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 4 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Contiguous array" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Let's run the same benchmarks on a contiguous (non-chunked) array." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "if not os.path.exists('test_tb_contiguous'):\n", | |
| " # We create the file.\n", | |
| " with tb.openFile(\"test_tb_contiguous\", \"w\") as f:\n", | |
| " a = f.createArray('/', 'test', atom=tb.Float32Atom(),\n", | |
| " shape=(n, k))\n", | |
| " # We fill the array progressively.\n", | |
| " for i in range(10):\n", | |
| " print i,\n", | |
| " a[i*(n//10):(i+1)*(n//10)] = np.random.rand((n//10), k)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "0 " | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "1 " | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "2 " | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "3 " | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "4 " | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "5 " | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "6 " | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "7 " | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "8 " | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "9\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 8 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "with tb.openFile(\"test_tb_contiguous\", \"r\") as f:\n", | |
| " a = f.root.test\n", | |
| " \n", | |
| " # First, we load everything in RAM.\n", | |
| " %timeit -r1 -n1 a[:,...]\n", | |
| " \n", | |
| " # Now, we load only 10% of the array.\n", | |
| " %timeit -r1 -n1 a[::10,...]\n", | |
| " \n", | |
| " # Now, we load all in RAM, and then we select 10%.\n", | |
| " %timeit -r1 -n1 a[:,...][::10,...]\n", | |
| " \n", | |
| " assert np.allclose(a[::10,...], a[:,...][::10,...])" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "1 loops, best of 1: 204 ms per loop\n", | |
| "1 loops, best of 1: 112 ms per loop" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n", | |
| "1 loops, best of 1: 204 ms per loop" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 9 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "All benchmarks were performed on a desktop PC with 8GB RAM, Windows 8.1 64-bit, Python 2.7.5, PyTables 3.1.0." | |
| ] | |
| } | |
| ], | |
| "metadata": {} | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment