kforeman · May 7, 2014 21:52
diff --git a/histogram.ipynb b/histogram.ipynb
 {
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Efficient Histograms"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "<table>\n",
      "<tr>\n",
      "    <th>Author</th>\n",
      "    <td>Kyle Foreman</td>\n",
      "</tr>\n",
      "<tr>\n",
      "    <td></td>\n",
      "    <td>[email protected]</td>\n",
      "</tr>\n",
      "<tr>\n",
      "    <th>Date</th>\n",
      "    <td>07 May 2014</td>\n",
      "</tr>\n",
      "<tr>\n",
      "    <th>Purpose</th>\n",
      "    <td>use map-reduce and multiprocessing to quickly generate histograms with lower memory overhead</td>\n",
      "</tr>\n",
      "</table>"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import numpy as np\n",
      "from multiprocessing import Pool"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "Simulate Data"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# simulate data\n",
      "size = 1e8\n",
      "data = np.random.normal(0, 5, size)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "Find Bins"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# find min/max\n",
      "mn = data.min() - 1e-5\n",
      "mx = data.max() + 1e-5\n",
      "\n",
      "# find bins\n",
      "num_bins = 20\n",
      "bins = np.linspace(mn, mx, num_bins+1)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "Map-Reduce Method"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# function to find bin counts for elements an array\n",
      "def find_bin_counts(x, bins=bins, array=data):\n",
      "    # Note: you'll probably want some way of loading the data dynamically so it doesn't all have to be in memory\n",
      "    binned = np.digitize(array[x[0]:x[1]], bins)\n",
      "    return np.bincount(binned, minlength=num_bins+1)[1:] \n",
      "    # Note: not sure why there always seems to be an extra 0 at the front - seems to be built in to bincount to always count the number of zeros (of which we have none)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# create a pool\n",
      "num_proc = 10\n",
      "pool = Pool(processes=num_proc)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# partition data\n",
      "def partition(data, l):\n",
      "  for i in xrange(0, len(data), l):\n",
      "    yield [i,i+l]\n",
      "partitions = list(partition(data, len(data)/num_proc))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# wrap the whole thing up in a function\n",
      "def fast_hist(data=data, partitions=partitions, bins=bins):\n",
      "    # first map\n",
      "    mapped = pool.map(find_bin_counts, partitions)\n",
      "    # then reduce\n",
      "    reduced = np.vstack(mapped).sum(axis=0)\n",
      "    return reduced"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 7
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "Check Results"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# check to make sure map/reduce gets the same result as numpy's built in histogram\n",
      "result1 = fast_hist()\n",
      "result2 = np.histogram(data, bins=bins)\n",
      "np.array_equal(result1, result2[0])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 8,
       "text": [
        "True"
       ]
      }
     ],
     "prompt_number": 8
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "Compare Timing"
     ]
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": [
      "Numpy"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%timeit \n",
      "hist_result = np.histogram(data, bins=bins)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "1 loops, best of 3: 7.22 s per loop\n"
       ]
      }
     ],
     "prompt_number": 9
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": [
      "Map-Reduce"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%timeit\n",
      "fast_hist()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "1 loops, best of 3: 924 ms per loop\n"
       ]
      }
     ],
     "prompt_number": 10
    }
   ],
   "metadata": {}
  }
 ]
 }
	{
	"metadata": {
	"name": ""
	},
	"nbformat": 3,
	"nbformat_minor": 0,
	"worksheets": [
	{
	"cells": [
	{
	"cell_type": "heading",
	"level": 1,
	"metadata": {},
	"source": [
	"Efficient Histograms"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"<table>\n",
	"<tr>\n",
	" <th>Author</th>\n",
	" <td>Kyle Foreman</td>\n",
	"</tr>\n",
	"<tr>\n",
	" <td></td>\n",
	" <td>[email protected]</td>\n",
	"</tr>\n",
	"<tr>\n",
	" <th>Date</th>\n",
	" <td>07 May 2014</td>\n",
	"</tr>\n",
	"<tr>\n",
	" <th>Purpose</th>\n",
	" <td>use map-reduce and multiprocessing to quickly generate histograms with lower memory overhead</td>\n",
	"</tr>\n",
	"</table>"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"import numpy as np\n",
	"from multiprocessing import Pool"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 1
	},
	{
	"cell_type": "heading",
	"level": 3,
	"metadata": {},
	"source": [
	"Simulate Data"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# simulate data\n",
	"size = 1e8\n",
	"data = np.random.normal(0, 5, size)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 2
	},
	{
	"cell_type": "heading",
	"level": 3,
	"metadata": {},
	"source": [
	"Find Bins"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# find min/max\n",
	"mn = data.min() - 1e-5\n",
	"mx = data.max() + 1e-5\n",
	"\n",
	"# find bins\n",
	"num_bins = 20\n",
	"bins = np.linspace(mn, mx, num_bins+1)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 3
	},
	{
	"cell_type": "heading",
	"level": 3,
	"metadata": {},
	"source": [
	"Map-Reduce Method"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# function to find bin counts for elements an array\n",
	"def find_bin_counts(x, bins=bins, array=data):\n",
	" # Note: you'll probably want some way of loading the data dynamically so it doesn't all have to be in memory\n",
	" binned = np.digitize(array[x[0]:x[1]], bins)\n",
	" return np.bincount(binned, minlength=num_bins+1)[1:] \n",
	" # Note: not sure why there always seems to be an extra 0 at the front - seems to be built in to bincount to always count the number of zeros (of which we have none)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 4
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# create a pool\n",
	"num_proc = 10\n",
	"pool = Pool(processes=num_proc)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 5
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# partition data\n",
	"def partition(data, l):\n",
	" for i in xrange(0, len(data), l):\n",
	" yield [i,i+l]\n",
	"partitions = list(partition(data, len(data)/num_proc))"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 6
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# wrap the whole thing up in a function\n",
	"def fast_hist(data=data, partitions=partitions, bins=bins):\n",
	" # first map\n",
	" mapped = pool.map(find_bin_counts, partitions)\n",
	" # then reduce\n",
	" reduced = np.vstack(mapped).sum(axis=0)\n",
	" return reduced"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 7
	},
	{
	"cell_type": "heading",
	"level": 3,
	"metadata": {},
	"source": [
	"Check Results"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# check to make sure map/reduce gets the same result as numpy's built in histogram\n",
	"result1 = fast_hist()\n",
	"result2 = np.histogram(data, bins=bins)\n",
	"np.array_equal(result1, result2[0])"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 8,
	"text": [
	"True"
	]
	}
	],
	"prompt_number": 8
	},
	{
	"cell_type": "heading",
	"level": 3,
	"metadata": {},
	"source": [
	"Compare Timing"
	]
	},
	{
	"cell_type": "heading",
	"level": 4,
	"metadata": {},
	"source": [
	"Numpy"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"%%timeit \n",
	"hist_result = np.histogram(data, bins=bins)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"1 loops, best of 3: 7.22 s per loop\n"
	]
	}
	],
	"prompt_number": 9
	},
	{
	"cell_type": "heading",
	"level": 4,
	"metadata": {},
	"source": [
	"Map-Reduce"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"%%timeit\n",
	"fast_hist()"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"1 loops, best of 3: 924 ms per loop\n"
	]
	}
	],
	"prompt_number": 10
	}
	],
	"metadata": {}
	}
	]
	}