Skip to content

Instantly share code, notes, and snippets.

@kforeman
Created May 7, 2014 21:52
Show Gist options
  • Save kforeman/9d4e2b44bc5fc935d0fb to your computer and use it in GitHub Desktop.
Save kforeman/9d4e2b44bc5fc935d0fb to your computer and use it in GitHub Desktop.
map-reduce for efficient histograms in python
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Efficient Histograms"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<table>\n",
"<tr>\n",
" <th>Author</th>\n",
" <td>Kyle Foreman</td>\n",
"</tr>\n",
"<tr>\n",
" <td></td>\n",
" <td>[email protected]</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Date</th>\n",
" <td>07 May 2014</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Purpose</th>\n",
" <td>use map-reduce and multiprocessing to quickly generate histograms with lower memory overhead</td>\n",
"</tr>\n",
"</table>"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import numpy as np\n",
"from multiprocessing import Pool"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Simulate Data"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# simulate data\n",
"size = 1e8\n",
"data = np.random.normal(0, 5, size)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Find Bins"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# find min/max\n",
"mn = data.min() - 1e-5\n",
"mx = data.max() + 1e-5\n",
"\n",
"# find bins\n",
"num_bins = 20\n",
"bins = np.linspace(mn, mx, num_bins+1)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Map-Reduce Method"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# function to find bin counts for elements an array\n",
"def find_bin_counts(x, bins=bins, array=data):\n",
" # Note: you'll probably want some way of loading the data dynamically so it doesn't all have to be in memory\n",
" binned = np.digitize(array[x[0]:x[1]], bins)\n",
" return np.bincount(binned, minlength=num_bins+1)[1:] \n",
" # Note: not sure why there always seems to be an extra 0 at the front - seems to be built in to bincount to always count the number of zeros (of which we have none)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# create a pool\n",
"num_proc = 10\n",
"pool = Pool(processes=num_proc)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# partition data\n",
"def partition(data, l):\n",
" for i in xrange(0, len(data), l):\n",
" yield [i,i+l]\n",
"partitions = list(partition(data, len(data)/num_proc))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# wrap the whole thing up in a function\n",
"def fast_hist(data=data, partitions=partitions, bins=bins):\n",
" # first map\n",
" mapped = pool.map(find_bin_counts, partitions)\n",
" # then reduce\n",
" reduced = np.vstack(mapped).sum(axis=0)\n",
" return reduced"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 7
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Check Results"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# check to make sure map/reduce gets the same result as numpy's built in histogram\n",
"result1 = fast_hist()\n",
"result2 = np.histogram(data, bins=bins)\n",
"np.array_equal(result1, result2[0])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 8,
"text": [
"True"
]
}
],
"prompt_number": 8
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Compare Timing"
]
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Numpy"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%%timeit \n",
"hist_result = np.histogram(data, bins=bins)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"1 loops, best of 3: 7.22 s per loop\n"
]
}
],
"prompt_number": 9
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Map-Reduce"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%%timeit\n",
"fast_hist()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"1 loops, best of 3: 924 ms per loop\n"
]
}
],
"prompt_number": 10
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment