Created
May 7, 2014 21:52
-
-
Save kforeman/9d4e2b44bc5fc935d0fb to your computer and use it in GitHub Desktop.
map-reduce for efficient histograms in python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"Efficient Histograms" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"<table>\n", | |
"<tr>\n", | |
" <th>Author</th>\n", | |
" <td>Kyle Foreman</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
" <td></td>\n", | |
" <td>[email protected]</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
" <th>Date</th>\n", | |
" <td>07 May 2014</td>\n", | |
"</tr>\n", | |
"<tr>\n", | |
" <th>Purpose</th>\n", | |
" <td>use map-reduce and multiprocessing to quickly generate histograms with lower memory overhead</td>\n", | |
"</tr>\n", | |
"</table>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import numpy as np\n", | |
"from multiprocessing import Pool" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Simulate Data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# simulate data\n", | |
"size = 1e8\n", | |
"data = np.random.normal(0, 5, size)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Find Bins" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# find min/max\n", | |
"mn = data.min() - 1e-5\n", | |
"mx = data.max() + 1e-5\n", | |
"\n", | |
"# find bins\n", | |
"num_bins = 20\n", | |
"bins = np.linspace(mn, mx, num_bins+1)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Map-Reduce Method" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# function to find bin counts for elements an array\n", | |
"def find_bin_counts(x, bins=bins, array=data):\n", | |
" # Note: you'll probably want some way of loading the data dynamically so it doesn't all have to be in memory\n", | |
" binned = np.digitize(array[x[0]:x[1]], bins)\n", | |
" return np.bincount(binned, minlength=num_bins+1)[1:] \n", | |
" # Note: not sure why there always seems to be an extra 0 at the front - seems to be built in to bincount to always count the number of zeros (of which we have none)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# create a pool\n", | |
"num_proc = 10\n", | |
"pool = Pool(processes=num_proc)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# partition data\n", | |
"def partition(data, l):\n", | |
" for i in xrange(0, len(data), l):\n", | |
" yield [i,i+l]\n", | |
"partitions = list(partition(data, len(data)/num_proc))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# wrap the whole thing up in a function\n", | |
"def fast_hist(data=data, partitions=partitions, bins=bins):\n", | |
" # first map\n", | |
" mapped = pool.map(find_bin_counts, partitions)\n", | |
" # then reduce\n", | |
" reduced = np.vstack(mapped).sum(axis=0)\n", | |
" return reduced" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Check Results" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# check to make sure map/reduce gets the same result as numpy's built in histogram\n", | |
"result1 = fast_hist()\n", | |
"result2 = np.histogram(data, bins=bins)\n", | |
"np.array_equal(result1, result2[0])" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 8, | |
"text": [ | |
"True" | |
] | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Compare Timing" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 4, | |
"metadata": {}, | |
"source": [ | |
"Numpy" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%%timeit \n", | |
"hist_result = np.histogram(data, bins=bins)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"1 loops, best of 3: 7.22 s per loop\n" | |
] | |
} | |
], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 4, | |
"metadata": {}, | |
"source": [ | |
"Map-Reduce" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%%timeit\n", | |
"fast_hist()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"1 loops, best of 3: 924 ms per loop\n" | |
] | |
} | |
], | |
"prompt_number": 10 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment