Last active
August 29, 2015 14:04
-
-
Save hahastudio/dcfc9ec8d2bc7e548aee to your computer and use it in GitHub Desktop.
A simple MapReduce model in Python that introduces the concept of MapReduce using the word count problem.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A tiny, single-process illustration of the MapReduce model applied to the
# word count problem: map each line to (word, 1) pairs, combine (sort) the
# pairs so equal words are adjacent, then reduce each run of equal words to
# a single (word, count) pair.
from functools import reduce  # builtin on Python 2 only; must be imported on Python 3
from itertools import groupby
from operator import add, itemgetter

# A sample input of a word count problem
source = ["Here is the first line in this source",
          "And Here is the second line in this source",
          "Welcome to the third line in this source"]

# map stage: every line is turned into a list of (lowercased word, 1) pairs.
# list(...) materializes the result: on Python 3 map() is lazy and would be
# exhausted after the combine stage consumed it once.
map_result = list(map(lambda line: [(word.lower(), 1) for word in line.split()],
                      source))
# [[('here', 1), ('is', 1), ('the', 1), ('first', 1),
#   ('line', 1), ('in', 1), ('this', 1), ('source', 1)],
#  [('and', 1), ('here', 1), ('is', 1), ('the', 1), ('second', 1),
#   ('line', 1), ('in', 1), ('this', 1), ('source', 1)],
#  [('welcome', 1), ('to', 1), ('the', 1), ('third', 1),
#   ('line', 1), ('in', 1), ('this', 1), ('source', 1)]]

# combine stage: flatten the per-line lists and sort by word, so that all
# pairs sharing a key are adjacent (groupby only merges adjacent items).
combine_result = sorted(
    (key_val for key_values in map_result for key_val in key_values),
    key=itemgetter(0))
# [('and', 1),
#  ('first', 1),
#  ('here', 1), ('here', 1),
#  ('in', 1), ('in', 1), ('in', 1),
#  ('is', 1), ('is', 1),
#  ('line', 1), ('line', 1), ('line', 1),
#  ('second', 1),
#  ('source', 1), ('source', 1), ('source', 1),
#  ('the', 1), ('the', 1), ('the', 1),
#  ('third', 1),
#  ('this', 1), ('this', 1), ('this', 1),
#  ('to', 1),
#  ('welcome', 1)]

# reduce stage: fold the values of each run of equal keys into one total.
# Every group produced by groupby is non-empty, so reduce needs no initial value.
reduce_result = [(key, reduce(add, (v for _, v in key_values)))
                 for key, key_values in groupby(combine_result, key=itemgetter(0))]
# [('and', 1),
#  ('first', 1),
#  ('here', 2),
#  ('in', 3),
#  ('is', 2),
#  ('line', 3),
#  ('second', 1),
#  ('source', 3),
#  ('the', 3),
#  ('third', 1),
#  ('this', 3),
#  ('to', 1),
#  ('welcome', 1)]

# And the same three stages as a single expression!
# (The lazy map() is fine here because it is consumed exactly once.)
result = [(key, reduce(add, (v for _, v in key_values)))
          for key, key_values in groupby(
              sorted((key_val
                      for key_values in
                      map(lambda line: [(word.lower(), 1) for word in line.split()],
                          source)
                      for key_val in key_values),
                     key=itemgetter(0)),
              key=itemgetter(0))]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment