Created
December 9, 2016 19:07
-
-
Save harshvladha/43f42c579b111743fec9c298fe6b2ad6 to your computer and use it in GitHub Desktop.
Python MR Reducer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3.4
"""Word-count reducer for a streaming MapReduce job.

Reads ``word<TAB>count`` records from standard input and writes one
``word<TAB>total`` record per run of consecutive identical words to
standard output.  The input is expected to arrive sorted by word (the
comments in the original script state that the Hadoop MapReduce framework
sorts the mapper output), so each word forms exactly one run.
"""
from itertools import groupby
from operator import itemgetter
import sys


def _parse_record(line):
    """Return ``(word, count)`` for one input line, or ``None`` if malformed.

    A line is malformed when, after stripping surrounding whitespace, it has
    no tab separator or its count field is not an integer.  Such lines are
    discarded instead of aborting the whole reducer.
    """
    # partition() never raises on a missing separator, unlike unpacking the
    # result of split('\t', 1).
    word, separator, raw_count = line.strip().partition('\t')
    if not separator:
        return None
    try:
        return word, int(raw_count)
    except ValueError:
        # Discard the record if the count wasn't really a number.
        return None


def reduce_counts(lines):
    """Yield ``(word, total)`` for each run of consecutive equal words.

    Args:
        lines: iterable of raw ``word<TAB>count`` strings, one per record.

    Yields:
        Tuples of the word and the sum of the counts in its run, in input
        order.  Malformed lines are skipped and do not break a run.  Empty
        input yields nothing.
    """
    records = (
        record
        for record in map(_parse_record, lines)
        if record is not None
    )
    # groupby() only merges adjacent equal keys, which matches the
    # "input is already sorted" contract and keeps memory use constant.
    for word, run in groupby(records, key=itemgetter(0)):
        yield word, sum(count for _, count in run)


def main():
    """Reduce the records on STDIN and print the totals to STDOUT."""
    for word, total in reduce_counts(sys.stdin):
        print('{0}\t{1}'.format(word, total))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment