Skip to content

Instantly share code, notes, and snippets.

@harshvladha
Created December 9, 2016 19:07
Show Gist options
  • Save harshvladha/43f42c579b111743fec9c298fe6b2ad6 to your computer and use it in GitHub Desktop.
Save harshvladha/43f42c579b111743fec9c298fe6b2ad6 to your computer and use it in GitHub Desktop.
Python MR Reducer
#!/usr/bin/env python3.4
from operator import itemgetter
import sys
def _parse_record(line):
    """Parse one mapper output line into a ``(word, count)`` tuple.

    A valid line has the form ``word<TAB>count`` where ``count`` is an
    integer. Returns ``None`` for anything else (blank line, missing tab,
    non-numeric count) so the caller can skip it.
    """
    #remove all the leading and trailing whitespace, then split on the
    #first tab only; partition never raises when the tab is missing
    word, sep, raw_count = line.strip().partition('\t')
    if not sep:
        return None
    try:
        return word, int(raw_count)
    except ValueError:
        #discard if it wasn't really a number
        return None


def reduce_counts(lines):
    """Sum the counts of consecutive equal words.

    ``lines`` is any iterable of ``word<TAB>count`` strings in which equal
    words are adjacent (the reducer relies on its input being sorted by
    key). Malformed lines are ignored. Yields one ``(word, total)`` tuple
    per run of equal words, in input order.
    """
    current_word = None
    current_count = 0
    for line in lines:
        record = _parse_record(line)
        if record is None:
            continue
        word, count = record
        if word == current_word:
            current_count += count
            continue
        #a new word starts: emit the finished group before resetting
        if current_word is not None:
            yield current_word, current_count
        current_word = word
        current_count = count
    #IMPORTANT
    #the last group has no following word to trigger its output, so flush
    #it here; test against None rather than the last parsed word so that a
    #trailing malformed line cannot suppress it
    if current_word is not None:
        yield current_word, current_count


def main():
    """Read mapper output from STDIN and write ``word<TAB>total`` to STDOUT."""
    for word, total in reduce_counts(sys.stdin):
        print('{0}\t{1}'.format(word, total))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment