Skip to content

Instantly share code, notes, and snippets.

@THEMVFFINMAN
Created May 19, 2016 19:20
Show Gist options
  • Save THEMVFFINMAN/249c8047de3088ab8fbebb2220cd1d04 to your computer and use it in GitHub Desktop.
Save THEMVFFINMAN/249c8047de3088ab8fbebb2220cd1d04 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
#
# Adapted from script by Diana MacLean 2011
#
# Adapted for CS448G from Michael Noll's tutorial: http://www.michael-noll.com/tutorials/writing-an-hadoop-mapreduce-program-in-python/
#
#
from operator import itemgetter
import sys
import operator
# Words listed here are excluded from the reducer's output.
# A frozenset gives constant-time membership tests; the original list was
# scanned linearly once per word.
STOP_WORDS = frozenset([
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
    'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
    'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them',
    'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this',
    'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been',
    'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
    'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
    'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to',
    'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
    'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
    'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too',
    'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now',
])


def count_words(lines, stop_words=STOP_WORDS):
    """Sum the counts of every word found in tab-separated input lines.

    Each line is expected to look like ``word<TAB>count``.  Lines that
    cannot be parsed (no tab, or a count that is not an integer) are
    silently discarded, and words contained in *stop_words* are never
    collected.

    Args:
        lines: iterable of text lines, e.g. ``sys.stdin``.
        stop_words: container of words to leave out of the result.

    Returns:
        A dict mapping each remaining word to its summed count.
    """
    word2count = {}
    for line in lines:
        # remove leading and trailing whitespace
        line = line.strip()
        try:
            # Unpacking raises ValueError when the line has no tab, and
            # int() raises ValueError when the count is not a number;
            # both kinds of malformed line are dropped.
            word, count = line.split('\t', 1)
            count = int(count)
        except ValueError:
            continue
        if word in stop_words:
            continue
        word2count[word] = word2count.get(word, 0) + count
    return word2count


def sort_by_count(word2count):
    """Return the ``(word, count)`` pairs ordered by descending count."""
    # sorted() accepts the items view of Python 3 as well as the list
    # returned by Python 2, unlike calling .sort() on the items directly.
    return sorted(word2count.items(), key=operator.itemgetter(1), reverse=True)


def reduce_lines(lines, stop_words=STOP_WORDS):
    """Yield the output lines, ``word<TAB>count<NEWLINE>``, highest count first."""
    for word, count in sort_by_count(count_words(lines, stop_words)):
        yield '%s\t%s\n' % (word, count)


def main():
    """Read mapper output from STDIN and write the word totals to STDOUT."""
    sys.stdout.writelines(reduce_lines(sys.stdin))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment