Created
May 19, 2016 19:20
-
-
Save THEMVFFINMAN/249c8047de3088ab8fbebb2220cd1d04 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# | |
# Adapted from script by Diana MacLean 2011 | |
# | |
# Adapted for CS448G from Michael Noll's tutorial: http://www.michael-noll.com/tutorials/writing-an-hadoop-mapreduce-program-in-python/
# | |
# | |
from operator import itemgetter | |
import sys | |
import operator | |
# Words that are filtered out of the output: a word that IS in this set is
# never printed.  A frozenset gives constant-time membership tests.
# A better design might be to never collect their counts in the first place.
STOP_WORDS = frozenset([
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
    'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
    'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
    'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
    'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
    'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for',
    'with', 'about', 'against', 'between', 'into', 'through', 'during',
    'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in',
    'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
    'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
    'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
    'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's',
    't', 'can', 'will', 'just', 'don', 'should', 'now',
])


def count_words(lines):
    """Sum the counts of each word found in tab-separated input lines.

    Each line is expected to look like ``word<TAB>count`` (the output of
    mapper.py).  Lines that have no tab separator, or whose count is not
    an integer, are silently discarded.

    Args:
        lines: an iterable of strings, e.g. ``sys.stdin``.

    Returns:
        A dict mapping each word to the total of its counts.
    """
    word2count = {}
    for line in lines:
        # remove leading and trailing whitespace
        line = line.strip()
        try:
            # Both the unpacking (no tab in the line) and the int()
            # conversion (non-numeric count) raise ValueError.
            word, count = line.split('\t', 1)
            count = int(count)
        except ValueError:
            # malformed line: ignore/discard it
            continue
        word2count[word] = word2count.get(word, 0) + count
    return word2count


def reduce_counts(lines):
    """Return the (word, count) pairs to emit for the given input lines.

    The pairs are sorted by count in descending order, and every word
    listed in STOP_WORDS is left out.

    Args:
        lines: an iterable of ``word<TAB>count`` strings.

    Returns:
        A list of ``(word, count)`` tuples.
    """
    # sorted() works on the items of a dict under both Python 2 and 3,
    # unlike calling .sort() on the result of dict.items().
    counted_words = sorted(
        count_words(lines).items(),
        key=operator.itemgetter(1),
        reverse=True,
    )
    return [
        (word, count)
        for word, count in counted_words
        if word not in STOP_WORDS
    ]


def main():
    """Read mapper output from STDIN and write the word totals to STDOUT."""
    for word, count in reduce_counts(sys.stdin):
        # Parenthesised single-argument print behaves the same on
        # Python 2 and Python 3.
        print('%s\t%s' % (word, count))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment