Created
December 21, 2013 07:22
-
-
Save sumanthprabhu/8066438 to your computer and use it in GitHub Desktop.
A map-reduce based python script to generate the count matrix. Given a set of input statements and the tags/documents each statement is associated with, this script can be used to count the number of times each term in the statement occurs for each tag/document. Pre-requisites: Each input statement must be pre-processed into the form "term1;term…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Count the number of occurrences of each term for each tag
(or in each document).

Arguments :
    Input file where each input line is of the form :
    term1;term2;term3.. , associated_tag1;associated_tag2..

Output:
    A count matrix with each line of the form
    "term|associated_tag|count"
'''
import sys | |
import nltk | |
from mrjob.job import MRJob | |
_STOP_WORDS = None  # English stop words, loaded from NLTK on first use


def _english_stop_words():
    '''
    Return the NLTK English stop words as a frozenset, loading them once.
    '''
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # A frozenset gives constant-time membership tests, and caching it
        # avoids rebuilding the stop-word list for every single term.
        _STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english'))
    return _STOP_WORDS


def is_useful(term):
    '''
    Checks if word is useful.

    A term is considered useful when it is at least 5 characters long
    and is not an English stop word (as listed by NLTK).

    Arguments :
        term : the term (string) to check
    Returns :
        True if the term is useful, False otherwise
    '''
    # Cheap length test first: short terms are rejected without touching
    # the stop-word corpus at all.
    if len(term) < 5:
        return False
    return term not in _english_stop_words()
class tagger(MRJob):
    '''
    Map-reduce job that counts how many times each term occurs for each
    tag (or document) it is associated with.
    '''

    # Set to True to count only the terms accepted by is_useful(), e.g. when
    # the output is fed to algorithms like tfidf. Requires nltk to be
    # installed together with its English stopwords corpus.
    FILTER_USEFUL_TERMS = False

    def mapper(self, _, line):
        '''
        Emit ((term, tag), 1) for every term/tag combination of a line.

        Each input line is assumed to be of the form :
        term1;term2;term3.. , associated_tag1;associated_tag2..

        Lines that do not consist of exactly two comma separated fields
        (blank lines included) are skipped and counted in the
        "tagger / malformed_lines" job counter instead of aborting the job.
        '''
        fields = line.split(",")
        if len(fields) != 2:
            self.increment_counter("tagger", "malformed_lines", 1)
            return

        # The documented format has spaces around the comma, so strip the
        # whitespace; otherwise "term " and "term" would be counted apart.
        # Empty entries (e.g. from a trailing ";") are dropped.
        terms = [term.strip() for term in fields[0].split(";")]
        tags = [tag.strip() for tag in fields[1].split(";")]
        tags = [tag for tag in tags if tag]

        for term in terms:
            if not term:
                continue
            if self.FILTER_USEFUL_TERMS and not is_useful(term):
                continue
            for tag in tags:
                # A (term, tag) pair is used as the key rather than a
                # "term:tag" string, so terms or tags that contain ":"
                # cannot break the reducer.
                yield (term, tag), 1

    def reducer(self, key, count):
        '''
        Reduce to form:
        term|associated_tag|count_of_occurrence

        Arguments :
            key   : the (term, tag) pair emitted by the mapper
            count : iterable of the partial counts for that pair
        '''
        term, tag = key
        # Written straight to stdout, exactly like the original print
        # statement did, but valid on both Python 2 and Python 3.
        sys.stdout.write("%s|%s|%d\n" % (term, tag, sum(count)))
if __name__ == "__main__":
    # Script entry point: let mrjob parse the command line and run the job.
    tagger.run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment