sumanthprabhu · December 21, 2013 07:22
diff --git a/count_matrix.py b/count_matrix.py
 '''
    Count the number of occurrences of each term for each tag
    (or in each document) 

    Arguments :
      Input file where each input line is of the form :
        term1;term2;term3.. , associated_tag1;associated_tag2..
        
    Output: 
      Basically a count matrix with each line of the form 
        "term | associated tag | count"

 '''

 import sys
 import nltk

 from mrjob.job import MRJob


 def is_useful(term, min_length=5):
    '''
        Decide whether a term is worth counting.

        A term is rejected when it has fewer than `min_length`
        characters or when it is in NLTK's English stop-word list.

        Arguments :
          term       -- the candidate word (a string)
          min_length -- minimum number of characters a useful term
                        must have; defaults to 5, the threshold that
                        used to be hard-coded

        Returns :
          True if the term passes both checks, False otherwise.

        The NLTK "stopwords" corpus must be available for terms that
        pass the length check.
    '''

    # Cheap length test first, so short terms never touch the corpus.
    if len(term) < min_length:
        return False

    # Reading the corpus on every call is slow and yields a list with
    # linear-time membership tests. Load it once and keep it as a
    # frozenset on the function object instead.
    stop_words = getattr(is_useful, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(nltk.corpus.stopwords.words('english'))
        is_useful._stop_words = stop_words

    return term not in stop_words


 class tagger(MRJob):
    '''
        MapReduce job that counts how often each term occurs with
        each tag.

        Input lines have the form :
          term1;term2;term3.. , associated_tag1;associated_tag2..

        Output lines have the form :
          term|associated_tag|count
    '''

    def output_protocol(self):
        '''
            Write reducer values as raw text lines, so the job output
            is exactly "term|tag|count" with no encoded key in front.
        '''
        # Imported locally so this class does not need a new
        # module-level import.
        from mrjob.protocol import RawValueProtocol
        return RawValueProtocol()

    def mapper(self, _, line):
        '''
            Each input line is assumed to be of the form :
             term1;term2;term3.. , associated_tag1;associated_tag2..

            Yields ((term, tag), 1) for every term/tag pair on the
            line. Lines that do not contain exactly one "," are
            skipped and counted in the "tagger/malformed_lines"
            counter instead of aborting the job.
        '''

        parts = line.split(",")
        if len(parts) != 2:
            self.increment_counter("tagger", "malformed_lines", 1)
            return

        # The documented format has spaces around the comma, so strip
        # every field; drop fields that end up empty (e.g. from a
        # trailing ";").
        terms = [t.strip() for t in parts[0].split(";")]
        tags = [t.strip() for t in parts[1].split(";")]
        terms = [t for t in terms if t]
        tags = [t for t in tags if t]

        for term in terms:
            # To keep only useful terms (e.g. when the output feeds an
            # algorithm like tfidf), guard the inner loop with
            # `if is_useful(term):` -- this needs nltk installed.
            for tag in tags:
                # A (term, tag) pair as key stays unambiguous even if
                # a term or tag contains ":".
                yield (term, tag), 1

    def reducer(self, key, count):
        '''
            Reduce to form:
            term|associated_tag|count_of_occurrences

            `key` is the (term, tag) pair emitted by the mapper and
            `count` is an iterable of the partial counts for that pair.
        '''
        term, tag = key
        # Yield instead of printing so the line goes through mrjob's
        # output protocol; the key is ignored by the raw value protocol.
        yield None, "%s|%s|%d" % (term, tag, sum(count))


 # Launch the job only when this file is executed as a script,
 # not when it is imported as a module.
 if __name__ == "__main__":
    tagger.run()
	'''
	Count the number of occurrences of each term for each tag
	(or in each document)

	Arguments :
	Input file where each input line is of the form :
	term1;term2;term3.. , associated_tag1;associated_tag2..

	Output:
	Basically a count matrix with each line of the form
	"term | associated tag | count"

	'''

	import sys
	import nltk

	from mrjob.job import MRJob


	def is_useful(term, min_length=5):
	    '''
	        Decide whether a term is worth counting.

	        A term is rejected when it has fewer than `min_length`
	        characters or when it is in NLTK's English stop-word list.

	        Arguments :
	          term       -- the candidate word (a string)
	          min_length -- minimum number of characters a useful term
	                        must have; defaults to 5, the threshold that
	                        used to be hard-coded

	        Returns :
	          True if the term passes both checks, False otherwise.

	        The NLTK "stopwords" corpus must be available for terms that
	        pass the length check.
	    '''

	    # Cheap length test first, so short terms never touch the corpus.
	    if len(term) < min_length:
	        return False

	    # Reading the corpus on every call is slow and yields a list with
	    # linear-time membership tests. Load it once and keep it as a
	    # frozenset on the function object instead.
	    stop_words = getattr(is_useful, '_stop_words', None)
	    if stop_words is None:
	        stop_words = frozenset(nltk.corpus.stopwords.words('english'))
	        is_useful._stop_words = stop_words

	    return term not in stop_words


	class tagger(MRJob):
	    '''
	        MapReduce job that counts how often each term occurs with
	        each tag.

	        Input lines have the form :
	          term1;term2;term3.. , associated_tag1;associated_tag2..

	        Output lines have the form :
	          term|associated_tag|count
	    '''

	    def output_protocol(self):
	        '''
	            Write reducer values as raw text lines, so the job output
	            is exactly "term|tag|count" with no encoded key in front.
	        '''
	        # Imported locally so this class does not need a new
	        # module-level import.
	        from mrjob.protocol import RawValueProtocol
	        return RawValueProtocol()

	    def mapper(self, _, line):
	        '''
	            Each input line is assumed to be of the form :
	             term1;term2;term3.. , associated_tag1;associated_tag2..

	            Yields ((term, tag), 1) for every term/tag pair on the
	            line. Lines that do not contain exactly one "," are
	            skipped and counted in the "tagger/malformed_lines"
	            counter instead of aborting the job.
	        '''

	        parts = line.split(",")
	        if len(parts) != 2:
	            self.increment_counter("tagger", "malformed_lines", 1)
	            return

	        # The documented format has spaces around the comma, so strip
	        # every field; drop fields that end up empty (e.g. from a
	        # trailing ";").
	        terms = [t.strip() for t in parts[0].split(";")]
	        tags = [t.strip() for t in parts[1].split(";")]
	        terms = [t for t in terms if t]
	        tags = [t for t in tags if t]

	        for term in terms:
	            # To keep only useful terms (e.g. when the output feeds an
	            # algorithm like tfidf), guard the inner loop with
	            # `if is_useful(term):` -- this needs nltk installed.
	            for tag in tags:
	                # A (term, tag) pair as key stays unambiguous even if
	                # a term or tag contains ":".
	                yield (term, tag), 1

	    def reducer(self, key, count):
	        '''
	            Reduce to form:
	            term|associated_tag|count_of_occurrences

	            `key` is the (term, tag) pair emitted by the mapper and
	            `count` is an iterable of the partial counts for that pair.
	        '''
	        term, tag = key
	        # Yield instead of printing so the line goes through mrjob's
	        # output protocol; the key is ignored by the raw value protocol.
	        # The separators are plain "|" (no backslash escapes).
	        yield None, "%s|%s|%d" % (term, tag, sum(count))


	# Launch the job only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    tagger.run()