d2207197 · December 17, 2012 13:32
diff --git a/tf-idf-mrjob.py b/tf-idf-mrjob.py
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 from __future__ import division
 from mrjob.job import MRJob
 from collections import Counter

 from math import log
 from nltk import word_tokenize

 import re
 import string


 class Tf_Idf(MRJob):
    """Two-step MRJob that scores vocabulary terms per mail with tf * N / df.

    Step 0 counts vocabulary terms in each ``<mail filename="...">`` input
    line and scales every count by ``NUM_DOCS / df``.  Step 1 regroups the
    scores by file name and divides them by their per-file sum.
    """

    # Corpus size N used in the tf * N / df weight.  Kept as a class
    # attribute so a subclass can adapt it to another corpus.
    NUM_DOCS = 54000

    # Compiled once here instead of being looked up for every input line.
    _MAIL_RE = re.compile(r'<mail filename="(?P<fname>\d+)">(?P<text>.*$)')

    # mapper_init 0
    def gen_tf_init(self, args=None):
        """Load the vocabulary from ``vocab.mail.txt`` into ``self.vocabs``.

        Opening the file inside gen_tf() would read it again for every
        input line, which is very slow, so it is read in this mapper_init
        hook instead.  ``args`` is accepted but not used.
        """
        # A set gives O(1) membership tests in gen_tf(), and the ``with``
        # block guarantees the file handle is closed.
        with open("vocab.mail.txt", "r") as vocab_file:
            self.vocabs = set(entry.strip() for entry in vocab_file)

    # mapper 0
    def gen_tf(self, _, line):
        """Yield ``term, (fname, count)`` for each vocabulary term in a mail.

        ``line`` is expected to look like ``<mail filename="123">text``.
        A line that does not match is recorded under the job counter
        ``tf_idf`` / ``malformed_line`` and skipped, rather than failing
        with AttributeError on the missing match object.
        """
        m = self._MAIL_RE.match(line)
        if m is None:
            self.increment_counter('tf_idf', 'malformed_line', 1)
            return

        fname = m.group('fname')
        # Escaped newline markers in the text become spaces before
        # tokenizing (word_tokenize comes from nltk).
        text = m.group('text').replace('\\\\n', ' ')
        term_cnt = Counter(
            term for term in word_tokenize(text) if term in self.vocabs)

        for term, count in term_cnt.items():
            yield term, (fname, count)

    # reducer 0
    def gen_tf_idf(self, term, fname_cnt):
        """Yield ``fname, (term, tf * NUM_DOCS / df)`` for each pair of term.

        ``fname_cnt`` yields ``(fname, tf)`` pairs; df is the number of
        pairs received for ``term``.
        """
        # Materialized because the pairs are needed twice: once to count
        # them and once to emit the weighted scores.
        postings = list(fname_cnt)
        # True division: the module imports ``division`` from __future__.
        idf = self.NUM_DOCS / len(postings)
        for fname, tf in postings:
            yield fname, (term, tf * idf)

    # mapper 1
    def just_cat(self, fname, terms_tfidf):
        """Identity mapper: pass each ``fname, terms_tfidf`` pair through."""
        yield fname, terms_tfidf

    # reducer 1
    def reduce_by_fname(self, fname, tf_idfs):
        """Yield ``fname`` with its ``[term, score]`` pairs scaled to sum 1.

        A new list is built instead of assigning into the incoming pairs,
        so the reducer also works when the pairs arrive as tuples.
        """
        pairs = list(tf_idfs)
        total = sum(score for _, score in pairs)
        yield fname, [[term, score / total] for term, score in pairs]

    # Define the map-reduce steps.
    def steps(self):
        """Return the two map-reduce steps in execution order."""
        return [
            self.mr(            # mapreduce step 0
                mapper_init=self.gen_tf_init,
                mapper=self.gen_tf,
                reducer=self.gen_tf_idf,
            ),
            self.mr(            # mapreduce step 1
                mapper=self.just_cat,
                reducer=self.reduce_by_fname
            )
        ]


 # Script entry point: run the job only when this file is executed directly.
 if __name__ == '__main__':
    Tf_Idf.run()

    # Author's notes on the value computed per (docid, term):
    #   docid term count
    #   docid term tf*N/df
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-
	from __future__ import division
	from mrjob.job import MRJob
	from collections import Counter

	from math import log
	from nltk import word_tokenize

	import re
	import string


	class Tf_Idf(MRJob):
	    """Two-step MRJob that scores vocabulary terms per mail with tf * N / df.

	    Step 0 counts vocabulary terms in each ``<mail filename="...">`` input
	    line and scales every count by ``NUM_DOCS / df``.  Step 1 regroups the
	    scores by file name and divides them by their per-file sum.
	    """

	    # Corpus size N used in the tf * N / df weight.  Kept as a class
	    # attribute so a subclass can adapt it to another corpus.
	    NUM_DOCS = 54000

	    # Compiled once here instead of being looked up for every input line.
	    _MAIL_RE = re.compile(r'<mail filename="(?P<fname>\d+)">(?P<text>.*$)')

	    # mapper_init 0
	    def gen_tf_init(self, args=None):
	        """Load the vocabulary from ``vocab.mail.txt`` into ``self.vocabs``.

	        Opening the file inside gen_tf() would read it again for every
	        input line, which is very slow, so it is read in this mapper_init
	        hook instead.  ``args`` is accepted but not used.
	        """
	        # A set gives O(1) membership tests in gen_tf(), and the ``with``
	        # block guarantees the file handle is closed.
	        with open("vocab.mail.txt", "r") as vocab_file:
	            self.vocabs = set(entry.strip() for entry in vocab_file)

	    # mapper 0
	    def gen_tf(self, _, line):
	        """Yield ``term, (fname, count)`` for each vocabulary term in a mail.

	        ``line`` is expected to look like ``<mail filename="123">text``.
	        A line that does not match is recorded under the job counter
	        ``tf_idf`` / ``malformed_line`` and skipped, rather than failing
	        with AttributeError on the missing match object.
	        """
	        m = self._MAIL_RE.match(line)
	        if m is None:
	            self.increment_counter('tf_idf', 'malformed_line', 1)
	            return

	        fname = m.group('fname')
	        # Escaped newline markers in the text become spaces before
	        # tokenizing (word_tokenize comes from nltk).
	        text = m.group('text').replace('\\\\n', ' ')
	        term_cnt = Counter(
	            term for term in word_tokenize(text) if term in self.vocabs)

	        for term, count in term_cnt.items():
	            yield term, (fname, count)

	    # reducer 0
	    def gen_tf_idf(self, term, fname_cnt):
	        """Yield ``fname, (term, tf * NUM_DOCS / df)`` for each pair of term.

	        ``fname_cnt`` yields ``(fname, tf)`` pairs; df is the number of
	        pairs received for ``term``.
	        """
	        # Materialized because the pairs are needed twice: once to count
	        # them and once to emit the weighted scores.
	        postings = list(fname_cnt)
	        # True division: the module imports ``division`` from __future__.
	        idf = self.NUM_DOCS / len(postings)
	        for fname, tf in postings:
	            yield fname, (term, tf * idf)

	    # mapper 1
	    def just_cat(self, fname, terms_tfidf):
	        """Identity mapper: pass each ``fname, terms_tfidf`` pair through."""
	        yield fname, terms_tfidf

	    # reducer 1
	    def reduce_by_fname(self, fname, tf_idfs):
	        """Yield ``fname`` with its ``[term, score]`` pairs scaled to sum 1.

	        A new list is built instead of assigning into the incoming pairs,
	        so the reducer also works when the pairs arrive as tuples.
	        """
	        pairs = list(tf_idfs)
	        total = sum(score for _, score in pairs)
	        yield fname, [[term, score / total] for term, score in pairs]

	    # Define the map-reduce steps.
	    def steps(self):
	        """Return the two map-reduce steps in execution order."""
	        return [
	            self.mr(            # mapreduce step 0
	                mapper_init=self.gen_tf_init,
	                mapper=self.gen_tf,
	                reducer=self.gen_tf_idf,
	            ),
	            self.mr(            # mapreduce step 1
	                mapper=self.just_cat,
	                reducer=self.reduce_by_fname
	            )
	        ]


	# Script entry point: run the job only when this file is executed directly.
	if __name__ == '__main__':
	    Tf_Idf.run()

	    # Author's notes on the value computed per (docid, term):
	    #   docid term count
	    #   docid term tf*N/df