Created
December 17, 2012 13:32
-
-
Save d2207197/4318298 to your computer and use it in GitHub Desktop.
NLP 課程 Lab11 的程式碼
https://sites.google.com/site/nthunlplab2012/lecctures/lab11
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
from __future__ import division | |
from mrjob.job import MRJob | |
from collections import Counter | |
from math import log | |
from nltk import word_tokenize | |
import re | |
import string | |
class Tf_Idf(MRJob):
    """Two-step MRJob that computes normalised TF-IDF weights per mail.

    Step 0 counts the vocabulary terms found in each mail and scales every
    count by ``NUM_DOCS / df``, where ``df`` is the number of mails that
    contain the term.  Step 1 regroups those weights by mail and rescales
    them so that the weights of one mail sum to 1.

    Each input line is expected to look like
    ``<mail filename="123">text of the mail``; the vocabulary is read from
    ``VOCAB_PATH`` (one term per line).
    """

    # N in the ``tf * N / df`` weight -- presumably the number of mails in
    # the corpus; override in a subclass for a different corpus.
    NUM_DOCS = 54000

    # Vocabulary file, one term per line, opened relative to the working
    # directory of the mapper task.
    VOCAB_PATH = "vocab.mail.txt"

    # Compiled once instead of being looked up for every input line.
    _MAIL_LINE_RE = re.compile(
        r'<mail filename="(?P<fname>\d+)">(?P<text>.*$)')

    # mapper_init, step 0
    def gen_tf_init(self, args=None):
        """Load the vocabulary into ``self.vocabs`` before mapping starts.

        Opening the file inside the step-0 mapper (``gen_tf``) would reopen
        it for every input line, which is very slow, so it is read here
        instead.  The terms are kept in a set so that the membership test
        in ``gen_tf`` is O(1), and the file is closed as soon as it has
        been read.

        :param args: unused; kept for backward compatibility.
        """
        with open(self.VOCAB_PATH, "r") as vocab_file:
            self.vocabs = {line.strip() for line in vocab_file}

    # mapper, step 0
    def gen_tf(self, _, line):
        """Emit ``term, (fname, count)`` for each vocabulary term in a mail.

        :param _: ignored input key.
        :param line: one raw input line holding a single mail.

        Lines that do not match the expected ``<mail filename="...">``
        layout are skipped (and counted) instead of aborting the job.
        """
        m = self._MAIL_LINE_RE.match(line)
        if m is None:
            self.increment_counter('Tf_Idf', 'malformed_lines', 1)
            return

        fname = m.group('fname')
        # The literal below is two backslashes followed by "n"; every
        # occurrence in the mail text becomes a space before tokenising.
        text = m.group('text').replace('\\\\n', ' ')
        term_cnt = Counter(
            term for term in word_tokenize(text) if term in self.vocabs)
        for term, cnt in term_cnt.items():
            yield term, (fname, cnt)

    # reducer, step 0
    def gen_tf_idf(self, term, fname_cnt):
        """Emit ``fname, (term, tf * NUM_DOCS / df)`` for one term.

        :param term: the vocabulary term being reduced.
        :param fname_cnt: iterable of ``(fname, tf)`` pairs, one per mail
            that contains ``term``.
        """
        # Materialise the stream: it is needed once for df and once more
        # to emit the weights.
        postings = list(fname_cnt)
        df = len(postings)
        # Relies on true division (``from __future__ import division``).
        idf = self.NUM_DOCS / df
        for fname, tf in postings:
            yield fname, (term, tf * idf)

    # mapper, step 1
    def just_cat(self, fname, terms_tfidf):
        """Identity mapper: forward each ``fname, (term, tf_idf)`` pair."""
        yield fname, terms_tfidf

    # reducer, step 1
    def reduce_by_fname(self, fname, tf_idfs):
        """Emit ``fname, [[term, weight], ...]`` with weights summing to 1.

        :param fname: mail identifier.
        :param tf_idfs: iterable of ``(term, tf_idf)`` pairs for this mail.

        A new list is built rather than assigning into the incoming pairs,
        so the method works whether the pairs arrive as lists or tuples.
        """
        pairs = list(tf_idfs)
        tf_idf_sum = sum(tf_idf for _, tf_idf in pairs)
        yield fname, [[term, tf_idf / tf_idf_sum] for term, tf_idf in pairs]

    # Define the MapReduce steps.
    def steps(self):
        """Return the two MapReduce steps of the job, in execution order."""
        # NOTE(review): ``self.mr`` is the step constructor the original
        # code uses; confirm it is still available in the installed mrjob
        # release before upgrading the dependency.
        return [
            self.mr(  # MapReduce step 0
                mapper_init=self.gen_tf_init,
                mapper=self.gen_tf,
                reducer=self.gen_tf_idf,
            ),
            self.mr(  # MapReduce step 1
                mapper=self.just_cat,
                reducer=self.reduce_by_fname,
            ),
        ]
if __name__ == '__main__':
    # Script entry point: hand control to the job's own runner, which
    # handles the command line.
    Tf_Idf.run()
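# Record contents per stage (docid is the mail's filename attribute):
#   step 0 mapper output:  docid, term, count
#   step 0 reducer output: docid, term, tf * N / df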
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment