Calculate tf-idf for texts in Japanese.
Last active
February 4, 2017 15:18
-
-
Save shotahorii/8412933 to your computer and use it in GitHub Desktop.
tfidf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import MeCab | |
import math | |
from collections import Counter | |
def nounlist(text):
    """Return the nouns contained in a Japanese text as a list.

    The text is analysed with a MeCab tagger. Each line of the tagger
    output is split on tab characters: the first field is taken as the
    token's surface form and the second as a comma-separated feature
    string. A token is kept when the first feature equals '名詞' (noun).

    Args:
        text: The Japanese text to analyse.

    Returns:
        A list of surface forms of the nouns, in the order in which they
        appear in the tagger output (duplicates are preserved).
    """
    mecab = MeCab.Tagger('mecabrc')
    nouns = []
    for row in mecab.parse(text).split('\n'):
        # Skip blank lines and the end-of-sentence marker.
        if row == '' or row == 'EOS':
            continue
        fields = row.split('\t')
        # A line without a tab has no feature field; indexing fields[1]
        # would raise IndexError, so such lines are ignored instead.
        if len(fields) < 2:
            continue
        if fields[1].split(',')[0] == '名詞':
            nouns.append(fields[0])
    return nouns
def tfidf(doc, docs):
    """Compute the tf-idf weight of every distinct term in a document.

    For each term ``t`` in ``doc``::

        tf(t)  = (occurrences of t in doc) / len(doc)
        idf(t) = log2(len(docs) / df(t))
        weight = tf(t) * idf(t)

    where ``df(t)`` is the number of documents in ``docs`` that contain
    ``t``. If a term occurs in none of the documents in ``docs`` (for
    example because ``doc`` itself is not part of the corpus), ``df(t)``
    is treated as 1 so that the term gets the largest possible idf rather
    than triggering a division by zero.

    Args:
        doc: The target document as a list of terms (e.g. a noun list).
        docs: The whole corpus as a list of documents, each of which is a
            list of terms.

    Returns:
        A dict mapping each distinct term of ``doc`` to its tf-idf weight.
        An empty ``doc`` yields an empty dict.

    Raises:
        ValueError: If ``doc`` is non-empty but ``docs`` is empty, since
            idf is undefined without a corpus.
    """
    # Membership tests against sets are O(1); build them once up front.
    vocabularies = [set(words) for words in docs]
    if not doc:
        return {}
    if not vocabularies:
        raise ValueError('docs must contain at least one document')

    total_docs = len(vocabularies)
    doc_length = len(doc)
    weights = {}
    for term, count in Counter(doc).items():
        doc_freq = sum(1 for vocab in vocabularies if term in vocab)
        # Clamp to 1 so a term unseen in the corpus does not divide by zero.
        doc_freq = max(doc_freq, 1)
        weights[term] = count / doc_length * math.log(total_docs / doc_freq, 2)
    return weights
if __name__ == '__main__':
    # Demo: extract the nouns of three short Japanese sentences and print
    # the tf-idf weights of each sentence against the three-sentence corpus.
    sentences = (
        'ライオンは動物園で人気です。',
        'この先生とあの先生は学校で人気です。',
        '彼は塾でも学校でも人気です。',
    )
    doc1, doc2, doc3 = map(nounlist, sentences)
    docs = [doc1, doc2, doc3]
    labels = ('文章1:', '文章2:', '文章3:')
    for label, document in zip(labels, docs):
        print(label, tfidf(document, docs))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment