Skip to content

Instantly share code, notes, and snippets.

@shotahorii
Last active February 4, 2017 15:18
Show Gist options
  • Save shotahorii/8412933 to your computer and use it in GitHub Desktop.
Save shotahorii/8412933 to your computer and use it in GitHub Desktop.
tfidf

Calculate tf-idf for texts in Japanese.

import MeCab
import math
from collections import Counter
def nounlist(text):
    """Take Japanese text and return the nouns it contains as a list.

    The text is analysed with a MeCab tagger. Every output line other than
    blank lines and the ``EOS`` marker is split on a tab into a surface form
    and a comma-separated feature string; a token is kept when the first
    feature is '名詞' (noun).

    Args:
        text: The Japanese text to analyse.

    Returns:
        A list of the surface forms of the nouns, in the order MeCab
        emitted them.
    """
    tagger = MeCab.Tagger('mecabrc')
    nouns = []
    for line in tagger.parse(text).split('\n'):
        # Skip blank lines and the end-of-sentence marker.
        if line == '' or line == 'EOS':
            continue
        fields = line.split('\t')
        # fields[0] is the surface form, fields[1] the feature string whose
        # first comma-separated entry is compared against the noun tag.
        if fields[1].split(',')[0] == '名詞':
            nouns.append(fields[0])
    return nouns
def tfidf(doc, docs):
    """Compute the tf-idf weight of every distinct term in one document.

    For each term the weight is ``tf * idf`` where ``tf`` is the term's
    count in ``doc`` divided by ``len(doc)`` and ``idf`` is
    ``log2(N / df)``, with ``N`` the number of documents in ``docs`` and
    ``df`` the number of those documents that contain the term.

    Args:
        doc: The target document, as a list of terms (e.g. a noun list).
        docs: The whole corpus, as a list of documents (a list of term
            lists).

    Returns:
        A dict mapping each distinct term of ``doc`` to its tf-idf weight.
        An empty ``doc`` yields an empty dict.

    Note:
        A term that occurs in no document of ``docs`` (for example when
        ``doc`` itself is not part of ``docs``) is given a document
        frequency of 1, and an empty ``docs`` is treated as a corpus of one
        document, instead of raising ``ZeroDivisionError``. Results for
        terms that do occur in the corpus are unaffected.
    """
    # Sets make the per-document membership test O(1).
    lib = [set(words) for words in docs]
    # Floor at 1 so an empty corpus gives idf == log2(1) == 0, not log(0).
    total_docs = max(len(lib), 1)
    doc_len = len(doc)
    weights = {}
    for term, count in Counter(doc).items():
        # Floor at 1 so terms unseen in the corpus do not divide by zero.
        doc_freq = max(sum(1 for words in lib if term in words), 1)
        weights[term] = count / doc_len * math.log(total_docs / doc_freq, 2)
    return weights
if __name__ == '__main__':
    # Demo: extract the nouns of three sample sentences, then print the
    # tf-idf weights of each sentence relative to all three.
    sentences = (
        'ライオンは動物園で人気です。',
        'この先生とあの先生は学校で人気です。',
        '彼は塾でも学校でも人気です。',
    )
    labels = ('文章1:', '文章2:', '文章3:')
    docs = [nounlist(sentence) for sentence in sentences]
    for label, doc in zip(labels, docs):
        print(label, tfidf(doc, docs))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment