Last active
September 10, 2024 10:06
-
-
Save kzinmr/a251cd175507e1d6bc0b700c66050370 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict | |
from functools import reduce, partial | |
import numpy as np | |
from itertools import chain | |
def flatten(l):
    """Concatenate the items of every iterable in ``l`` into one flat list.

    Only a single level of nesting is removed; items of the inner
    iterables are kept as they are.

    Arguments:
        l: iterable of iterables

    Returns:
        A new list holding the inner items in their original order.
    """
    flat = []
    for inner in l:
        flat.extend(inner)
    return flat
def calculate_idf(docs):
    """Compute per-word inverse document frequency weights for a corpus.

    Notes:
    - The more template-like (boilerplate) a conversation is, the lower
      its informativeness score becomes.
    - Anomalies where only a single unusual word is mixed in cannot be
      detected.
    - Issue: the idf mean needs a correction for short sentences.
    - Issue: the idf needs a correction for the df=1 case (options to
      try: a. convert df=1 to 2, b. exclude df=1).

    Arguments:
        docs: list of list of words

    Returns:
        A tuple ``(idf, dfdict)`` where ``dfdict`` maps each word to the
        number of documents containing it, and ``idf`` maps each word to
        ``1 / df``.
    """
    dfdict = defaultdict(int)
    for doc in docs:
        # A word counts once per document, however often it repeats.
        for w in set(doc):
            dfdict[w] += 1
    # The weight is the plain reciprocal of the document frequency,
    # not the logarithmic form log(N / df).
    idf = {w: 1 / df for w, df in dfdict.items()}
    return idf, dfdict
def informativeness(sentence, idf):
    """Return the mean idf weight of the known words in ``sentence``.

    Arguments:
        sentence: string of words separated by single spaces
        idf: mapping from word to its weight

    Returns:
        The mean weight over the words that appear in ``idf`` (repeated
        words count each time), or ``0.`` when no word is known.
    """
    known_weights = []
    for token in sentence.split(' '):
        if token in idf:
            known_weights.append(idf[token])
    if not known_weights:
        return 0.
    return np.mean(known_weights)
def round_float(n, k=3):
    """Round ``n`` up (ceiling) to ``k`` decimal places.

    Note that despite the name this never rounds to nearest: the scaled
    value is always passed through ``np.ceil``.

    Arguments:
        n: number to round
        k: number of decimal places to keep

    Returns:
        ``n`` rounded up at the ``k``-th decimal place.
    """
    scale = 10**k
    scaled_up = np.ceil(n * scale)
    return scaled_up / scale
def compose(*functions):
    """Compose single-argument functions, applied right to left.

    ``compose(f, g)(x)`` equals ``f(g(x))``; ``compose()`` returns a
    function that hands its argument back unchanged.

    Arguments:
        functions: callables that each take one argument

    Returns:
        A one-argument function that applies the last callable first.
    """
    def composed(x):
        # Apply iteratively so the call stack stays flat regardless of
        # how many functions are chained; nesting one closure per
        # function would exceed the recursion limit on long pipelines.
        for fn in reversed(functions):
            x = fn(x)
        return x

    return composed
def doc_scores(sentences, idf, mode='0'):
    """Score every sentence by its rounded informativeness.

    Arguments:
        sentences: iterable of sentences, each passed to ``informativeness``
        idf: word weights handed to ``informativeness``
        mode: accepted but not used by this function

    Returns:
        A list with one score per sentence: ``informativeness`` of the
        sentence passed through ``round_float``.
    """
    return [
        round_float(informativeness(sentence, idf=idf))
        for sentence in sentences
    ]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment