Skip to content

Instantly share code, notes, and snippets.

@kzinmr
Last active September 10, 2024 10:06
Show Gist options
  • Save kzinmr/a251cd175507e1d6bc0b700c66050370 to your computer and use it in GitHub Desktop.
Save kzinmr/a251cd175507e1d6bc0b700c66050370 to your computer and use it in GitHub Desktop.
from collections import defaultdict
from functools import reduce, partial
import numpy as np
from itertools import chain
def flatten(l):
    """
    Flatten one level of nesting.

    Arguments:
        l: iterable of iterables
    Returns:
        A new list holding the elements of every inner iterable, in order.
    """
    return [item for group in l for item in group]
def calculate_idf(docs):
    """
    Compute a document-frequency based weight for every word in a corpus.

    The weight is the plain reciprocal ``1 / df`` (no logarithm is applied),
    where ``df`` is the number of documents containing the word at least once.

    Notes from the original author on scores built from these weights:
    - The more template-like (boilerplate) a conversation is, the lower its
      informativeness score becomes.
    - Anomalies where only a single unusual word is mixed in cannot be
      detected.
    - Issue: correction of the mean idf for short sentences.
    - Issue: correction of idf when df=1 (options to try: a. treat df=1 as
      df=2, b. exclude words with df=1).

    Arguments:
        docs: list of list of words
    Returns:
        A tuple ``(idf, dfdict)``: ``idf`` maps word -> ``1 / df`` and
        ``dfdict`` is a ``defaultdict(int)`` mapping word -> ``df``.
    """
    dfdict = defaultdict(int)
    for doc in docs:
        # set() so that repeated occurrences within one document count once.
        for w in set(doc):
            dfdict[w] += 1
    # Reciprocal document frequency. A log-scaled alternative would be
    # np.log(N / df + 1), with N the number of documents.
    idf = {w: 1 / df for w, df in dfdict.items()}
    return idf, dfdict
def informativeness(sentence, idf):
    """
    Score a sentence as the mean idf weight of its known words.

    Arguments:
        sentence: string of words separated by single spaces
        idf: mapping word -> weight
    Returns:
        The mean of the weights of the tokens found in ``idf``, or ``0.``
        when no token of the sentence is in ``idf``.
    """
    weights = []
    # Split on a literal single space; tokens missing from idf are skipped.
    for token in sentence.split(' '):
        if token in idf:
            weights.append(idf[token])
    if not weights:
        return 0.
    return np.mean(weights)
def round_float(n, k=3):
    """
    Round ``n`` up to ``k`` decimal places.

    Despite the name, this always rounds towards +infinity (ceil), not to
    the nearest value.

    Arguments:
        n: number to round
        k: number of decimal places to keep (default 3)
    Returns:
        ``ceil(n * 10**k) / 10**k``
    """
    scale = pow(10, k)
    lifted = np.ceil(n * scale)
    return lifted / scale
def compose(*functions):
    """
    Compose single-argument functions right to left.

    ``compose(f, g, h)(x)`` is ``f(g(h(x)))``; ``compose()`` is the identity.

    The functions are applied in a loop instead of through nested closures,
    so the number of composed functions is not limited by the interpreter's
    recursion depth.

    Arguments:
        functions: callables taking one argument each
    Returns:
        A callable taking one argument.
    """
    def composed(x):
        result = x
        # The rightmost function is applied first.
        for fn in reversed(functions):
            result = fn(result)
        return result

    return composed
def doc_scores(sentences, idf, mode='0'):
    """
    Compute a rounded informativeness score for each sentence.

    Arguments:
        sentences: iterable of sentence strings
        idf: mapping word -> weight, passed to ``informativeness``
        mode: currently unused
    Returns:
        List with one score per sentence, each passed through ``round_float``.
    """
    return [round_float(informativeness(s, idf=idf)) for s in sentences]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment