Skip to content

Instantly share code, notes, and snippets.

@ramnathv
Forked from koreyou/bm25.py
Created September 8, 2020 04:25
Show Gist options
  • Save ramnathv/142c1049dd6d62f8ee9b0da7ab291d15 to your computer and use it in GitHub Desktop.
Save ramnathv/142c1049dd6d62f8ee9b0da7ab291d15 to your computer and use it in GitHub Desktop.
Implementation of OKapi BM25 with sklearn's TfidfVectorizer
""" Implementation of Okapi BM25 with sklearn's TfidfVectorizer
Distributed as CC-0 (https://creativecommons.org/publicdomain/zero/1.0/)
"""
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
class BM25(object):
    """Okapi BM25 scorer built on top of sklearn's ``TfidfVectorizer``.

    The wrapped vectorizer supplies the vocabulary, the raw term counts and
    the IDF weights; the BM25 term-frequency saturation and document-length
    normalisation are applied in :meth:`transform`.

    Parameters
    ----------
    b : float, default 0.75
        Weight of the document-length normalisation term.
    k1 : float, default 1.6
        Term-frequency saturation parameter.

    Attributes
    ----------
    vectorizer : TfidfVectorizer
        Vectorizer fitted by :meth:`fit`.
    avdl : float
        Mean per-document sum of term counts, set by :meth:`fit`.
    """

    def __init__(self, b=0.75, k1=1.6):
        # norm=None keeps the raw weights; smooth_idf=False keeps the IDF in
        # the form that transform() rescales by subtracting 1.
        self.vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
        self.b = b
        self.k1 = k1

    def _count(self, docs):
        """Return the raw term-count matrix for ``docs``.

        Calls the ``transform`` of the class that follows ``TfidfVectorizer``
        in its MRO (the count vectorizer), so no tf-idf weighting is applied.
        """
        return super(TfidfVectorizer, self.vectorizer).transform(docs)

    def fit(self, X):
        """Fit the vocabulary and IDF to documents ``X``.

        ``X`` is consumed twice (once to fit, once to count), so it must be
        re-iterable, e.g. a list of strings rather than a generator.

        Returns
        -------
        self : BM25
            The fitted instance, so calls can be chained.
        """
        self.vectorizer.fit(X)
        doc_lengths = self._count(X).sum(1)
        self.avdl = doc_lengths.mean()
        return self

    def transform(self, q, X):
        """Calculate BM25 between query ``q`` and documents ``X``.

        Parameters
        ----------
        q : str
            A single query document.
        X : iterable of str
            Documents to score against the query.

        Returns
        -------
        numpy.ndarray
            One score per document in ``X``, in the same order. All zeros
            when no query term is in the fitted vocabulary.
        """
        b, k1, avdl = self.b, self.k1, self.avdl
        # Raw term counts (count vectorizer only, no tf-idf weighting).
        X = self._count(X)
        len_X = X.sum(1).A1
        # The query is vectorised as a one-row matrix; in CSR form its
        # ``indices`` are exactly the vocabulary ids present in the query.
        term_ids = self._count([q]).tocsr().indices
        if term_ids.size == 0:
            # Nothing in the query is known to the vocabulary.
            return np.zeros(X.shape[0])
        # Convert to CSC for better column slicing.
        X = X.tocsc()[:, term_ids]
        # Sparse + dense column broadcasts to a dense (n_docs, n_terms) matrix.
        denom = X + (k1 * (1 - b + b * len_X / avdl))[:, None]
        # sklearn uses idf(t) = log[n / df(t)] + 1, so subtract 1 to get the
        # plain idf(t) = log[n / df(t)].
        idf = self.vectorizer._tfidf.idf_[None, term_ids] - 1.
        numer = X.multiply(np.broadcast_to(idf, X.shape)) * (k1 + 1)
        return (numer / denom).sum(1).A1
#------------ End of library implementation. What follows is a usage example -----------------
# Demo: fit BM25 on every 20-newsgroups training post except the first,
# then score all posts (the first included) against that first post.
from sklearn.datasets import fetch_20newsgroups

texts = fetch_20newsgroups(subset='train').data
query, corpus = texts[0], texts[1:]

bm25 = BM25()
bm25.fit(corpus)

scores = bm25.transform(query, texts)
print(scores)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment