-
-
Save geekpete/93f774d04a0253960b409ca3f06cf7dd to your computer and use it in GitHub Desktop.
bm25f
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# BM25F Model
def bm25(idf, tf, fl, avgfl, B, K1):
    """Return the BM25 score of one term in one document field.

    :param idf: inverse document frequency of the term.
    :param tf: term frequency in the current document.
    :param fl: field length in the current document.
    :param avgfl: average field length across documents in the collection.
        Must be non-zero, since it is used as a divisor.
    :param B: free parameter; weights the ``fl / avgfl`` length ratio in the
        denominator (``B=0`` removes the length term entirely).
    :param K1: free parameter; scales the term-frequency part of the formula.
    :returns: ``idf * tf * (K1 + 1) / (tf + K1 * (1 - B + B * fl / avgfl))``.
    """
    return idf * ((tf * (K1 + 1)) / (tf + K1 * (1 - B + B * (fl / avgfl))))
class BM25F(WeightingModel):
    """Implements the BM25F scoring algorithm.
    """

    def __init__(self, B=0.75, K1=1.2, **kwargs):
        """

        >>> from whoosh import scoring
        >>> # Set a custom B value for the "content" field
        >>> w = scoring.BM25F(B=0.75, content_B=1.0, K1=1.5)

        :param B: free parameter, see the BM25 literature. Keyword arguments of
            the form ``fieldname_B`` (for example, ``body_B``) set field-
            specific values for B.
        :param K1: free parameter, see the BM25 literature.
        """

        self.B = B
        self.K1 = K1

        # Collect per-field overrides: every keyword ending in "_B" maps the
        # field name (the keyword minus that suffix) to its own B value.
        # Keywords without the suffix are ignored.
        self._field_B = {
            key[:-2]: value
            for key, value in kwargs.items()
            if key.endswith("_B")
        }

    def supports_block_quality(self):
        """Report that this model supports block quality (always ``True``)."""
        return True

    def scorer(self, searcher, fieldname, text, qf=1):
        """Build the scorer for ``text`` in ``fieldname``.

        Fields whose schema entry is not ``scorable`` get a ``WeightScorer``;
        every other field gets a ``BM25FScorer`` configured with the
        field-specific B value if one was given, otherwise the default B.
        """
        if not searcher.schema[fieldname].scorable:
            return WeightScorer.for_(searcher, fieldname, text)

        B = self._field_B.get(fieldname, self.B)
        return BM25FScorer(searcher, fieldname, text, B, self.K1, qf=qf)
class BM25FScorer(WeightLengthScorer):
    """Scorer that applies the ``bm25`` formula to one term in one field."""

    def __init__(self, searcher, fieldname, text, B, K1, qf=1):
        """Capture the global statistics and free parameters used in scoring.

        :param searcher: searcher the scorer is set up against.
        :param fieldname: name of the field being scored.
        :param text: the term being scored.
        :param B: free parameter passed through to ``bm25``.
        :param K1: free parameter passed through to ``bm25``.
        :param qf: stored on the instance as ``self.qf``; not otherwise used
            within this class.
        """
        # IDF and average field length are global statistics, so get them from
        # the top-level searcher
        parent = searcher.get_parent()  # Returns self if no parent
        self.idf = parent.idf(fieldname, text)
        # Fall back to 1 when the average length is falsy (e.g. 0) so the
        # division inside bm25() cannot fail.
        self.avgfl = parent.avg_field_length(fieldname) or 1
        self.B = B
        self.K1 = K1
        self.qf = qf
        # NOTE(review): setup() is not defined in this class; presumably
        # inherited from WeightLengthScorer -- confirm.
        self.setup(searcher, fieldname, text)

    def _score(self, weight, length):
        """Return the BM25 score, using ``weight`` as the term frequency and
        ``length`` as the field length."""
        return bm25(self.idf, weight, length, self.avgfl, self.B, self.K1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment