Created
September 15, 2011 21:47
-
-
Save marcelcaraciolo/1220584 to your computer and use it in GitHub Desktop.
bm25f
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# BM25F Model
def bm25(idf, tf, fl, avgfl, B, K1):
    """Return the BM25 score of a single term for a single field.

    :param idf: inverse document frequency of the term.
    :param tf: term frequency in the current document.
    :param fl: field length in the current document.
    :param avgfl: average field length across documents in the collection;
        must be non-zero because ``fl`` is divided by it.
    :param B: free parameter weighting the length normalisation. With
        ``B == 0`` the field length has no effect on the result.
    :param K1: free parameter applied to the term frequency in both the
        numerator and the denominator.
    :returns: ``idf * (tf * (K1 + 1)) / (tf + K1 * (1 - B + B * fl / avgfl))``
    """
    # Length normalisation factor: equals 1 when fl == avgfl or when B == 0.
    length_norm = 1 - B + B * (fl / avgfl)
    return idf * ((tf * (K1 + 1)) / (tf + K1 * length_norm))
class BM25F(WeightingModel):
    """Implements the BM25F scoring algorithm.

    Holds a default ``B`` and ``K1`` plus optional per-field ``B`` overrides,
    and builds a scorer for a given field/term via :meth:`scorer`.
    """

    def __init__(self, B=0.75, K1=1.2, **kwargs):
        """
        >>> from whoosh import scoring
        >>> # Set a custom B value for the "content" field
        >>> w = scoring.BM25F(B=0.75, content_B=1.0, K1=1.5)

        :param B: free parameter, see the BM25 literature. Keyword arguments of
            the form ``fieldname_B`` (for example, ``body_B``) set field-
            specific values for B.
        :param K1: free parameter, see the BM25 literature.
        """

        self.B = B
        self.K1 = K1

        # Map of field name -> field-specific B. Keyword arguments that do
        # not end in "_B" are ignored.
        self._field_B = {}
        # NOTE(review): ``iteritems`` is not defined in the visible source;
        # presumably a py2/py3 compatibility helper imported elsewhere.
        for k, v in iteritems(kwargs):
            if k.endswith("_B"):
                # Strip the "_B" suffix to recover the field name.
                fieldname = k[:-2]
                self._field_B[fieldname] = v

    def supports_block_quality(self):
        """Return True unconditionally."""
        return True

    def scorer(self, searcher, fieldname, text, qf=1):
        """Return a scorer for ``text`` in ``fieldname``.

        :param searcher: searcher whose ``schema`` is consulted for the field.
        :param fieldname: name of the field being scored.
        :param text: the term text being scored.
        :param qf: passed through unchanged to ``BM25FScorer``.
        :returns: ``WeightScorer.for_(...)`` when the field is not scorable,
            otherwise a ``BM25FScorer`` using the field-specific B if one was
            configured, else the default B.
        """
        if not searcher.schema[fieldname].scorable:
            return WeightScorer.for_(searcher, fieldname, text)

        # Field-specific B wins over the model-wide default.
        B = self._field_B.get(fieldname, self.B)
        return BM25FScorer(searcher, fieldname, text, B, self.K1, qf=qf)
class BM25FScorer(WeightLengthScorer):
    """Scorer that applies :func:`bm25` to a term's weight and field length."""

    def __init__(self, searcher, fieldname, text, B, K1, qf=1):
        """
        :param searcher: searcher used to look up statistics and passed to
            ``self.setup``.
        :param fieldname: name of the field being scored.
        :param text: the term text being scored.
        :param B: free parameter forwarded to :func:`bm25`.
        :param K1: free parameter forwarded to :func:`bm25`.
        :param qf: stored on the instance as ``self.qf``; not used by
            ``_score`` in this class.
        """
        # IDF and average field length are global statistics, so get them from
        # the top-level searcher
        parent = searcher.get_parent()  # Returns self if no parent
        self.idf = parent.idf(fieldname, text)
        # Fall back to 1 when the average is falsy (e.g. 0) so that bm25()
        # never divides by zero.
        self.avgfl = parent.avg_field_length(fieldname) or 1
        self.B = B
        self.K1 = K1
        self.qf = qf
        # NOTE(review): ``setup`` is not defined here; presumably inherited
        # from WeightLengthScorer -- confirm.
        self.setup(searcher, fieldname, text)

    def _score(self, weight, length):
        """Return the BM25 score for a term ``weight`` and field ``length``."""
        return bm25(self.idf, weight, length, self.avgfl, self.B, self.K1)
How do you use boost value on fields?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks!