-
-
Save StephMansour/3e7ee0fa29285fc68d28d613892ef59f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
BM25.Tokenize = function(text) { | |
text = text | |
.toLowerCase() | |
.replace(/\W/g, ' ') | |
.replace(/\s+/g, ' ') | |
.trim() | |
.split(' ') | |
.map(function(a) { return stemmer(a); }); | |
// Filter out stopStems | |
var out = []; | |
for (var i = 0, len = text.length; i < len; i++) { | |
if (stopStems.indexOf(text[i]) === -1) { | |
out.push(text[i]); | |
} | |
} | |
return out; | |
}; | |
BM25.prototype.addDocument = function(doc) { | |
if (typeof doc.id === 'undefined') { throw new Error(1000, 'ID is a required property of documents.'); }; | |
if (typeof doc.body === 'undefined') { throw new Error(1001, 'Body is a required property of documents.'); }; | |
// Raw tokenized list of words | |
var tokens = BM25.Tokenize(doc.body); | |
// Will hold unique terms and their counts and frequencies | |
var _terms = {}; | |
// docObj will eventually be added to the documents database | |
var docObj = {id: doc.id, tokens: tokens, body: doc.body}; | |
// Count number of terms | |
docObj.termCount = tokens.length; | |
// Increment totalDocuments | |
this.totalDocuments++; | |
// Readjust averageDocumentLength | |
this.totalDocumentTermLength += docObj.termCount; | |
this.averageDocumentLength = this.totalDocumentTermLength / this.totalDocuments; | |
// Calculate term frequency | |
// First get terms count | |
for (var i = 0, len = tokens.length; i < len; i++) { | |
var term = tokens[i]; | |
if (!_terms[term]) { | |
_terms[term] = { | |
count: 0, | |
freq: 0 | |
}; | |
}; | |
_terms[term].count++; | |
} | |
// Then re-loop to calculate term frequency. | |
// We'll also update inverse document frequencies here. | |
var keys = Object.keys(_terms); | |
for (var i = 0, len = keys.length; i < len; i++) { | |
var term = keys[i]; | |
// Term Frequency for this document. | |
_terms[term].freq = _terms[term].count / docObj.termCount; | |
// Inverse Document Frequency initialization | |
if (!this.terms[term]) { | |
this.terms[term] = { | |
n: 0, // Number of docs this term appears in, uniquely | |
idf: 0 | |
}; | |
} | |
this.terms[term].n++; | |
}; | |
// Calculate inverse document frequencies | |
// This is SLOWish so if you want to index a big batch of documents, | |
// comment this out and run it once at the end of your addDocuments run | |
// If you're only indexing a document or two at a time you can leave this in. | |
// this.updateIdf(); | |
// Add docObj to docs db | |
docObj.terms = _terms; | |
this.documents[docObj.id] = docObj; | |
}; | |
BM25.prototype.updateIdf = function() { | |
var keys = Object.keys(this.terms); | |
for (var i = 0, len = keys.length; i < len; i++) { | |
var term = keys[i]; | |
var num = (this.totalDocuments - this.terms[term].n + 0.5); | |
var denom = (this.terms[term].n + 0.5); | |
this.terms[term].idf = Math.max(Math.log10(num / denom), 0.01); | |
} | |
}; | |
BM25.prototype.search = function(query) { | |
var queryTerms = BM25.Tokenize(query); | |
var results = []; | |
// Look at each document in turn. There are better ways to do this with inverted indices. | |
var keys = Object.keys(this.documents); | |
for (var j = 0, nDocs = keys.length; j < nDocs; j++) { | |
var id = keys[j]; | |
// The relevance score for a document is the sum of a tf-idf-like | |
// calculation for each query term. | |
this.documents[id]._score = 0; | |
// Calculate the score for each query term | |
for (var i = 0, len = queryTerms.length; i < len; i++) { | |
var queryTerm = queryTerms[i]; | |
// We've never seen this term before so IDF will be 0. | |
// Means we can skip the whole term, it adds nothing to the score | |
// and isn't in any document. | |
if (typeof this.terms[queryTerm] === 'undefined') { | |
continue; | |
} | |
// This term isn't in the document, so the TF portion is 0 and this | |
// term contributes nothing to the search score. | |
if (typeof this.documents[id].terms[queryTerm] === 'undefined') { | |
continue; | |
} | |
// The term is in the document, let's go. | |
// The whole term is : | |
// IDF * (TF * (k1 + 1)) / (TF + k1 * (1 - b + b * docLength / avgDocLength)) | |
// IDF is pre-calculated for the whole docset. | |
var idf = this.terms[queryTerm].idf; | |
// Numerator of the TF portion. | |
var num = this.documents[id].terms[queryTerm].count * (this.k1 + 1); | |
// Denomerator of the TF portion. | |
var denom = this.documents[id].terms[queryTerm].count | |
+ (this.k1 * (1 - this.b + (this.b * this.documents[id].termCount / this.averageDocumentLength))); | |
// Add this query term to the score | |
this.documents[id]._score += idf * num / denom; | |
} | |
if (!isNaN(this.documents[id]._score) && this.documents[id]._score > 0) { | |
results.push(this.documents[id]); | |
} | |
} | |
results.sort(function(a, b) { return b._score - a._score; }); | |
return results.slice(0, 10); | |
}; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment