Created
March 25, 2021 19:55
-
-
Save benob/d0e74565c87007be65b781b8652884de to your computer and use it in GitHub Desktop.
simple in-memory search engine with bm25 a.k.a. okapi ranking function
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"use strict" | |
var fs = require('fs'); | |
var stopwords = fs.readFileSync('stopwords.txt').toString().split('\n').reduce((set, line) => {set.add(line.trim()); return set}, new Set()); | |
var index = { | |
termFrequency: new Map(), | |
documentLength: [], | |
averageDocumentLength: 0, | |
k1: 1.2, | |
b: 0.75, | |
} | |
function tokenize(text) { | |
var tokens = []; | |
for(var word of text.trim().toLowerCase().replace(/[^a-z0-9-]/g, ' ').split(/\s+/)) { | |
if(!stopwords.has(word)) { | |
tokens.push(word); | |
} | |
} | |
return tokens; | |
} | |
function vectorize(text) { | |
var vector = new Map(); | |
for(var word of tokenize(text)) { | |
if(!vector.has(word)) vector.set(word, 1); | |
else vector.set(word, 1 + vector.get(word)); | |
} | |
return vector; | |
} | |
function add_to_index(text, id) { | |
const termFrequency = index.termFrequency; | |
var documentLength = 0; | |
vectorize(text).forEach((count, word) => { | |
if(!index.termFrequency.has(word)) termFrequency.set(word, []); | |
termFrequency.get(word).push(id, count); | |
documentLength += count; | |
}); | |
return documentLength; | |
} | |
function build_index(entries) { | |
var i = 0; | |
for(var text of entries) { | |
const length = add_to_index(text, i); | |
index.documentLength.push(length); | |
index.averageDocumentLength += length; | |
i++; | |
} | |
index.averageDocumentLength /= index.documentLength.length; | |
} | |
function load_index(filename) { | |
index = JSON.parse(fs.readFileSync(filename).toString()); | |
index.termFrequency = new Map(index.termFrequency); | |
} | |
function save_index(filename) { | |
const backup = index.termFrequency; | |
index.termFrequency = Array.from(backup.entries()); | |
fs.writeFileSync(filename, JSON.stringify(index)); | |
index.termFrequency = backup; | |
} | |
function inverseDocumentFrequency(word) { | |
if(index.termFrequency.has(word)) { | |
const documentFrequency = index.termFrequency.get(word).length / 2; | |
return Math.log( (index.documentLength.length - documentFrequency + 0.5) / (documentFrequency + 0.5) ); | |
} | |
return 0; | |
} | |
function okapi_bm25(query) { | |
var found = new Map(); | |
tokenize(query).forEach((word) => { | |
if(index.termFrequency.has(word)) { | |
const idf = inverseDocumentFrequency(word); | |
const entries = index.termFrequency.get(word); | |
for(var i = 0; i < entries.length; i += 2) { | |
const doc = entries[i], termFrequency = entries[i + 1]; | |
const score = idf * (termFrequency * (index.k1 + 1)) / termFrequency + index.k1 * (1 - index.b + index.b * index.documentLength[doc] / index.averageDocumentLength); | |
if(!found.has(doc)) found.set(doc, score); | |
else found.set(doc, score + found.get(doc)); | |
} | |
} | |
}); | |
return found; | |
} | |
function search(query) { | |
query = query || ""; | |
var found = okapi_bm25(query); | |
var results = Array.from(found.entries()); | |
results.sort((a, b) => b[1] - a[1]); | |
return results; | |
}; | |
//var docs = JSON.parse(fs.readFileSync(process.argv[2]).toString()); | |
//build_index(docs.map(doc => doc.title + ' ' + doc.abstract)); | |
//save_index('index.bm25'); | |
//load_index('index.bm25'); | |
//console.log(search('virus')); | |
module.exports = { | |
build_index: build_index, | |
save_index: save_index, | |
load_index: load_index, | |
search: search, | |
}; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment