BM25
REM SmallBASIC
REM created: 21/09/2019
REM based on the article https://burakkanber.com/blog/machine-learning-full-text-search-in-javascript-relevance-scoring/
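REM Overview: tokenize() lower-cases text, splits it into words and drops
REM stop words; addDocument() records per-document term counts and
REM frequencies; updateIdf() computes each term's inverse document
REM frequency; find() scores documents against a query with BM25.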
const stop_words = ["the", "at", "in", "on"]
REM Placeholder stemmer: a full port would reduce words to their stems;
REM here tokens pass through unchanged.
func stemmer(text)
  return text
end
func tokenize(text)
  local words, w, out
  out = []
  split(squeeze(lower(text)), " ", words())
  for w in words
    if (w in stop_words) == 0 then
      out << stemmer(w)
    end if
  next w
  return out
end
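REM For illustration: with the stop words above,
REM tokenize("The cat in the hat") returns ["cat", "hat"].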
func map_keys(m)
  local i, out
  out = []
  for i in m
    out << i
  next i
  return out
end
sub updateIdf(byref w)
  local keys = map_keys(w.terms)
  local i, term, num, denom
  for i = 0 to len(keys) - 1
    term = keys[i]
    num = w.totalDocuments - w.terms[term].n + 0.5
    denom = w.terms[term].n + 0.5
    w.terms[term].idf = max(log10(num / denom), 0.01)
  next i
end
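REM Worked example with the two sample documents indexed below
REM (totalDocuments = 2): "cat" appears in one document (n = 1), so
REM idf = log10((2 - 1 + 0.5) / (1 + 0.5)) = log10(1) = 0, clamped to 0.01;
REM "sat" appears in both (n = 2), so idf = log10(0.5 / 2.5) is negative
REM and is likewise clamped to the 0.01 floor.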
sub addDocument(byref w, byref doc)
  local docObj, i, term, tokens, terms, keys
  if doc.id == 0 then throw "ID is a required property of documents."
  if doc.body == 0 then throw "Body is a required property of documents."
  # Raw tokenized list of words
  let tokens = tokenize(doc.body)
  # Will hold unique terms and their counts and frequencies
  terms = {}
  # docObj will eventually be added to the documents database
  docObj = {id: doc.id, tokens: tokens, body: doc.body}
  # Count number of terms
  docObj.termCount = len(tokens)
  # Increment totalDocuments
  w.totalDocuments++
  # Readjust averageDocumentLength
  w.totalDocumentTermLength += docObj.termCount
  w.averageDocumentLength = w.totalDocumentTermLength / w.totalDocuments
  # Calculate term frequency
  # First get terms count
  for i = 0 to len(tokens) - 1
    term = tokens[i]
    if (term in terms) == 0 then
      terms[term] = {count: 0, freq: 0}
    endif
    terms[term].count++
  next i
  # Then re-loop to calculate term frequency.
  # We'll also update inverse document frequencies here.
  keys = map_keys(terms)
  for i = 0 to len(keys) - 1
    term = keys[i]
    # Term Frequency for this document.
    terms[term].freq = terms[term].count / docObj.termCount
    # Inverse Document Frequency initialization
    if (term in w.terms) == 0 then
      w.terms[term] = {n: 0, idf: 0}
    end if
    w.terms[term].n++
  next i
  # Inverse document frequencies themselves are computed by updateIdf.
  # That pass is SLOWish, so when indexing a big batch of documents,
  # call updateIdf once at the end of the batch (as the example at the
  # bottom of this file does) rather than after every addDocument.
  # Add docObj to docs db
  docObj.terms = terms
  docObj.id = doc.id
  w.documents[doc.id] = docObj
  w.nDocs++
end
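REM After indexing, each entry in w.documents has the shape
REM {id, tokens, body, termCount, terms}, where terms maps each unique
REM token to {count, freq}; for the first sample document below,
REM terms["cat"] = {count: 2, freq: 0.4} (2 of its 5 non-stop-word tokens).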
func find(w, query)
  local j, id, i, queryTerm, idf, num, denom
  local queryTerms = tokenize(query)
  local results = []
  # Look at each document in turn.
  # There are better ways to do this with inverted indices.
  local keys = map_keys(w.documents)
  for j = 0 to min(len(keys), w.nDocs) - 1
    id = keys[j]
    # The relevance score for a document is the sum of a tf-idf-like
    # calculation for each query term.
    w.documents[id]._score = 0
    # Calculate the score for each query term
    for i = 0 to len(queryTerms) - 1
      queryTerm = queryTerms[i]
      # If we've never seen this term before, its IDF will be 0, so we
      # can skip the whole term: it adds nothing to the score and isn't
      # in any document.
      if ismap(w.terms[queryTerm]) && ismap(w.documents[id].terms[queryTerm]) then
        # The term is in the document, let's go.
        # The whole term is:
        # IDF * (TF * (k1 + 1)) / (TF + k1 * (1 - b + b * docLength / avgDocLength))
        # IDF is pre-calculated for the whole docset.
        idf = w.terms[queryTerm].idf
        # Numerator of the TF portion.
        num = w.documents[id].terms[queryTerm].count * (w.k1 + 1)
        # Denominator of the TF portion.
        denom = w.documents[id].terms[queryTerm].count &
          + (w.k1 * (1 - w.b + (w.b * w.documents[id].termCount &
          / max(1, w.averageDocumentLength))))
        # Add this query term to the score
        w.documents[id]._score += idf * num / denom
      end if
    next i
    if w.documents[id]._score > 0 then
      results << w.documents[id]
    end if
  next j
  # Matches are returned unsorted; the JavaScript original sorts by
  # descending _score and keeps the top 10 (a SmallBASIC comparator is
  # sketched after this function):
  rem results.sort(function(a, b) { return b._score - a._score; })
  # return results.slice(0, 10)
  return results
end
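REM A minimal sketch of the missing ranking step, assuming SmallBASIC's
REM SORT ... USE comparator form; cmp_score is a helper introduced here,
REM not part of the original gist.
func cmp_score(x, y)
  # Order by descending _score: higher-scoring documents sort first
  if x._score > y._score then
    return -1
  elif x._score < y._score then
    return 1
  endif
  return 0
end
REM Usage: sort searchResult use cmp_score(x, y)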
w = {}
# BM25 tuning parameters; find() reads these, and the source article
# uses k1 = 1.3 and b = 0.72 (leaving k1 at 0 collapses every TF factor to 1)
w.k1 = 1.3
w.b = 0.72
doc.body = "the cat the cat in the hat sat on the mat"
doc.id = 1
addDocument(w, doc)
doc.body = "text sat"
doc.id = 2
addDocument(w, doc)
updateIdf(w)
searchResult = find(w, "sat")
if isarray(searchResult) then
  for item in searchResult
    print item.id
  next item
endif