|  | /** | 
        
          |  | * Okapi BM25 Implementation | 
        
          |  | * Burak Kanber, 2015 | 
        
          |  | * | 
        
          |  | * License: you may use this for educational purposes only. | 
        
          |  | */ | 
        
          |  |  | 
        
          |  | /** | 
        
          |  | * Porter stemmer | 
        
          |  | * | 
        
          |  | * I took the porter stemmer from here and minified it: http://tartarus.org/~martin/PorterStemmer/js.txt | 
        
          |  | * | 
        
          |  | * Original comments on the porter stemmer source: | 
        
          |  |  | 
        
          |  | // Porter stemmer in Javascript. Few comments, but it's easy to follow against the rules in the original | 
        
          |  | // paper, in | 
        
          |  | // | 
        
          |  | //  Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, | 
        
          |  | //  no. 3, pp 130-137, | 
        
          |  | // | 
        
          |  | // see also http://www.tartarus.org/~martin/PorterStemmer | 
        
          |  |  | 
        
          |  | // Release 1 be 'andargor', Jul 2004 | 
        
          |  | // Release 2 (substantially revised) by Christopher McKenzie, Aug 2009 | 
        
          |  | */ | 
        
          |  |  | 
        
          |  | var stemmer=function(){var e={ational:"ate",tional:"tion",enci:"ence",anci:"ance",izer:"ize",bli:"ble",alli:"al",entli:"ent",eli:"e",ousli:"ous",ization:"ize",ation:"ate",ator:"ate",alism:"al",iveness:"ive",fulness:"ful",ousness:"ous",aliti:"al",iviti:"ive",biliti:"ble",logi:"log"},i={icate:"ic",ative:"",alize:"al",iciti:"ic",ical:"ic",ful:"",ness:""},t="[^aeiou]",s="[aeiouy]",a=t+"[^aeiouy]*",l=s+"[aeiou]*",n="^("+a+")?"+l+a,o="^("+a+")?"+l+a+"("+l+")?$",c="^("+a+")?"+l+a+l+a,r="^("+a+")?"+s;return function(t){var l,u,x,$,p,v,g;if(t.length<3)return t;if(x=t.substr(0,1),"y"==x&&(t=x.toUpperCase()+t.substr(1)),$=/^(.+?)(ss|i)es$/,p=/^(.+?)([^s])s$/,$.test(t)?t=t.replace($,"$1$2"):p.test(t)&&(t=t.replace(p,"$1$2")),$=/^(.+?)eed$/,p=/^(.+?)(ed|ing)$/,$.test(t)){var f=$.exec(t);$=new RegExp(n),$.test(f[1])&&($=/.$/,t=t.replace($,""))}else if(p.test(t)){var f=p.exec(t);l=f[1],p=new RegExp(r),p.test(l)&&(t=l,p=/(at|bl|iz)$/,v=new RegExp("([^aeiouylsz])\\1$"),g=new RegExp("^"+a+s+"[^aeiouwxy]$"),p.test(t)?t+="e":v.test(t)?($=/.$/,t=t.replace($,"")):g.test(t)&&(t+="e"))}if($=/^(.+?)y$/,$.test(t)){var f=$.exec(t);l=f[1],$=new RegExp(r),$.test(l)&&(t=l+"i")}if($=/^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/,$.test(t)){var f=$.exec(t);l=f[1],u=f[2],$=new RegExp(n),$.test(l)&&(t=l+e[u])}if($=/^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/,$.test(t)){var f=$.exec(t);l=f[1],u=f[2],$=new RegExp(n),$.test(l)&&(t=l+i[u])}if($=/^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/,p=/^(.+?)(s|t)(ion)$/,$.test(t)){var f=$.exec(t);l=f[1],$=new RegExp(c),$.test(l)&&(t=l)}else if(p.test(t)){var f=p.exec(t);l=f[1]+f[2],p=new RegExp(c),p.test(l)&&(t=l)}if($=/^(.+?)e$/,$.test(t)){var f=$.exec(t);l=f[1],$=new RegExp(c),p=new RegExp(o),v=new RegExp("^"+a+s+"[^aeiouwxy]$"),($.test(l)||p.test(l)&&!v.test(l))&&(t=l)}return $=/ll$/,p=new RegExp(c),$.test(t)&&p.test(t)&&($=/.$/,t=t.replace($,"")),"y"==x&&(t=x.toLowerCase()+t.substr(1)),t}}(); | 
        
          |  |  | 
        
          |  | /** | 
        
          |  | * Stopwords list taken from http://www.ranks.nl/stopwords | 
        
          |  | */ | 
        
          |  |  | 
        
          |  | var stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"]; | 
        
          |  |  | 
        
          |  | var stopStems = stopwords.map(stemmer); | 
        
          |  |  | 
        
          |  | function BM25() { | 
        
          |  | this.documents = {}; | 
        
          |  | this.terms = {}; | 
        
          |  | this.totalDocumentTermLength = 0; // For average doc length. | 
        
          |  | this.averageDocumentLength = 0; | 
        
          |  | this.totalDocuments = 0; | 
        
          |  | this.k1 = 1.3; | 
        
          |  | this.b = 0.75; | 
        
          |  | }; | 
        
          |  |  | 
        
          |  | // Static methods. | 
        
          |  |  | 
        
          |  | BM25.Tokenize = function(text) { | 
        
          |  | text = _.clean(text) | 
        
          |  | .split(' ') | 
        
          |  | .map(function(a) { return stemmer(a); }) | 
        
          |  | console.log(text); | 
        
          |  | window.text=text; | 
        
          |  | // Filter out stopStems | 
        
          |  | var out = []; | 
        
          |  | for (var i = 0, len = text.length; i < len; i++) { | 
        
          |  | if (stopStems.indexOf(text[i]) === -1) { | 
        
          |  | out.push(text[i]); | 
        
          |  | } | 
        
          |  | } | 
        
          |  |  | 
        
          |  | return out; | 
        
          |  | }; | 
        
          |  |  | 
        
          |  | // Instance methods. | 
        
          |  |  | 
        
          |  | /** | 
        
          |  | * @param doc Object Expects this parameter to have an id and body properties. | 
        
          |  | */ | 
        
          |  |  | 
        
          |  | BM25.prototype.addDocument = function(doc, batch) { | 
        
          |  | if (typeof doc.id === 'undefined') { throw new Error(1000, 'ID is a required property of documents.'); }; | 
        
          |  | if (typeof doc.body === 'undefined') { throw new Error(1001, 'Body is a required property of documents.'); }; | 
        
          |  |  | 
        
          |  | // Raw tokenized list of words | 
        
          |  | var tokens = BM25.Tokenize(doc.body).filter( a => a.length>0); | 
        
          |  |  | 
        
          |  | // Will hold unique terms and their counts and frequencies | 
        
          |  | var _terms = {}; | 
        
          |  |  | 
        
          |  | // docObj will eventually be added to the documents database | 
        
          |  | var docObj = {id: doc.id, tokens: tokens, body: doc.body}; | 
        
          |  |  | 
        
          |  | // Count number of terms | 
        
          |  | docObj.termCount = tokens.length; | 
        
          |  |  | 
        
          |  | // Increment totalDocuments | 
        
          |  | this.totalDocuments++; | 
        
          |  |  | 
        
          |  | // Readjust averageDocumentLength | 
        
          |  | this.totalDocumentTermLength += docObj.termCount; | 
        
          |  | this.averageDocumentLength = this.totalDocumentTermLength / this.totalDocuments; | 
        
          |  |  | 
        
          |  | // Calculate term frequency | 
        
          |  | // First get terms count | 
        
          |  | for (var i = 0, len = tokens.length; i < len; i++) { | 
        
          |  | var term = tokens[i]; | 
        
          |  | if (!_terms[term]) { | 
        
          |  | _terms[term] = { | 
        
          |  | count: 0, | 
        
          |  | freq: 0 | 
        
          |  | }; | 
        
          |  | }; | 
        
          |  | _terms[term].count++; | 
        
          |  | } | 
        
          |  |  | 
        
          |  | // Then re-loop to calculate term frequency. | 
        
          |  | // We'll also update inverse document frequencies here. | 
        
          |  | var keys = Object.keys(_terms); | 
        
          |  | for (var i = 0, len = keys.length; i < len; i++) { | 
        
          |  | var term = keys[i]; | 
        
          |  | // Term Frequency for this document. | 
        
          |  | _terms[term].freq = _terms[term].count / docObj.termCount; | 
        
          |  |  | 
        
          |  | // Inverse Document Frequency initialization | 
        
          |  | if (!this.terms[term]) { | 
        
          |  | this.terms[term] = { | 
        
          |  | n: 0, // Number of docs this term appears in, uniquely | 
        
          |  | idf: 0, | 
        
          |  | docs: {} | 
        
          |  | }; | 
        
          |  | } | 
        
          |  |  | 
        
          |  | this.terms[term].n++; | 
        
          |  | this.terms[term].docs[doc.id]=1 | 
        
          |  | }; | 
        
          |  |  | 
        
          |  | // Calculate inverse document frequencies | 
        
          |  | !batch && this.updateIdf(); | 
        
          |  |  | 
        
          |  | // Add docObj to docs db | 
        
          |  | docObj.terms = _terms; | 
        
          |  | this.documents[docObj.id] = docObj; | 
        
          |  | }; | 
        
          |  |  | 
        
          |  | BM25.prototype.updateIdf = function() { | 
        
          |  | var keys = Object.keys(this.terms); | 
        
          |  | for (var i = 0, len = keys.length; i < len; i++) { | 
        
          |  | var term = keys[i]; | 
        
          |  | var num = (this.totalDocuments - this.terms[term].n + 0.5); | 
        
          |  | var denom = (this.terms[term].n + 0.5); | 
        
          |  | this.terms[term].idf = Math.max(Math.log10(num / denom), 0.01); | 
        
          |  | } | 
        
          |  | }; | 
        
          |  |  | 
        
          |  | BM25.prototype.getSortedTerms=function(doc,size){ | 
        
          |  |  | 
        
          |  | if(doc){ | 
        
          |  | if(!this.documents[doc]) return "That document doesn't exist."; | 
        
          |  | size=size||_.keys(this.documents[doc].terms).length | 
        
          |  | return _(this.documents[doc].terms).map((v,k) => ({key:k, freq:v.freq, count:v.count})).sortBy('count').takeRight(size).value().reverse() | 
        
          |  | }else{ | 
        
          |  | size=size||_.keys(this.terms).length | 
        
          |  | return _(this.terms).map((v,k) => ({key:k, idf:v.idf, n:v.n})).sortBy('n').takeRight(size).value().reverse() | 
        
          |  | } | 
        
          |  | } | 
        
          |  |  | 
        
          |  | BM25.prototype.search = function(query, limit) { | 
        
          |  |  | 
        
          |  | var queryTerms = BM25.Tokenize(query); | 
        
          |  | var results = []; | 
        
          |  |  | 
        
          |  | // Look at each document in turn. There are better ways to do this with inverted indices. | 
        
          |  | var keys = Object.keys(this.documents); | 
        
          |  | for (var j = 0, nDocs = keys.length; j < nDocs; j++) { | 
        
          |  | var id = keys[j]; | 
        
          |  | // The relevance score for a document is the sum of a tf-idf-like | 
        
          |  | // calculation for each query term. | 
        
          |  | this.documents[id]._score = 0; | 
        
          |  |  | 
        
          |  | // Calculate the score for each query term | 
        
          |  | for (var i = 0, len = queryTerms.length; i < len; i++) { | 
        
          |  | var queryTerm = queryTerms[i]; | 
        
          |  |  | 
        
          |  | // We've never seen this term before so IDF will be 0. | 
        
          |  | // Means we can skip the whole term, it adds nothing to the score | 
        
          |  | // and isn't in any document. | 
        
          |  | if (typeof this.terms[queryTerm] === 'undefined') { | 
        
          |  | continue; | 
        
          |  | } | 
        
          |  |  | 
        
          |  | // This term isn't in the document, so the TF portion is 0 and this | 
        
          |  | // term contributes nothing to the search score. | 
        
          |  | if (typeof this.documents[id].terms[queryTerm] === 'undefined') { | 
        
          |  | continue; | 
        
          |  | } | 
        
          |  |  | 
        
          |  | // The term is in the document, let's go. | 
        
          |  | // The whole term is : | 
        
          |  | // IDF * (TF * (k1 + 1)) / (TF + k1 * (1 - b + b * docLength / avgDocLength)) | 
        
          |  |  | 
        
          |  | // IDF is pre-calculated for the whole docset. | 
        
          |  | var idf = this.terms[queryTerm].idf; | 
        
          |  | // Numerator of the TF portion. | 
        
          |  | var num = this.documents[id].terms[queryTerm].count * (this.k1 + 1); | 
        
          |  | // Denomerator of the TF portion. | 
        
          |  | var denom = this.documents[id].terms[queryTerm].count | 
        
          |  | + (this.k1 * (1 - this.b + (this.b * this.documents[id].termCount / this.averageDocumentLength))); | 
        
          |  |  | 
        
          |  | // Add this query term to the score | 
        
          |  | this.documents[id]._score += idf * num / denom; | 
        
          |  | } | 
        
          |  |  | 
        
          |  | if (!isNaN(this.documents[id]._score) && this.documents[id]._score > 0) { | 
        
          |  | results.push(this.documents[id]); | 
        
          |  | } | 
        
          |  | } | 
        
          |  |  | 
        
          |  | results.sort(function(a, b) { return b._score - a._score; }); | 
        
          |  | return results.slice(0, limit || 10); | 
        
          |  | }; | 
        
          |  |  | 
        
          |  | var bm = new BM25; | 
        
          |  |  | 
        
          |  | onmessage = function(e) { | 
        
          |  | var payload = e.data; | 
        
          |  | switch (payload.type) { | 
        
          |  |  | 
        
          |  | case 'index-batch': | 
        
          |  | //payload.data = payload.data.slice(0, 1000); | 
        
          |  | var len = payload.data.length; | 
        
          |  | for (var i in payload.data) { | 
        
          |  | bm.addDocument({id: i, body: payload.data[i]}); | 
        
          |  | if (i % 100 === 0) { | 
        
          |  | postMessage({type:"index-update", data: i/len}); | 
        
          |  | } | 
        
          |  | } | 
        
          |  | bm.updateIdf(); | 
        
          |  | postMessage({type:"index-update", data: 1}); | 
        
          |  | break; | 
        
          |  |  | 
        
          |  | case 'search': | 
        
          |  | postMessage({type: "search-results", data: bm.search(payload.data)}); | 
        
          |  | break; | 
        
          |  |  | 
        
          |  | } | 
        
          |  | } | 
        
          |  | function analyzeCollection(arr, id_attr, attrs){ | 
        
          |  | var tfidf = new BM25(); | 
        
          |  | if(!attrs) attrs=Object.keys(arr[0]); | 
        
          |  |  | 
        
          |  | // 	For each element add each of the attributes to a doc | 
        
          |  | _.map( arr, (d,i) =>{ | 
        
          |  | // With the elements id_attr as the id | 
        
          |  | _.each(attrs, attr=> d[attr] ? tfidf.addDocument({id: (d[id_attr] ? d[id_attr] : i), body: d[attr]}, true) : -1 ); | 
        
          |  | }) | 
        
          |  |  | 
        
          |  | tfidf.updateIdf(); | 
        
          |  | return tfidf; | 
        
          |  |  | 
        
          |  | } | 
        
          |  |  | 
        
          |  |  | 
        
          |  | function getScripts(){ | 
        
          |  | js=_.map(document.scripts, d => ({id: (d.src || _.uniqueId()), body: d.innerText})); | 
        
          |  | var tfidf=new BM25(); | 
        
          |  | _.each(js, d => tfidf.addDocument(d, true) ); | 
        
          |  | tfidf.updateIdf(); | 
        
          |  | console.log(tfidf); | 
        
          |  | console.log(tfidf.terms); | 
        
          |  | }; | 
        
          |  |  | 
        
          |  | var repl={ | 
        
          |  | html: ['/</gi','/>/gi','/ /gi'], | 
        
          |  | js_methods: ['console.log('], | 
        
          |  | splits:[ /"/g, /,/g, /\w\.\w/gi, /\s.{1,3}\s/gi, /\W/g, /\s+/g, /\s[0-9]*\s/g, /\s[0-9]*px\s/g] | 
        
          |  | } | 
        
          |  |  | 
        
          |  | var filters = ["nbsp","lt","gt","init","Array","Date","eval","function","hasOwnProperty","Infinity","isFinite","isNaN","isPrototypeOf","length","Math","NaN","name","Number","Object","prototype","String","toString","undefined","valueOf","abstract", "arguments", "boolean", "break", "byte", "case", "catch", "char", "class", "const", "continue", "debugger", "default", "delete", "do", "double", "else", "enum", "eval", "export", "extends*", "false", "final", "finally", "float", "for", "function", "goto", "if", "implements", "import", "in", "instanceof", "int", "interface", "let", "long", "native", "new", "null", "package", "private", "protected", "public", "return", "short", "static", "super", "switch", "synchronized", "this", "throw", "throws", "transient", "true", "try", "typeof", "var", "void", "volatile", "while", "with", "yield"]; | 
        
          |  | function replaceWith(replace_regex, replace_val){ | 
        
          |  | return _.partial(_.curryReplace,replace_regex, replace_val); | 
        
          |  | } | 
        
          |  | _.curryReplace = function (rgxWhat, rplWith, text) { | 
        
          |  | return String(text).replace(rgxWhat, rplWith); | 
        
          |  | }; | 
        
          |  |  | 
        
          |  | _.mixin({ | 
        
          |  | replaceFlow: (v) => _.partial(_.replaceWith,_,v), | 
        
          |  | replaceWith: (replace_regex, replace_val) =>{ | 
        
          |  | return _.partial(_.curryReplace,replace_regex, replace_val); | 
        
          |  | }, | 
        
          |  | curryReplace: (rgxWhat, rplWith, text) =>{ | 
        
          |  | return String(text).replace(rgxWhat, rplWith); | 
        
          |  | }, | 
        
          |  | isntEmpty:_.negate(_.isEmpty), | 
        
          |  | clean: function(s){ | 
        
          |  | s = _.toLower(s); | 
        
          |  | var replacements = _.flatten(_.values(repl)); | 
        
          |  | replacements = _.map(replacements,_.replaceFlow(' ')); | 
        
          |  | var res = _.flow(replacements)(s) | 
        
          |  | .replace(/\s.{1,3}\s/gi, " ") | 
        
          |  | .replace(/\s.{1,3}\s/gi, " ") | 
        
          |  | .replace(/\s.{1,3}\s/gi, " "); | 
        
          |  | res=_.pullAll(res.split(' '),filters).join(' '); | 
        
          |  | return res; | 
        
          |  | } | 
        
          |  | }) | 
        
          |  |  | 
        
          |  | function analyzePage(){ | 
        
          |  | var tfidf = new BM25(); | 
        
          |  |  | 
        
          |  | x=document.body.innerHTML.replace(/(<([^>]+)>)/ig,"") | 
        
          |  | x=_.flow(_.replaceFlow(' ')(repl.html))(x) | 
        
          |  | .replace(/[\(\){}[]"'`~]*/g,' ') | 
        
          |  | .replace(/\s+/g,' ') | 
        
          |  | .replace(/\s.{1,3}\s/gi, " ") | 
        
          |  | .replace(/\s.{1,3}\s/gi, " ") | 
        
          |  | .replace(/\s.{1,3}\s/gi, " ") | 
        
          |  |  | 
        
          |  |  | 
        
          |  | tfidf.addDocument({id: "html",body:_.clean(x)}, true); | 
        
          |  | tfidf.updateIdf(); | 
        
          |  | return tfidf | 
        
          |  | } |