PseudoSky · November 9, 2016 21:52
diff --git a/README.md b/README.md
diff --git a/contextualizer.js b/contextualizer.js
 /**
 * Okapi BM25 Implementation
 * Burak Kanber, 2015
 *
 * License: you may use this for educational purposes only.
 */

 /**
 * Porter stemmer
 *
 * I took the porter stemmer from here and minified it: http://tartarus.org/~martin/PorterStemmer/js.txt
 * 
 * Original comments on the porter stemmer source: 

 // Porter stemmer in Javascript. Few comments, but it's easy to follow against the rules in the original
 // paper, in
 //
 //  Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
 //  no. 3, pp 130-137,
 //
 // see also http://www.tartarus.org/~martin/PorterStemmer

 // Release 1 be 'andargor', Jul 2004
 // Release 2 (substantially revised) by Christopher McKenzie, Aug 2009
 */

 var stemmer=function(){var e={ational:"ate",tional:"tion",enci:"ence",anci:"ance",izer:"ize",bli:"ble",alli:"al",entli:"ent",eli:"e",ousli:"ous",ization:"ize",ation:"ate",ator:"ate",alism:"al",iveness:"ive",fulness:"ful",ousness:"ous",aliti:"al",iviti:"ive",biliti:"ble",logi:"log"},i={icate:"ic",ative:"",alize:"al",iciti:"ic",ical:"ic",ful:"",ness:""},t="[^aeiou]",s="[aeiouy]",a=t+"[^aeiouy]*",l=s+"[aeiou]*",n="^("+a+")?"+l+a,o="^("+a+")?"+l+a+"("+l+")?$",c="^("+a+")?"+l+a+l+a,r="^("+a+")?"+s;return function(t){var l,u,x,$,p,v,g;if(t.length<3)return t;if(x=t.substr(0,1),"y"==x&&(t=x.toUpperCase()+t.substr(1)),$=/^(.+?)(ss|i)es$/,p=/^(.+?)([^s])s$/,$.test(t)?t=t.replace($,"$1$2"):p.test(t)&&(t=t.replace(p,"$1$2")),$=/^(.+?)eed$/,p=/^(.+?)(ed|ing)$/,$.test(t)){var f=$.exec(t);$=new RegExp(n),$.test(f[1])&&($=/.$/,t=t.replace($,""))}else if(p.test(t)){var f=p.exec(t);l=f[1],p=new RegExp(r),p.test(l)&&(t=l,p=/(at|bl|iz)$/,v=new RegExp("([^aeiouylsz])\\1$"),g=new RegExp("^"+a+s+"[^aeiouwxy]$"),p.test(t)?t+="e":v.test(t)?($=/.$/,t=t.replace($,"")):g.test(t)&&(t+="e"))}if($=/^(.+?)y$/,$.test(t)){var f=$.exec(t);l=f[1],$=new RegExp(r),$.test(l)&&(t=l+"i")}if($=/^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/,$.test(t)){var f=$.exec(t);l=f[1],u=f[2],$=new RegExp(n),$.test(l)&&(t=l+e[u])}if($=/^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/,$.test(t)){var f=$.exec(t);l=f[1],u=f[2],$=new RegExp(n),$.test(l)&&(t=l+i[u])}if($=/^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/,p=/^(.+?)(s|t)(ion)$/,$.test(t)){var f=$.exec(t);l=f[1],$=new RegExp(c),$.test(l)&&(t=l)}else if(p.test(t)){var f=p.exec(t);l=f[1]+f[2],p=new RegExp(c),p.test(l)&&(t=l)}if($=/^(.+?)e$/,$.test(t)){var f=$.exec(t);l=f[1],$=new RegExp(c),p=new RegExp(o),v=new RegExp("^"+a+s+"[^aeiouwxy]$"),($.test(l)||p.test(l)&&!v.test(l))&&(t=l)}return $=/ll$/,p=new RegExp(c),$.test(t)&&p.test(t)&&($=/.$/,t=t.replace($,"")),"y"==x&&(t=x.toLowerCase()+t.substr(1)),t}}();

 /**
 * Stopwords list taken from http://www.ranks.nl/stopwords
 */

 var stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"];

 var stopStems = stopwords.map(stemmer);

 function BM25() {
 	this.documents = {};
 	this.terms = {};
 	this.totalDocumentTermLength = 0; // For average doc length.
 	this.averageDocumentLength = 0;
 	this.totalDocuments = 0;
 	this.k1 = 1.3;
 	this.b = 0.75;
 };

 // Static methods.

 BM25.Tokenize = function(text) {
 	text = _.clean(text)
 		    .split(' ')
 		    .map(function(a) { return stemmer(a); })
    console.log(text);
    window.text=text;
 	// Filter out stopStems
 	var out = [];
 	for (var i = 0, len = text.length; i < len; i++) {
 		if (stopStems.indexOf(text[i]) === -1) {
 			out.push(text[i]);
 		}
 	}

 	return out;
 };

 // Instance methods.

 /**
  * @param doc Object Expects this parameter to have an id and body properties.
  */

 BM25.prototype.addDocument = function(doc, batch) {
 	if (typeof doc.id === 'undefined') { throw new Error(1000, 'ID is a required property of documents.'); };
 	if (typeof doc.body === 'undefined') { throw new Error(1001, 'Body is a required property of documents.'); };

 	// Raw tokenized list of words
 	var tokens = BM25.Tokenize(doc.body).filter( a => a.length>0);

 	// Will hold unique terms and their counts and frequencies
 	var _terms = {};

 	// docObj will eventually be added to the documents database
 	var docObj = {id: doc.id, tokens: tokens, body: doc.body};

 	// Count number of terms
 	docObj.termCount = tokens.length;

 	// Increment totalDocuments
 	this.totalDocuments++;

 	// Readjust averageDocumentLength
 	this.totalDocumentTermLength += docObj.termCount;
 	this.averageDocumentLength = this.totalDocumentTermLength / this.totalDocuments;

 	// Calculate term frequency
 	// First get terms count
 	for (var i = 0, len = tokens.length; i < len; i++) {
 		var term = tokens[i];
 		if (!_terms[term]) { 
 			_terms[term] = {
 				count: 0,
 				freq: 0
 			}; 
 		};
 		_terms[term].count++;
 	}

 	// Then re-loop to calculate term frequency.
 	// We'll also update inverse document frequencies here.
 	var keys = Object.keys(_terms);
 	for (var i = 0, len = keys.length; i < len; i++) {
 		var term = keys[i];
 		// Term Frequency for this document.
 		_terms[term].freq = _terms[term].count / docObj.termCount;

 		// Inverse Document Frequency initialization
 		if (!this.terms[term]) {
 			this.terms[term] = {
 				n: 0, // Number of docs this term appears in, uniquely
 				idf: 0,
 				docs: {}
 			};
 		}

 		this.terms[term].n++;
 		this.terms[term].docs[doc.id]=1
 	};

 	// Calculate inverse document frequencies
 	!batch && this.updateIdf();

 	// Add docObj to docs db
 	docObj.terms = _terms;
 	this.documents[docObj.id] = docObj;
 };

 BM25.prototype.updateIdf = function() {
 	var keys = Object.keys(this.terms);
 	for (var i = 0, len = keys.length; i < len; i++) {
 		var term = keys[i];
 		var num = (this.totalDocuments - this.terms[term].n + 0.5);
 		var denom = (this.terms[term].n + 0.5);
 		this.terms[term].idf = Math.max(Math.log10(num / denom), 0.01);
 	}
 };

 BM25.prototype.getSortedTerms=function(doc,size){

    if(doc){
        if(!this.documents[doc]) return "That document doesn't exist.";
        size=size||_.keys(this.documents[doc].terms).length
        return _(this.documents[doc].terms).map((v,k) => ({key:k, freq:v.freq, count:v.count})).sortBy('count').takeRight(size).value().reverse()
    }else{
        size=size||_.keys(this.terms).length
        return _(this.terms).map((v,k) => ({key:k, idf:v.idf, n:v.n})).sortBy('n').takeRight(size).value().reverse()
    }
 }

 BM25.prototype.search = function(query, limit) {

 	var queryTerms = BM25.Tokenize(query);
 	var results = [];

 	// Look at each document in turn. There are better ways to do this with inverted indices.
 	var keys = Object.keys(this.documents);
 	for (var j = 0, nDocs = keys.length; j < nDocs; j++) {
 		var id = keys[j];
 		// The relevance score for a document is the sum of a tf-idf-like
 		// calculation for each query term.
 		this.documents[id]._score = 0;

 		// Calculate the score for each query term
 		for (var i = 0, len = queryTerms.length; i < len; i++) {
 			var queryTerm = queryTerms[i];

 			// We've never seen this term before so IDF will be 0.
 			// Means we can skip the whole term, it adds nothing to the score
 			// and isn't in any document.
 			if (typeof this.terms[queryTerm] === 'undefined') {
 				continue;
 			}

 			// This term isn't in the document, so the TF portion is 0 and this
 			// term contributes nothing to the search score.
 			if (typeof this.documents[id].terms[queryTerm] === 'undefined') {
 				continue;
 			}

 			// The term is in the document, let's go.
 			// The whole term is :
 			// IDF * (TF * (k1 + 1)) / (TF + k1 * (1 - b + b * docLength / avgDocLength))

 			// IDF is pre-calculated for the whole docset.
 			var idf = this.terms[queryTerm].idf;
 			// Numerator of the TF portion.
 			var num = this.documents[id].terms[queryTerm].count * (this.k1 + 1);
 			// Denomerator of the TF portion.
 			var denom = this.documents[id].terms[queryTerm].count 
 				+ (this.k1 * (1 - this.b + (this.b * this.documents[id].termCount / this.averageDocumentLength)));

 			// Add this query term to the score
 			this.documents[id]._score += idf * num / denom;
 		}

 		if (!isNaN(this.documents[id]._score) && this.documents[id]._score > 0) {
 			results.push(this.documents[id]);
 		}
 	}

 	results.sort(function(a, b) { return b._score - a._score; });
 	return results.slice(0, limit || 10);
 };

 var bm = new BM25;

 onmessage = function(e) {
 	var payload = e.data;
 	switch (payload.type) {

 		case 'index-batch':
 			//payload.data = payload.data.slice(0, 1000);
 			var len = payload.data.length;
 			for (var i in payload.data) {
 				bm.addDocument({id: i, body: payload.data[i]});
 				if (i % 100 === 0) {
 					postMessage({type:"index-update", data: i/len});
 				}
 			}
 			bm.updateIdf();
 			postMessage({type:"index-update", data: 1});
 			break;

 		case 'search':
 			postMessage({type: "search-results", data: bm.search(payload.data)});
 			break;

 	}
 }
 function analyzeCollection(arr, id_attr, attrs){
 	var tfidf = new BM25();
 	if(!attrs) attrs=Object.keys(arr[0]);

    // 	For each element add each of the attributes to a doc
 	_.map( arr, (d,i) =>{
 		// With the elements id_attr as the id
 		_.each(attrs, attr=> d[attr] ? tfidf.addDocument({id: (d[id_attr] ? d[id_attr] : i), body: d[attr]}, true) : -1 );		
 	})

    tfidf.updateIdf();
 	return tfidf;
  
 }


 function getScripts(){
  js=_.map(document.scripts, d => ({id: (d.src || _.uniqueId()), body: d.innerText}));
  var tfidf=new BM25();
  _.each(js, d => tfidf.addDocument(d, true) );
  tfidf.updateIdf();
  console.log(tfidf);
  console.log(tfidf.terms);
 };

 var repl={
    html: ['/&lt;/gi','/&gt;/gi','/&nbsp;/gi'],
    js_methods: ['console.log('],
    splits:[ /"/g, /,/g, /\w\.\w/gi, /\s.{1,3}\s/gi, /\W/g, /\s+/g, /\s[0-9]*\s/g, /\s[0-9]*px\s/g]
 }

 var filters = ["nbsp","lt","gt","init","Array","Date","eval","function","hasOwnProperty","Infinity","isFinite","isNaN","isPrototypeOf","length","Math","NaN","name","Number","Object","prototype","String","toString","undefined","valueOf","abstract", "arguments", "boolean", "break", "byte", "case", "catch", "char", "class", "const", "continue", "debugger", "default", "delete", "do", "double", "else", "enum", "eval", "export", "extends*", "false", "final", "finally", "float", "for", "function", "goto", "if", "implements", "import", "in", "instanceof", "int", "interface", "let", "long", "native", "new", "null", "package", "private", "protected", "public", "return", "short", "static", "super", "switch", "synchronized", "this", "throw", "throws", "transient", "true", "try", "typeof", "var", "void", "volatile", "while", "with", "yield"];
 function replaceWith(replace_regex, replace_val){
    return _.partial(_.curryReplace,replace_regex, replace_val);
 }
 _.curryReplace = function (rgxWhat, rplWith, text) {
  return String(text).replace(rgxWhat, rplWith);
 };

 _.mixin({
    replaceFlow: (v) => _.partial(_.replaceWith,_,v), 
    replaceWith: (replace_regex, replace_val) =>{
       return _.partial(_.curryReplace,replace_regex, replace_val);
    },
    curryReplace: (rgxWhat, rplWith, text) =>{
        return String(text).replace(rgxWhat, rplWith);
    },
    isntEmpty:_.negate(_.isEmpty),
    clean: function(s){
        s = _.toLower(s);
        var replacements = _.flatten(_.values(repl));
        replacements = _.map(replacements,_.replaceFlow(' '));
        var res = _.flow(replacements)(s)
                   .replace(/\s.{1,3}\s/gi, " ")
                   .replace(/\s.{1,3}\s/gi, " ")
                   .replace(/\s.{1,3}\s/gi, " ");
        res=_.pullAll(res.split(' '),filters).join(' ');
        return res;
    }
 })

 function analyzePage(){
    var tfidf = new BM25();
    
    x=document.body.innerHTML.replace(/(<([^>]+)>)/ig,"")
    x=_.flow(_.replaceFlow(' ')(repl.html))(x)
                   .replace(/[\(\){}[]"'`~]*/g,' ')
                   .replace(/\s+/g,' ')
                   .replace(/\s.{1,3}\s/gi, " ")
                   .replace(/\s.{1,3}\s/gi, " ")
                   .replace(/\s.{1,3}\s/gi, " ")
    
    
    tfidf.addDocument({id: "html",body:_.clean(x)}, true);
    tfidf.updateIdf();
    return tfidf
 }
	/**
	* Okapi BM25 Implementation
	* Burak Kanber, 2015
	*
	* License: you may use this for educational purposes only.
	*/

	/**
	* Porter stemmer
	*
	* I took the porter stemmer from here and minified it: http://tartarus.org/~martin/PorterStemmer/js.txt
	*
	* Original comments on the porter stemmer source:

	// Porter stemmer in Javascript. Few comments, but it's easy to follow against the rules in the original
	// paper, in
	//
	// Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
	// no. 3, pp 130-137,
	//
	// see also http://www.tartarus.org/~martin/PorterStemmer

	// Release 1 be 'andargor', Jul 2004
	// Release 2 (substantially revised) by Christopher McKenzie, Aug 2009
	*/

	var stemmer=function(){var e={ational:"ate",tional:"tion",enci:"ence",anci:"ance",izer:"ize",bli:"ble",alli:"al",entli:"ent",eli:"e",ousli:"ous",ization:"ize",ation:"ate",ator:"ate",alism:"al",iveness:"ive",fulness:"ful",ousness:"ous",aliti:"al",iviti:"ive",biliti:"ble",logi:"log"},i={icate:"ic",ative:"",alize:"al",iciti:"ic",ical:"ic",ful:"",ness:""},t="[^aeiou]",s="[aeiouy]",a=t+"[^aeiouy]",l=s+"[aeiou]",n="^("+a+")?"+l+a,o="^("+a+")?"+l+a+"("+l+")?$",c="^("+a+")?"+l+a+l+a,r="^("+a+")?"+s;return function(t){var l,u,x,$,p,v,g;if(t.length<3)return t;if(x=t.substr(0,1),"y"==x&&(t=x.toUpperCase()+t.substr(1)),$=/^(.+?)(ss\|i)es$/,p=/^(.+?)([^s])s$/,$.test(t)?t=t.replace($,"$1$2"):p.test(t)&&(t=t.replace(p,"$1$2")),$=/^(.+?)eed$/,p=/^(.+?)(ed\|ing)$/,$.test(t)){var f=$.exec(t);$=new RegExp(n),$.test(f[1])&&($=/.$/,t=t.replace($,""))}else if(p.test(t)){var f=p.exec(t);l=f[1],p=new RegExp(r),p.test(l)&&(t=l,p=/(at\|bl\|iz)$/,v=new RegExp("([^aeiouylsz])\\1$"),g=new RegExp("^"+a+s+"[^aeiouwxy]$"),p.test(t)?t+="e":v.test(t)?($=/.$/,t=t.replace($,"")):g.test(t)&&(t+="e"))}if($=/^(.+?)y$/,$.test(t)){var f=$.exec(t);l=f[1],$=new RegExp(r),$.test(l)&&(t=l+"i")}if($=/^(.+?)(ational\|tional\|enci\|anci\|izer\|bli\|alli\|entli\|eli\|ousli\|ization\|ation\|ator\|alism\|iveness\|fulness\|ousness\|aliti\|iviti\|biliti\|logi)$/,$.test(t)){var f=$.exec(t);l=f[1],u=f[2],$=new RegExp(n),$.test(l)&&(t=l+e[u])}if($=/^(.+?)(icate\|ative\|alize\|iciti\|ical\|ful\|ness)$/,$.test(t)){var f=$.exec(t);l=f[1],u=f[2],$=new RegExp(n),$.test(l)&&(t=l+i[u])}if($=/^(.+?)(al\|ance\|ence\|er\|ic\|able\|ible\|ant\|ement\|ment\|ent\|ou\|ism\|ate\|iti\|ous\|ive\|ize)$/,p=/^(.+?)(s\|t)(ion)$/,$.test(t)){var f=$.exec(t);l=f[1],$=new RegExp(c),$.test(l)&&(t=l)}else if(p.test(t)){var f=p.exec(t);l=f[1]+f[2],p=new RegExp(c),p.test(l)&&(t=l)}if($=/^(.+?)e$/,$.test(t)){var f=$.exec(t);l=f[1],$=new RegExp(c),p=new RegExp(o),v=new RegExp("^"+a+s+"[^aeiouwxy]$"),($.test(l)\|\|p.test(l)&&!v.test(l))&&(t=l)}return $=/ll$/,p=new RegExp(c),$.test(t)&&p.test(t)&&($=/.$/,t=t.replace($,"")),"y"==x&&(t=x.toLowerCase()+t.substr(1)),t}}();

	/**
	* Stopwords list taken from http://www.ranks.nl/stopwords
	*/

	var stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"];

	var stopStems = stopwords.map(stemmer);

	function BM25() {
	this.documents = {};
	this.terms = {};
	this.totalDocumentTermLength = 0; // For average doc length.
	this.averageDocumentLength = 0;
	this.totalDocuments = 0;
	this.k1 = 1.3;
	this.b = 0.75;
	};

	// Static methods.

	BM25.Tokenize = function(text) {
	text = _.clean(text)
	.split(' ')
	.map(function(a) { return stemmer(a); })
	console.log(text);
	window.text=text;
	// Filter out stopStems
	var out = [];
	for (var i = 0, len = text.length; i < len; i++) {
	if (stopStems.indexOf(text[i]) === -1) {
	out.push(text[i]);
	}
	}

	return out;
	};

	// Instance methods.

	/**
	* @param doc Object Expects this parameter to have an id and body properties.
	*/

	BM25.prototype.addDocument = function(doc, batch) {
	if (typeof doc.id === 'undefined') { throw new Error(1000, 'ID is a required property of documents.'); };
	if (typeof doc.body === 'undefined') { throw new Error(1001, 'Body is a required property of documents.'); };

	// Raw tokenized list of words
	var tokens = BM25.Tokenize(doc.body).filter( a => a.length>0);

	// Will hold unique terms and their counts and frequencies
	var _terms = {};

	// docObj will eventually be added to the documents database
	var docObj = {id: doc.id, tokens: tokens, body: doc.body};

	// Count number of terms
	docObj.termCount = tokens.length;

	// Increment totalDocuments
	this.totalDocuments++;

	// Readjust averageDocumentLength
	this.totalDocumentTermLength += docObj.termCount;
	this.averageDocumentLength = this.totalDocumentTermLength / this.totalDocuments;

	// Calculate term frequency
	// First get terms count
	for (var i = 0, len = tokens.length; i < len; i++) {
	var term = tokens[i];
	if (!_terms[term]) {
	_terms[term] = {
	count: 0,
	freq: 0
	};
	};
	_terms[term].count++;
	}

	// Then re-loop to calculate term frequency.
	// We'll also update inverse document frequencies here.
	var keys = Object.keys(_terms);
	for (var i = 0, len = keys.length; i < len; i++) {
	var term = keys[i];
	// Term Frequency for this document.
	_terms[term].freq = _terms[term].count / docObj.termCount;

	// Inverse Document Frequency initialization
	if (!this.terms[term]) {
	this.terms[term] = {
	n: 0, // Number of docs this term appears in, uniquely
	idf: 0,
	docs: {}
	};
	}

	this.terms[term].n++;
	this.terms[term].docs[doc.id]=1
	};

	// Calculate inverse document frequencies
	!batch && this.updateIdf();

	// Add docObj to docs db
	docObj.terms = _terms;
	this.documents[docObj.id] = docObj;
	};

	BM25.prototype.updateIdf = function() {
	var keys = Object.keys(this.terms);
	for (var i = 0, len = keys.length; i < len; i++) {
	var term = keys[i];
	var num = (this.totalDocuments - this.terms[term].n + 0.5);
	var denom = (this.terms[term].n + 0.5);
	this.terms[term].idf = Math.max(Math.log10(num / denom), 0.01);
	}
	};

	BM25.prototype.getSortedTerms=function(doc,size){

	if(doc){
	if(!this.documents[doc]) return "That document doesn't exist.";
	size=size\|\|_.keys(this.documents[doc].terms).length
	return _(this.documents[doc].terms).map((v,k) => ({key:k, freq:v.freq, count:v.count})).sortBy('count').takeRight(size).value().reverse()
	}else{
	size=size\|\|_.keys(this.terms).length
	return _(this.terms).map((v,k) => ({key:k, idf:v.idf, n:v.n})).sortBy('n').takeRight(size).value().reverse()
	}
	}

	BM25.prototype.search = function(query, limit) {

	var queryTerms = BM25.Tokenize(query);
	var results = [];

	// Look at each document in turn. There are better ways to do this with inverted indices.
	var keys = Object.keys(this.documents);
	for (var j = 0, nDocs = keys.length; j < nDocs; j++) {
	var id = keys[j];
	// The relevance score for a document is the sum of a tf-idf-like
	// calculation for each query term.
	this.documents[id]._score = 0;

	// Calculate the score for each query term
	for (var i = 0, len = queryTerms.length; i < len; i++) {
	var queryTerm = queryTerms[i];

	// We've never seen this term before so IDF will be 0.
	// Means we can skip the whole term, it adds nothing to the score
	// and isn't in any document.
	if (typeof this.terms[queryTerm] === 'undefined') {
	continue;
	}

	// This term isn't in the document, so the TF portion is 0 and this
	// term contributes nothing to the search score.
	if (typeof this.documents[id].terms[queryTerm] === 'undefined') {
	continue;
	}

	// The term is in the document, let's go.
	// The whole term is :
	// IDF * (TF * (k1 + 1)) / (TF + k1 * (1 - b + b * docLength / avgDocLength))

	// IDF is pre-calculated for the whole docset.
	var idf = this.terms[queryTerm].idf;
	// Numerator of the TF portion.
	var num = this.documents[id].terms[queryTerm].count * (this.k1 + 1);
	// Denomerator of the TF portion.
	var denom = this.documents[id].terms[queryTerm].count
	+ (this.k1 * (1 - this.b + (this.b * this.documents[id].termCount / this.averageDocumentLength)));

	// Add this query term to the score
	this.documents[id]._score += idf * num / denom;
	}

	if (!isNaN(this.documents[id]._score) && this.documents[id]._score > 0) {
	results.push(this.documents[id]);
	}
	}

	results.sort(function(a, b) { return b._score - a._score; });
	return results.slice(0, limit \|\| 10);
	};

	var bm = new BM25;

	onmessage = function(e) {
	var payload = e.data;
	switch (payload.type) {

	case 'index-batch':
	//payload.data = payload.data.slice(0, 1000);
	var len = payload.data.length;
	for (var i in payload.data) {
	bm.addDocument({id: i, body: payload.data[i]});
	if (i % 100 === 0) {
	postMessage({type:"index-update", data: i/len});
	}
	}
	bm.updateIdf();
	postMessage({type:"index-update", data: 1});
	break;

	case 'search':
	postMessage({type: "search-results", data: bm.search(payload.data)});
	break;

	}
	}
	function analyzeCollection(arr, id_attr, attrs){
	var tfidf = new BM25();
	if(!attrs) attrs=Object.keys(arr[0]);

	// For each element add each of the attributes to a doc
	_.map( arr, (d,i) =>{
	// With the elements id_attr as the id
	_.each(attrs, attr=> d[attr] ? tfidf.addDocument({id: (d[id_attr] ? d[id_attr] : i), body: d[attr]}, true) : -1 );
	})

	tfidf.updateIdf();
	return tfidf;

	}


	function getScripts(){
	js=_.map(document.scripts, d => ({id: (d.src \|\| _.uniqueId()), body: d.innerText}));
	var tfidf=new BM25();
	_.each(js, d => tfidf.addDocument(d, true) );
	tfidf.updateIdf();
	console.log(tfidf);
	console.log(tfidf.terms);
	};

	var repl={
	html: ['/</gi','/>/gi','/ /gi'],
	js_methods: ['console.log('],
	splits:[ /"/g, /,/g, /\w\.\w/gi, /\s.{1,3}\s/gi, /\W/g, /\s+/g, /\s[0-9]\s/g, /\s[0-9]px\s/g]
	}

	var filters = ["nbsp","lt","gt","init","Array","Date","eval","function","hasOwnProperty","Infinity","isFinite","isNaN","isPrototypeOf","length","Math","NaN","name","Number","Object","prototype","String","toString","undefined","valueOf","abstract", "arguments", "boolean", "break", "byte", "case", "catch", "char", "class", "const", "continue", "debugger", "default", "delete", "do", "double", "else", "enum", "eval", "export", "extends*", "false", "final", "finally", "float", "for", "function", "goto", "if", "implements", "import", "in", "instanceof", "int", "interface", "let", "long", "native", "new", "null", "package", "private", "protected", "public", "return", "short", "static", "super", "switch", "synchronized", "this", "throw", "throws", "transient", "true", "try", "typeof", "var", "void", "volatile", "while", "with", "yield"];
	function replaceWith(replace_regex, replace_val){
	return _.partial(_.curryReplace,replace_regex, replace_val);
	}
	_.curryReplace = function (rgxWhat, rplWith, text) {
	return String(text).replace(rgxWhat, rplWith);
	};

	_.mixin({
	replaceFlow: (v) => _.partial(_.replaceWith,_,v),
	replaceWith: (replace_regex, replace_val) =>{
	return _.partial(_.curryReplace,replace_regex, replace_val);
	},
	curryReplace: (rgxWhat, rplWith, text) =>{
	return String(text).replace(rgxWhat, rplWith);
	},
	isntEmpty:_.negate(_.isEmpty),
	clean: function(s){
	s = _.toLower(s);
	var replacements = _.flatten(_.values(repl));
	replacements = _.map(replacements,_.replaceFlow(' '));
	var res = _.flow(replacements)(s)
	.replace(/\s.{1,3}\s/gi, " ")
	.replace(/\s.{1,3}\s/gi, " ")
	.replace(/\s.{1,3}\s/gi, " ");
	res=_.pullAll(res.split(' '),filters).join(' ');
	return res;
	}
	})

	function analyzePage(){
	var tfidf = new BM25();

	x=document.body.innerHTML.replace(/(<([^>]+)>)/ig,"")
	x=_.flow(_.replaceFlow(' ')(repl.html))(x)
	.replace(/[\(\){}[]"'`~]*/g,' ')
	.replace(/\s+/g,' ')
	.replace(/\s.{1,3}\s/gi, " ")
	.replace(/\s.{1,3}\s/gi, " ")
	.replace(/\s.{1,3}\s/gi, " ")


	tfidf.addDocument({id: "html",body:_.clean(x)}, true);
	tfidf.updateIdf();
	return tfidf
	}