Last active
October 29, 2019 16:01
-
-
Save dsottimano/2af808b30de1bec51506e0855141dceb to your computer and use it in GitHub Desktop.
Keyword frequency table generator - apps script - google sheets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Returns a table of ngrams and their importance
 *
 * The input is reduced to lower-case words made of letters, apostrophes and
 * hyphens. For every phrase length from one word up to numberOfWords, the
 * phrases that occur at least numberOccurances times are listed, most
 * frequent first, together with their count and their share of the total
 * word count as a percentage ("Relativity").
 *
 * @param {"cars are the best"} textArray REQUIRED The corpus you want statistics from
 * @param {"3"} numberOccurances OPTIONAL Show results with at least X occurrences. Default is 2
 * @param {"4"} numberOfWords OPTIONAL Show statistics for one to X words. Default is 5
 * @param {"false"} removeStopWords OPTIONAL true or false. False by default
 * @return Rows of [phrase, count, relativity] grouped under one header row per
 *   phrase length, or the text "Sorry, not enough data" when nothing qualifies.
 * @customfunction
 */
function KEYWORD_FREQUENCY_TABLE(textArray, numberOccurances, numberOfWords, removeStopWords) {
  try {
    if (textArray == null) return "Sorry, not enough data";

    // Join array input with a space so the last word of one cell does not
    // fuse with the first word of the next; coerce anything else to a string.
    const rawText = Array.isArray(textArray)
      ? textArray.flat(Infinity).join(' ')
      : String(textArray);

    const atLeast = Number(numberOccurances) || 2; // Show results with at least .. occurrences
    const numWords = Number(numberOfWords) || 5; // Show statistics for one to .. words

    // The text "false" is truthy, so treat it explicitly as "off".
    const shouldRemoveStopWords =
      Boolean(removeStopWords) && String(removeStopWords).trim().toLowerCase() !== 'false';

    // Every run of characters other than letters, apostrophes and hyphens
    // becomes a single space; matching is case-insensitive.
    let cleaned = rawText.replace(/[^a-zA-Z'\-]+/g, ' ').trim().toLowerCase();

    // Stop words are removed AFTER normalisation so that "The" and "the,"
    // are matched against the lower-case stop word list as well.
    if (shouldRemoveStopWords) cleaned = String(remove_stopwords(cleaned));

    const tokens = cleaned.split(/\s+/).filter(Boolean);
    const total = tokens.length;
    if (total === 0) return "Sorry, not enough data";

    // A phrase cannot be longer than the text itself.
    const maxSize = Math.min(numWords, total);

    // countsBySize[n] maps each n-word phrase to its number of occurrences.
    // Maps (not plain objects) are used so that words such as "constructor"
    // do not collide with inherited Object.prototype members.
    const countsBySize = [];
    for (let n = 0; n <= maxSize; n++) {
      countsBySize.push(new Map());
    }

    for (let start = 0; start < total; start++) {
      let phrase = '';
      // Grow the phrase one word at a time from this starting position.
      for (let n = 1; n <= maxSize && start + n <= total; n++) {
        phrase = n === 1 ? tokens[start] : phrase + ' ' + tokens[start + n - 1];
        const counts = countsBySize[n];
        counts.set(phrase, (counts.get(phrase) || 0) + 1);
      }
    }

    const output = [];
    for (let n = 1; n <= maxSize; n++) {
      const rows = [];
      countsBySize[n].forEach(function (count, word) {
        if (count >= atLeast) rows.push({ word: word, count: count });
      });
      if (rows.length === 0) continue;

      // Most frequent first; ties keep first-appearance order.
      rows.sort(function (a, b) {
        return b.count - a.count;
      });

      // Blank spacer rows separate the sections and set off each header.
      if (output.length > 0) output.push(['', '', '']);
      output.push([n + ' word' + (n === 1 ? '' : 's'), 'Count', 'Relativity']);
      output.push(['', '', '']);

      rows.forEach(function (row) {
        // Percentage of the total word count, rounded to two decimals.
        output.push([row.word, row.count, Math.round(row.count / total * 10000) / 100]);
      });
    }

    if (output.length < 1) return "Sorry, not enough data";
    return output;
  } catch (e) {
    Logger.log(e);
    return e;
  }
}
// Adapted from https://stackoverflow.com/a/57153507/2121455
/**
 * Removes common English stop words from a piece of text.
 *
 * The text is split on single spaces; every piece that matches an entry of
 * the stop word list (compared case-insensitively) is dropped and the
 * remaining pieces are joined with single spaces again. Pieces that are kept
 * retain their original casing.
 *
 * @param {*} str The text to filter; non-string values are converted to a
 *   string, null and undefined are treated as empty text.
 * @return {string} The text without stop words.
 */
function remove_stopwords(str) {
  const stopwords = new Set(['i','me','my','myself','we','our','ours','ourselves','you','your','yours','yourself','yourselves','he','him','his','himself','she','her','hers','herself','it','its','itself','they','them','their','theirs','themselves','what','which','who','whom','this','that','these','those','am','is','are','was','were','be','been','being','have','has','had','having','do','does','did','doing','a','an','the','and','but','if','or','because','as','until','while','of','at','by','for','with','about','against','between','into','through','during','before','after','above','below','to','from','up','down','in','out','on','off','over','under','again','further','then','once','here','there','when','where','why','how','all','any','both','each','few','more','most','other','some','such','no','nor','not','only','own','same','so','than','too','very','s','t','can','will','just','don','should','now']);
  const text = str == null ? '' : String(str);
  return text
    .split(' ')
    // The list is all lower case, so compare against the lower-cased word.
    .filter(function (word) {
      return !stopwords.has(word.toLowerCase());
    })
    .join(' ');
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment