Skip to content

Instantly share code, notes, and snippets.

@richard512
Last active August 29, 2015 14:23
Show Gist options
  • Save richard512/d955bc6f1d637ea06bee to your computer and use it in GitHub Desktop.
Save richard512/d955bc6f1d637ea06bee to your computer and use it in GitHub Desktop.
reddit comment word frequency analysis in javascript
/**
 * Counts how often each word occurs in a piece of text.
 *
 * The text is lowercased and trimmed, commas, semicolons and periods are
 * removed, and the remainder is split on runs of whitespace and/or slashes.
 * Words on the built-in ignore list and empty tokens are not counted.
 *
 * @param {string} sWords - The text to analyse.
 * @returns {Array<{word: string, freq: number}>|boolean} One entry per
 *   distinct word, in order of first appearance, or `false` when the input
 *   is empty or not a string.
 */
function wordFreq(sWords) {
  // make sure input looks right
  if (typeof sWords !== 'string' || !sWords) return false;

  // wordFreq() will ignore these words:
  var ignore = [
    'and', 'the', 'to', 'a', 'of', 'for', 'as', 'i', 'with', 'it',
    'is', 'on', 'that', 'this', 'can', 'in', 'be', 'has', 'if'
  ];

  // converts to lowercase. trims leading and trailing spaces
  // removes all commas, semicolons, and periods
  // converts string to array of words, split by whitespace or slash
  var tokens = sWords
    .toLowerCase()
    .trim()
    .replace(/[,;.]/g, '')
    .split(/[\s\/]+/);

  var wordfreqs = [];
  // Maps each word to its index in wordfreqs so a repeated word is found
  // without rescanning the list. Created without a prototype so words such
  // as "constructor" cannot collide with inherited properties.
  var positions = Object.create(null);

  for (var i = 0; i < tokens.length; i += 1) {
    var token = tokens[i];
    // Stripping punctuation can leave empty tokens behind (e.g. "hello .");
    // those are not words.
    if (token === '' || ignore.indexOf(token) >= 0) continue;

    var position = positions[token];
    if (position === undefined) {
      // new word:
      positions[token] = wordfreqs.length;
      wordfreqs.push({
        word: token,
        freq: 1
      });
    } else {
      // duplicate word:
      wordfreqs[position].freq += 1;
    }
  }
  return wordfreqs;
}
/**
 * Finds the first element whose property `attr` is strictly equal to `value`.
 *
 * @param {Array<Object>} array - The elements to search.
 * @param {string} attr - Name of the property to compare.
 * @param {*} value - The value to look for (compared with ===).
 * @returns {number|undefined} Index of the first match, or `undefined` when
 *   no element matches.
 */
function findWithAttr(array, attr, value) {
  var position = 0;
  while (position < array.length) {
    var candidate = array[position];
    if (candidate[attr] === value) {
      return position;
    }
    position += 1;
  }
  // No match: callers test for `undefined`, not -1.
  return undefined;
}
/**
 * Sorts an array of objects in place by the value of one property.
 *
 * @param {Array<Object>} array - The objects to sort; this array is
 *   reordered in place.
 * @param {string} key - Name of the property to sort by.
 * @param {string} [order] - 'numasc' for ascending; 'numdesc' or any other
 *   value (including omitted) for descending.
 * @returns {Array<Object>} The same array instance, now sorted.
 */
function sortByKey(array, key, order) {
  // Declared locally; assigning to an undeclared name would leak a global
  // and throws in strict mode.
  var compare;
  switch (order) {
    case 'numasc': // numeric, ascending
      compare = function (x, y) {
        return ((x < y) ? -1 : ((x > y) ? 1 : 0));
      };
      break;
    case 'numdesc': // numeric, descending
    default: // anything unrecognised falls back to descending
      compare = function (x, y) {
        return ((x > y) ? -1 : ((x < y) ? 1 : 0));
      };
      break;
  }
  return array.sort(function (a, b) {
    return compare(a[key], b[key]);
  });
}
/**
 * Comparator for Array.prototype.sort that orders numbers ascending.
 *
 * @param {number} a - First value.
 * @param {number} b - Second value.
 * @returns {number} Negative when a < b, positive when a > b, zero when equal.
 */
function sortNumber(a, b) {
  var difference = a - b;
  return difference;
}
/**
 * Finds the ten most frequently used words in the highest-scoring comments
 * on the current page.
 *
 * Scores are read from `.score.unvoted` elements via jQuery (`$`). The score
 * at the `toppercent` position of the descending score list becomes the
 * cutoff, and the text of every `.usertext-body` inside an entry whose score
 * is at or above the cutoff is analysed with wordFreq().
 *
 * @param {number} toppercent - Percentage (0-100) of the best-scoring
 *   comments to include.
 * @returns {Array<Array>} Up to ten `[freq, word]` pairs, most frequent
 *   first; an empty array when no comment qualifies.
 */
function getTopRedditCommentWords(toppercent) {
  // Parses the text of a score element (e.g. "12 points") into a number.
  // Yields NaN when the text does not start with a number.
  function readScore($score) {
    return parseInt($score.text().replace(' points', ''), 10);
  }

  var scores = $('.sitetable .thing .score.unvoted').map(function () {
    var score = readScore($(this));
    // jQuery's .map() drops null results; this keeps unparseable scores
    // from corrupting the numeric sort below.
    return isNaN(score) ? null : score;
  }).get().sort(function (a, b) {
    return b - a; // descending
  });

  var cutoffindex = Math.round(scores.length * (toppercent / 100));
  // Undefined when cutoffindex is 0; no comment then passes the test below.
  var cutoffscore = scores.slice(0, cutoffindex).pop();
  console.log(cutoffscore);

  var comments = [];
  $('.sitetable .entry').each(function () {
    var score = readScore($(this).find('.score.unvoted'));
    if (score >= cutoffscore) {
      $(this).find('.usertext-body').each(function () {
        var comment = $(this).text().trim();
        if (comment) comments.push(comment);
      });
    }
  });
  console.log(comments);

  var doctext = comments.join('\n');
  var wordfreqs = wordFreq(doctext);
  // wordFreq() returns false for empty text; there is nothing to rank then.
  if (!wordfreqs) return [];

  wordfreqs = sortByKey(wordfreqs, 'freq', 'numdesc').slice(0, 10);

  var topten = [];
  for (var i = 0; i < wordfreqs.length; i += 1) {
    topten.push([wordfreqs[i].freq, wordfreqs[i].word]);
  }
  return topten;
}
// gets the words most frequently used in
// the top 20% of comments on the current page
// and shows them as one "freq,word" line per word
var topten = getTopRedditCommentWords(20);
alert(topten.join('\n'));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment