-
-
Save mrkpatchaa/7b9a30fcf81880986dbe to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var request = require("request"), | |
cheerio = require("cheerio"), | |
url = "https://www.google.com/search?q=data+mining", | |
corpus = {}, | |
totalResults = 0, | |
resultsDownloaded = 0; | |
/**
 * Completion hook, invoked once per finished result-page download.
 *
 * Increments the shared `resultsDownloaded` counter and does nothing else
 * until it equals `totalResults`. On that final call it turns the shared
 * `corpus` (word -> count) into a list sorted by descending count and logs
 * the twenty most frequent entries to the console.
 */
function callback () {
    resultsDownloaded++;

    // still waiting for other downloads to finish
    if (resultsDownloaded !== totalResults) {
        return;
    }

    // stick all words in an array; Object.keys only visits the corpus' own
    // entries and avoids leaking a loop variable into the global scope
    var words = Object.keys(corpus).map(function (word) {
        return {
            word: word,
            count: corpus[word]
        };
    });

    // sort array based on how often they occur (most frequent first)
    words.sort(function (a, b) {
        return b.count - a.count;
    });

    // finally, log the twenty most popular words
    console.log(words.slice(0, 20));
}
// Kick off the crawl: download the search results page, follow every result
// link found on it, and tally the words of each downloaded page into `corpus`.
// `callback` is invoked once per followed link, whether or not its download
// succeeded, so the completion count can always reach `totalResults`.
request(url, function (error, response, body) {
    if (error) {
        console.log("Couldn't get page because of error: " + error);
        return;
    }

    // load the body of the page into Cheerio so we can traverse the DOM
    var $ = cheerio.load(body),
        links = $(".r a");

    links.each(function (i, link) {
        // get the href attribute of each link
        var href = $(link).attr("href");

        // an anchor without an href has nothing to follow (and calling
        // .replace() on undefined would throw)
        if (!href) {
            return;
        }

        // strip out unnecessary junk: the "/url?q=" redirect prefix and any
        // query parameters appended after the first "&"
        href = href.replace("/url?q=", "").split("&")[0];

        // skip empty targets and links that still start with "/"
        // (presumably links back into the search site itself)
        if (href === "" || href.charAt(0) === "/") {
            return;
        }

        // this link counts as a result, so increment results
        totalResults++;

        // download that page
        request(href, function (error, response, body) {
            if (error) {
                console.log("Couldn't get page because of error: " + error);
                // a failed download is still a finished download; without
                // this the final report would never be printed
                callback();
                return;
            }

            // load the page into cheerio
            var $page = cheerio.load(body),
                text = $page("body").text();

            // throw away extra whitespace and non-alphanumeric characters
            text = text.replace(/\s+/g, " ")
                .replace(/[^a-zA-Z ]/g, "")
                .toLowerCase();

            // split on spaces for a list of all the words on that page and
            // loop through that list
            text.split(" ").forEach(function (word) {
                // we don't want to include very short or long words, as
                // they're probably bad data
                if (word.length < 4 || word.length > 20) {
                    return;
                }

                // own-property check: a plain truthiness test would be fooled
                // by inherited keys such as "constructor"
                if (Object.prototype.hasOwnProperty.call(corpus, word)) {
                    // already in our "corpus", our collection of terms:
                    // increase the count by one
                    corpus[word]++;
                } else {
                    // otherwise, say that we've found one of that word so far
                    corpus[word] = 1;
                }
            });

            // and when our request is completed, call the callback to wrap up!
            callback();
        });
    });
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment