Created
January 17, 2017 05:16
-
-
Save krisanalfa/9cab4c2d327976dda4cf176297167d32 to your computer and use it in GitHub Desktop.
Wordnet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// npm install "i" "request" "cheerio" "stopword" "console.table"
// Configuration
// Maximum number of rows shown in the final "top words" table.
const limit = 20
// Exclusive lower bound: words whose length is <= lowest are discarded.
const lowest = 3
// Exclusive upper bound: words whose length is >= highest are discarded.
const highest = 20
// Search terms. Several terms are joined with '+'; each individual term is
// excluded from the word tally.
const query = 'webcrawl'
// Search-results page that seeds the crawl.
const url = 'https://google.com/search?q=' + query
// Dependency
const inflect = require('i')()
const sw = require('stopword')
const request = require('request')
const cheerio = require('cheerio')
// Adds console.table method that prints an array
// of objects as a table in console
require('console.table')
// Dynamic variable
// Word -> occurrence count, shared by every page download. It is created
// without a prototype so that scraped words such as "constructor" or
// "tostring" cannot collide with members inherited from Object.prototype
// (which would be truthy on lookup and turn into NaN when incremented).
const corpus = Object.create(null)
// Number of result pages that were queued for download.
let totalResults = 0
// Number of result pages whose download has finished so far.
let resultsDownloaded = 0
/**
 * Completion hook, called once for every result page that has finished.
 *
 * Increments `resultsDownloaded` and returns immediately until that counter
 * equals `totalResults`. On the final call it turns `corpus` into a list of
 * `{ word, count }` records, sorts it by descending count and prints the
 * first `limit` entries through `console.table` under a title line.
 *
 * Reads the file-level `corpus`, `totalResults`, `limit` and `query`;
 * mutates the file-level `resultsDownloaded`. Returns nothing.
 */
function callback () {
  resultsDownloaded++
  // Other downloads are still pending: nothing to report yet.
  if (resultsDownloaded !== totalResults) {
    return
  }

  // Stick all words in an array. Object.keys avoids the undeclared loop
  // variable of the former `for (prop in corpus)`, which leaked a global
  // (and throws a ReferenceError in strict mode).
  var words = Object.keys(corpus).map(function (word) {
    return {
      word: word,
      count: corpus[word]
    }
  })

  // Sort array based on how often they occur, most frequent first.
  words.sort(function (first, next) {
    return next.count - first.count
  })

  // Finally, log the most popular words, preceded by a dashed rule that is
  // exactly as wide as the title.
  var message = 'Top ' + limit + ' appear words on "' + query + '"'
  console.log('\r\n' + Array(message.length + 1).join('-'))
  console.table(message, words.slice(0, limit))
}
// Individual terms of the search query. They are excluded from the tally;
// split once here instead of once per processed word.
const queryTerms = query.split('+')

/**
 * Turns the href of a search-result anchor into the URL of the target page.
 *
 * Removes the '/url?q=' redirect prefix and everything from the first '&'
 * onwards.
 *
 * @param {string|undefined} href - raw href attribute; may be missing.
 * @returns {string|null} the target URL, or null when the anchor has no
 *   href, the result is empty, or it is a site-relative link (starts
 *   with '/').
 */
function extractResultUrl (href) {
  if (typeof href !== 'string') {
    return null
  }
  // strip out unnecessary junk
  const target = href.replace('/url?q=', '').split('&')[0]
  if (target === '' || target.charAt(0) === '/') {
    return null
  }
  return target
}

/**
 * Adds the words found in one page's text to the shared `corpus` tally.
 *
 * Whitespace runs are collapsed, every character other than a-z, A-Z and
 * space is dropped, and the text is lower-cased before it is split on
 * spaces. Stop words and the query's own terms are skipped, each remaining
 * word is singularized, and words whose length is <= `lowest` or
 * >= `highest` are ignored as probable bad data.
 *
 * @param {string} text - text content of the page body.
 */
function tallyWords (text) {
  const cleaned = text.replace(/\s+/g, ' ')
    .replace(/[^a-zA-Z ]/g, '')
    .toLowerCase()

  sw.removeStopwords(cleaned.split(' ')).forEach(function (rawWord) {
    if (queryTerms.indexOf(rawWord) >= 0) {
      return
    }
    const word = inflect.singularize(rawWord)
    // we don't want to include very short or long words, as they're
    // probably bad data
    if (word.length <= lowest || word.length >= highest) {
      return
    }
    // Own-property check rather than truthiness: a word such as
    // "constructor" must not be mistaken for an existing counter just
    // because the object inherits a member of that name.
    if (Object.prototype.hasOwnProperty.call(corpus, word)) {
      corpus[word]++
    } else {
      corpus[word] = 1
    }
  })
}

console.log('Requesting Google Search Result for: %s', url)
request(url, function (error, response, body) {
  if (error) {
    console.log("Couldn't get page because of error: %s", error)
    return
  }

  // load the body of the page into Cheerio so we can traverse the DOM
  // NOTE(review): the '.r a' selector depends on the markup of the results
  // page — confirm it still matches.
  const $ = cheerio.load(body)

  $('.r a').each(function (i, link) {
    const pageUrl = extractResultUrl($(link).attr('href'))
    if (pageUrl === null) {
      return
    }

    // this link counts as a result, so increment results
    totalResults++
    console.log('Requesting to a Google Search Result WebPage: %s', pageUrl)

    // download that page
    request(pageUrl, function (pageError, pageResponse, pageBody) {
      if (pageError) {
        console.log("Couldn't get page because of error: %s", pageError)
      } else {
        console.log('WebPage %s is ready. Processing words on WebPage.', pageUrl)
        tallyWords(cheerio.load(pageBody)('body').text())
      }
      // Report completion on failure as well as on success: the summary is
      // printed only once every counted page has reported in, so a single
      // failed download used to suppress the output entirely.
      callback()
    })
  })

  // Without any usable link no download ever completes, so say so instead
  // of exiting silently.
  if (totalResults === 0) {
    console.log('No result links found for: %s', url)
  }
})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment