@krisanalfa
Created January 17, 2017 05:16
Wordnet
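A small Node.js script that requests the Google search results for a query, downloads each result page, and prints a table of the most frequent words found across those pages.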
// npm install "i" "cheerio" "stopword" "console.table" "request"
// Configuration
const limit = 20
const lowest = 3
const highest = 20
const query = 'webcrawl'
const url = 'https://google.com/search?q=' + query
// Dependency
const inflect = require('i')()
const sw = require('stopword')
const request = require('request')
const cheerio = require('cheerio')
// Adds console.table method that prints an array
// of objects as a table in console
require('console.table')
// Dynamic variable
var corpus = {},
    totalResults = 0,
    resultsDownloaded = 0
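// Runs after each result page has been processed; once every page is done,
// it sorts the corpus and prints the most frequent words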
function callback () {
  resultsDownloaded++
  if (resultsDownloaded !== totalResults) {
    return
  }
  var words = []
  // stick all words in an array
  for (var prop in corpus) {
    words.push({
      word: prop,
      count: corpus[prop]
    })
  }
  // sort the array so the most frequent words come first
  words.sort(function (first, next) {
    return next.count - first.count
  })
  // finally, log the most frequent words (up to `limit` of them)
  var message = 'Top ' + limit + ' words appearing on "' + query + '"'
  console.log('\r\n' + Array(message.length + 1).join('-'))
  console.table(message, words.slice(0, limit))
}
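// Fetch the Google results page, then download each linked result
// and count the words that appear on it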
console.log('Requesting Google Search Result for: %s', url)
request(url, function (error, response, body) {
  if (error) {
    console.log("Couldn't get page because of error: %s", error)
    return
  }
  // load the body of the page into Cheerio so we can traverse the DOM
  const $ = cheerio.load(body),
    links = $('.r a')
  links.each(function (i, link) {
    // get the href attribute of each link
    var url = $(link).attr('href')
    // strip out unnecessary junk
    url = url.replace('/url?q=', '').split('&')[0]
    if (url.charAt(0) === '/') {
      return
    }
    // this link counts as a result, so increment results
    totalResults++
    console.log('Requesting a Google Search Result WebPage: %s', url)
    // download that page
    request(url, function (error, response, body) {
      if (error) {
        console.log("Couldn't get page because of error: %s", error)
        return
      }
      console.log('WebPage %s is ready. Processing words on WebPage.', url)
      // load the page into cheerio
      var $page = cheerio.load(body),
        text = $page('body').text(),
        words = []
      // throw away extra whitespace and non-alphanumeric characters
      text = text.replace(/\s+/g, ' ')
        .replace(/[^a-zA-Z ]/g, '')
        .toLowerCase()
      // split on spaces for a list of all the words on that page,
      // drop common stopwords, then loop through what is left
      words = sw.removeStopwords(text.split(' '))
      words.forEach(function (word) {
        // skip the query terms themselves
        if (query.split('+').indexOf(word) >= 0) {
          return
        }
        word = inflect.singularize(word)
        // we don't want to include very short or long words, as they're
        // probably bad data
        if (word.length <= lowest || word.length >= highest) {
          return
        }
        if (corpus[word]) {
          // if this word is already in our 'corpus', our collection
          // of terms, increase the count by one
          corpus[word]++
        } else {
          // otherwise, say that we've found one of that word so far
          corpus[word] = 1
        }
      })
      // and when this request is completed, call the callback to wrap up!
      callback.call(this)
    })
  })
})
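To try the script, save it under any name (for example wordnet.js, which is only a suggestion), install the dependencies listed in the first comment with npm install i cheerio stopword console.table request, and run it with node wordnet.js. It logs each page it fetches and finishes with a table of the top 20 most frequent words found across the first page of Google results for the query. Note that Google's result markup (the .r a selector) changes over time, so the selector may need adjusting.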