Skip to content

Instantly share code, notes, and snippets.

@Invertisment
Forked from WA9ACE/spider.js
Created March 22, 2016 14:32
Show Gist options
  • Save Invertisment/433c6f5d71d1b5bb50c0 to your computer and use it in GitHub Desktop.
Save Invertisment/433c6f5d71d1b5bb50c0 to your computer and use it in GitHub Desktop.
Node.js Web Crawler using Request and Cheerio
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');
// Queue of links waiting to be crawled; spider() appends to it and removes
// the URL it is currently working on.
var urlsToCrawl = [];

// Append-mode stream ('a' flag) that spider() writes each fetched page's HTML to.
var data = fs.createWriteStream('data.txt', { flags: 'a' });
// Normalized URLs that have already been queued or fetched, so that no page
// is crawled more than once.
var seenUrls = new Set();

/**
 * Turn an href found on a page into an absolute, crawlable URL.
 *
 * @param {*} href - Raw value of an anchor's href attribute (may be missing).
 * @param {string} [baseUrl] - URL of the page the link was found on.
 * @returns {?string} Absolute http(s) URL without its fragment, or null when
 *   the href is missing, malformed, or uses another scheme (mailto:, ...).
 */
var resolveLink = function(href, baseUrl) {
  if (typeof href !== 'string') {
    return null;
  }
  var resolved;
  try {
    resolved = new URL(href, baseUrl);
  } catch (error) {
    return null;
  }
  if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
    return null;
  }
  // "#section" links point at the same document, so the fragment is dropped.
  resolved.hash = '';
  return resolved.href;
};

/**
 * Crawl the URL at the head of the queue, or stop when the queue is empty.
 * Stopping here is what ends the crawl; without it the spider would keep
 * calling itself with `undefined` until the stack overflows.
 */
var crawlNext = function() {
  if (urlsToCrawl.length > 0) {
    spider(urlsToCrawl[0]);
  }
};

/**
 * Fetch `url`, append its HTML to the `data` stream, queue every new
 * http(s) link found on the page, then move on to the next queued URL.
 *
 * Links are resolved against `url`, so relative paths become absolute, and
 * each distinct URL is queued at most once.
 *
 * @param {string} url - Absolute URL of the page to crawl.
 * @returns {undefined}
 */
var spider = function(url) {
  // Remove every queued copy of the URL we are about to crawl.
  var index = urlsToCrawl.indexOf(url);
  while (index > -1) {
    urlsToCrawl.splice(index, 1);
    index = urlsToCrawl.indexOf(url);
  }
  seenUrls.add(resolveLink(url) || url);

  try {
    request(url, function(error, response, body) {
      if (error || response.statusCode !== 200) {
        var reason = error ? error.message : 'HTTP ' + response.statusCode;
        console.error('Skipping url: ' + url + ' (' + reason + ')');
        return crawlNext();
      }

      var $ = cheerio.load(body);
      data.write($.html());
      console.log('Data saved for url: ' + url);

      $('a').each(function(i, element) {
        var link = resolveLink(element.attribs.href, url);
        if (link !== null && !seenUrls.has(link)) {
          seenUrls.add(link);
          urlsToCrawl.push(link);
        }
      });

      return crawlNext();
    });
  } catch (error) {
    // request() throws synchronously when handed something that is not a
    // valid URI; report it and carry on with the rest of the queue.
    console.error('Skipping url: ' + url + ' (' + error.message + ')');
    return crawlNext();
  }
};
// Kick off the crawl with the seed URL; spider() continues from the links it queues.
spider('https://news.ycombinator.com/');
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment