Skip to content

Instantly share code, notes, and snippets.

@NickTomlin
Created June 25, 2013 15:14
Show Gist options
  • Save NickTomlin/5859287 to your computer and use it in GitHub Desktop.
Simple Scraper using node-crawler
var Crawler = require('crawler').Crawler;
var fs = require("fs");

// In-memory cache of scraped posts, keyed by post title.
var pageCache = {};

// Output stream for scraped post bodies.
// NOTE: `var` added — the original assigned `write` without a declaration,
// creating an implicit global (and a ReferenceError in strict mode).
var write = fs.createWriteStream('crawled.txt');
write.on('error', function (err) {
  // Log stream errors (e.g. permission denied) instead of crashing silently.
  console.log(err);
});
/**
 * Crawler callback for an individual post page: extracts the title and
 * body HTML, logs them, caches them by title, and appends the body to
 * the output file.
 *
 * @param {Error|null} error - crawl error, if any (not handled here)
 * @param {Object} result - node-crawler response object (unused)
 * @param {Function} $ - server-side DOM/selector handle for the fetched page
 */
var posts = function (error, result, $) {
  // `var` added — the originals were implicit globals, so concurrent
  // callbacks (maxConnections: 10) could clobber each other's values.
  var $title = $('.entry-title').text();
  var $content = $('.entry-content').html();
  console.log("====" + $title + "====");
  console.log($content);
  pageCache[$title] = $content;
  // .html() yields null when '.entry-content' matches nothing;
  // writing null to a stream throws, so guard it.
  if ($content != null) {
    write.write($content);
  }
};
/**
 * Crawler callback for a blog index page: queues every post link on the
 * page with the `posts` callback, and queues the "previous posts" pager
 * link (if present) back onto this same callback to walk the archive.
 *
 * @param {Error|null} error - crawl error, if any (not handled here)
 * @param {Object} result - node-crawler response object (unused)
 * @param {Function} $ - server-side DOM/selector handle for the fetched page
 */
var pages = function (error, result, $) {
  // Queue every anchor in `selection` with the given callback.
  // NOTE(review): `a.href` assumes $ hands back DOM-like elements
  // (true for jsdom-backed node-crawler of this era) — confirm if the
  // crawler version changes, otherwise use $(a).attr('href').
  function enqueue(selection, callback) {
    selection.each(function (index, a) {
      c.queue([{
        "uri": a.href,
        "callback": callback
      }]);
    });
  }

  // `var` added — these were implicit globals in the original, racy
  // across the 10 concurrent connections.
  var $next = $('.nav-previous a');
  var $posts = $(".type-post .more-link");

  enqueue($next, pages);   // follow pagination
  enqueue($posts, posts);  // scrape each post
};
// Crawler instance: up to 10 simultaneous connections; index pages are
// handled by the `pages` callback, which fans out into `posts`.
var c = new Crawler({
  maxConnections: 10,
  callback: pages
});

// Seed the crawl with the blog's front page.
c.queue("http://addyosmani.com/blog/");
/*
for each top level page, grab all the posts on the page,
AND grab the next page link, if it exists.
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment