Skip to content

Instantly share code, notes, and snippets.

@beenanner
Created May 29, 2017 04:40
Show Gist options
  • Save beenanner/310812658c74f90af3c390809aab393f to your computer and use it in GitHub Desktop.
Save beenanner/310812658c74f90af3c390809aab393f to your computer and use it in GitHub Desktop.
200 URL error on simple crawler
var Crawler = require("simplecrawler");
var cheerio = require("cheerio");
/**
 * Listener for "fetchcomplete": prints a fixed notice each time the event
 * fires. It ignores all event arguments.
 */
function logFetchedResource() {
    console.log("Fetched a resource!");
}

// Build the crawler for the sitemap URL and attach the notice listener.
// The chained `.on(...)` result is what gets stored in `crawler`.
var crawler = Crawler("http://www.which.co.uk/sitemap/")
    .on("fetchcomplete", logFetchedResource);
/**
 * Replacement link-discovery routine: parses the fetched document with
 * cheerio and collects the `href` of every anchor that has one.
 *
 * @param {Buffer} buffer - Raw body of the fetched resource; decoded as UTF-8.
 * @param {Object} queueItem - Queue entry for the resource (not used here).
 * @returns {string[]} The `href` attribute values, in document order.
 */
crawler.discoverResources = function(buffer, queueItem) {
    var markup = buffer.toString("utf8");
    var $ = cheerio.load(markup);
    var hrefs = [];

    // Only anchors that actually carry an href attribute are selected,
    // so every visited element contributes one value.
    $("a[href]").each(function () {
        hrefs.push($(this).attr("href"));
    });

    return hrefs;
};
// Debugging aid: keep a handle on the crawler's real `emit` and replace it
// with a wrapper that logs every event plus a snapshot of queue progress.
var originalEmit = crawler.emit;

/**
 * Logging wrapper around the crawler's original `emit`.
 *
 * For every emitted event it:
 *   1. asynchronously asks the queue how many items are fetched and how many
 *      items exist in total, then logs those counts together with the
 *      crawler's open request / open listener counters;
 *   2. synchronously logs the event name and a short label for its first
 *      argument;
 *   3. forwards the event, with all original arguments, to the real `emit`.
 *
 * @param {string} evtName - Name of the event being emitted.
 * @param {*} [queueItem] - First event argument. Its `url` is logged when it
 *     has a truthy one, otherwise the value itself (or null when falsy).
 * @returns {*} Whatever the original `emit` returns (for a Node.js
 *     EventEmitter: `true` if the event had listeners, `false` otherwise).
 *     Previously this value was dropped and the wrapper returned `undefined`.
 * @throws Rethrows any error reported by the queue callbacks. Those callbacks
 *     run outside this call, so the error is not catchable by the emitter's
 *     caller.
 */
crawler.emit = function(evtName, queueItem) {
    crawler.queue.countItems({ fetched: true }, function(err, completeCount) {
        if (err) {
            throw err;
        }

        crawler.queue.getLength(function(err, length) {
            if (err) {
                throw err;
            }

            console.log("fetched %d of %d — %d open requests, %d open listeners",
                completeCount,
                length,
                crawler._openRequests.length,
                crawler._openListeners);
        });
    });

    // Pick a compact label for the first event argument: prefer its URL,
    // fall back to the value itself, and use null when nothing was passed.
    var label = null;
    if (queueItem) {
        label = queueItem.url ? queueItem.url : queueItem;
    }
    console.log(evtName, label);

    // Forward every argument untouched and hand the emitter's result back,
    // so code relying on emit's return value keeps working.
    return originalEmit.apply(crawler, arguments);
};
// Second "fetchcomplete" listener: reports the URL and byte size of each
// fetched resource, followed by the content type from the response headers.
crawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
    var resourceUrl = queueItem.url;
    var byteCount = responseBuffer.length;
    console.log("I just received %s (%d bytes)", resourceUrl, byteCount);

    // Headers are read only after the first line has been printed.
    var contentType = response.headers["content-type"];
    console.log("It was a resource of type %s", contentType);
});

// All listeners and overrides are in place; start the crawler.
crawler.start();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment