Skip to content

Instantly share code, notes, and snippets.

@nguyentamvinhlong
Forked from amoilanen/webcrawler.js
Last active August 29, 2015 14:09
Show Gist options
  • Save nguyentamvinhlong/63419e5384814c707cfb to your computer and use it in GitHub Desktop.
Save nguyentamvinhlong/63419e5384814c707cfb to your computer and use it in GitHub Desktop.
// PhantomJS (http://phantomjs.org/) based web crawler. Anton Ivanov [email protected] 2012
// UPDATE: This gist has been made into a Node.js module and can now be installed with "npm install js-crawler".
// The Node.js version does not use PhantomJS, but the API available to the client is similar to that of the present gist.
(function(host) {
function Crawler() {
this.visitedURLs = {};
};
Crawler.webpage = require('webpage');
Crawler.prototype.crawl = function (url, depth, onSuccess, onFailure) {
if (0 == depth || this.visitedURLs[url]) {
return;
};
var self = this;
var page = Crawler.webpage.create();
page.open(url, function (status) {
if ('fail' === status) {
onFailure({
url: url,
status: status
});
} else {
var documentHTML = page.evaluate(function () {
return document.body && document.body.innerHTML ? document.body.innerHTML : "";
});
self.crawlURLs(self.getAllURLs(page), depth - 1, onSuccess, onFailure);
self.visitedURLs[url] = true;
onSuccess({
url: url,
status: status,
content: documentHTML
});
};
});
};
Crawler.prototype.getAllURLs = function(page) {
return page.evaluate(function () {
return Array.prototype.slice.call(document.querySelectorAll("a"), 0)
.map(function (link) {
return link.getAttribute("href");
});
});
};
Crawler.prototype.crawlURLs = function(urls, depth, onSuccess, onFailure) {
var self = this;
urls.filter(function (url) {
return Crawler.isTopLevelURL(url);
}).forEach(function (url) {
self.crawl(url, depth, onSuccess, onFailure);
});
};
Crawler.isTopLevelURL = function(url) {
return 0 == url.indexOf("http");
};
host.Crawler = Crawler;
})(phantom);
// Demo run: crawl the PhantomJS Quick-Start wiki page two levels deep and log
// the outcome of every page load to the console.
(function () {
  var startURL = "https://github.com/ariya/phantomjs/wiki/Quick-Start";
  var maxDepth = 2;

  // Logs URL, content length and status of a page that was loaded.
  function reportLoaded(page) {
    var message = "Loaded page. URL = " + page.url;
    message += " content length = " + page.content.length;
    message += " status = " + page.status;
    console.log(message);
  }

  // Logs URL and status of a page that could not be loaded.
  function reportFailed(page) {
    var message = "Could not load page. URL = " + page.url;
    message += " status = " + page.status;
    console.log(message);
  }

  var crawler = new phantom.Crawler();
  crawler.crawl(startURL, maxDepth, reportLoaded, reportFailed);
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment