Skip to content

Instantly share code, notes, and snippets.

@amoilanen
Last active March 24, 2022 03:14
Show Gist options
  • Save amoilanen/3848638 to your computer and use it in GitHub Desktop.
Save amoilanen/3848638 to your computer and use it in GitHub Desktop.
Simple PhantomJS-based web crawler library
//PhantomJS http://phantomjs.org/ based web crawler Anton Ivanov [email protected] 2012
//UPDATE: This gist has been turned into a Node.js module that can be installed with "npm install js-crawler".
//The Node.js version does not use PhantomJS, but the API it offers to clients is similar to the one in this gist.
(function(host) {
function Crawler() {
this.visitedURLs = {};
};
Crawler.webpage = require('webpage');
Crawler.prototype.crawl = function (url, depth, onSuccess, onFailure) {
if (0 == depth || this.visitedURLs[url]) {
return;
};
var self = this;
var page = Crawler.webpage.create();
page.open(url, function (status) {
if ('fail' === status) {
onFailure({
url: url,
status: status
});
} else {
var documentHTML = page.evaluate(function () {
return document.body && document.body.innerHTML ? document.body.innerHTML : "";
});
self.crawlURLs(self.getAllURLs(page), depth - 1, onSuccess, onFailure);
self.visitedURLs[url] = true;
onSuccess({
url: url,
status: status,
content: documentHTML
});
};
});
};
Crawler.prototype.getAllURLs = function(page) {
return page.evaluate(function () {
return Array.prototype.slice.call(document.querySelectorAll("a"), 0)
.map(function (link) {
return link.getAttribute("href");
});
});
};
Crawler.prototype.crawlURLs = function(urls, depth, onSuccess, onFailure) {
var self = this;
urls.filter(function (url) {
return Crawler.isTopLevelURL(url);
}).forEach(function (url) {
self.crawl(url, depth, onSuccess, onFailure);
});
};
Crawler.isTopLevelURL = function(url) {
return 0 == url.indexOf("http");
};
host.Crawler = Crawler;
})(phantom);
// Demo: crawl the PhantomJS quick-start page two levels deep and print one
// line to the console per page that was loaded or that failed to load.
(function () {
  var startURL = "https://github.com/ariya/phantomjs/wiki/Quick-Start";
  var maxDepth = 2;

  // Prints the URL, the size of the captured HTML and the load status.
  function reportLoaded(page) {
    var message = "Loaded page. URL = " + page.url;
    message += " content length = " + page.content.length;
    message += " status = " + page.status;
    console.log(message);
  }

  // Prints the URL and the load status of a page that could not be opened.
  function reportFailed(page) {
    var message = "Could not load page. URL = " + page.url;
    message += " status = " + page.status;
    console.log(message);
  }

  var crawler = new phantom.Crawler();
  crawler.crawl(startURL, maxDepth, reportLoaded, reportFailed);
})();
@brandondrew
Copy link

Any reason you switched away from PhantomJS?

@amjathk
Copy link

amjathk commented Jul 29, 2016

how to use this code in phantomjs

@amoilanen
Copy link
Author

Any reason you switched away from PhantomJS?

PhantomJS may be a bit heavier to use than just issuing HTTP requests over the network. However, PhantomJS could perhaps be supported as an alternative underlying request-making tool for js-crawler: https://github.com/antivanov/js-crawler

@amoilanen
Copy link
Author

amoilanen commented Nov 25, 2016

how to use this code in phantomjs

You can run phantomjs webcrawler.js

@ariemeow
Copy link

ariemeow commented Feb 1, 2017

When will the crawler stop? Or does it never end?

#sorry for my bad english

@guiyang882
Copy link

hi,
This program can render content that is produced by JS code and is not in the HTML itself!
I found that a lot of pages need to be rendered before the useful information is loaded into the HTML!

thx !

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment