Skip to content

Instantly share code, notes, and snippets.

@pieplu
Last active May 22, 2017 02:22
Show Gist options
  • Save pieplu/a19d925c31a1d1d41b7226f8186af48b to your computer and use it in GitHub Desktop.
Save pieplu/a19d925c31a1d1d41b7226f8186af48b to your computer and use it in GitHub Desktop.
var casper = require("casper").create({
pageSettings: {
loadImages: false,
loadPlugins: false
}
});
var checked = [];
var currentLink = 0;
var fs = require('fs');
var upTo = ~~casper.cli.get('max-depth') || 100;
var url = casper.cli.get(0);
var baseUrl = url;
var links = [url];
var utils = require('utils');
var f = utils.format;
function absPath(url, base) {
return new URI(url).resolve(new URI(base)).toString();
}
// Clean links
function cleanLinks(urls, base) {
return utils.unique(urls).filter(function(url) {
return url.indexOf(baseUrl) === 0 || !new RegExp('^(#|ftp|javascript|http)').test(url);
}).map(function(url) {
return absPath(url, base);
}).filter(function(url) {
return checked.indexOf(url) === -1;
});
}
// Opens the page, perform tests and fetch next links
function crawl(link) {
this.start().then(function() {
this.echo(link, 'COMMENT');
this.open(link);
checked.push(link);
});
this.then(function() {
if (this.currentHTTPStatus === 404) {
this.warn(link + ' is missing (HTTP 404)');
} else if (this.currentHTTPStatus === 500) {
this.warn(link + ' is broken (HTTP 500)');
} else {
this.echo(link + f(' is okay (HTTP %s)', this.currentHTTPStatus));
}
});
this.then(function() {
var newLinks = searchLinks.call(this);
links = links.concat(newLinks).filter(function(url) {
return checked.indexOf(url) === -1;
});
this.echo(newLinks.length + " new links found on " + link);
});
}
// Fetch all <a> elements from the page and return
// the ones which contains a href starting with 'http://'
function searchLinks() {
return cleanLinks(this.evaluate(function _fetchInternalLinks() {
return [].map.call(__utils__.findAll('a[href]'), function(node) {
return node.getAttribute('href');
});
}), this.getCurrentUrl());
}
// As long as it has a next link, and is under the maximum limit, will keep running
function check() {
if (links[currentLink] && currentLink < upTo) {
crawl.call(this, links[currentLink]);
currentLink++;
this.run(check);
} else {
this.echo("All done, " + checked.length + " links checked.");
this.exit();
}
}
if (!url) {
casper.warn('No url passed, aborting.').exit();
}
casper.start('https://gist.githubusercontent.com/pieplu/6be55d1d8f27ea9b8dcf4de6b1933547/raw/5e8d996fe1d86edac93825d0050fd359caf74f8e/URI.js', function() {
var scriptCode = this.getPageContent() + '; return URI;';
window.URI = new Function(scriptCode)();
if (typeof window.URI === "function") {
this.echo('URI.js loaded');
} else {
this.warn('Could not setup URI.js').exit();
}
}).then(function() {
this.echo("Starting");
}).run(check);
@pieplu
Copy link
Author

pieplu commented May 22, 2017

Hi @PierreAmmeloot Sorry I see your message only now...

I bet, it's because the script need URI.js. But I change that to use directly the gist. Can you retry with this new version?

(Note, I will make a PR for casperjs doc)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment