Skip to content

Instantly share code, notes, and snippets.

@martincharlesworth
Last active September 10, 2018 06:52
Show Gist options
  • Save martincharlesworth/a43c2b45b138049b5831 to your computer and use it in GitHub Desktop.
Save martincharlesworth/a43c2b45b138049b5831 to your computer and use it in GitHub Desktop.
PhantomJS crawler written to detect Mixed Content
// Shared crawl state, read and updated by open() and crawl().

// URLs of pages that have already been opened, so they are not queued again.
var browsedUrls = [];
// Queue of page URLs still waiting to be opened (consumed from the front).
var urlsToBrowse = [];
// Distinct resource URLs seen while loading pages; printed when the crawl ends.
var uniqUrls = [];
function open(url, callback) {
var page = require('webpage').create();
page.settings.loadImages = true;
page.onResourceReceived = function (response) {
if (response.stage == "start" && response.url.substr(0, 4) === "http" && uniqUrls.indexOf(response.url) === -1) {
uniqUrls.push(response.url);
}
}
page.open(url, function(status) {
browsedUrls.push(url);
if (status !== "success") {
if (url === args[1]) {
console.log("Couldn't open " + url);
phantom.exit(1);
}
else {
console.log("fail " + url);
}
}
else {
var uniqHrefs = page.evaluate(function() {
var uniqHrefs = [];
var l = document.links;
for(var i=0; i<l.length; i++) {
var href = l[i].getAttribute("href");
if (href && href.length > 1 && href.charAt(0) == '/' && href.charAt(1) != '/') {
href = href.replace(/\/$/, '');
if (uniqHrefs.indexOf(href) === -1) {
uniqHrefs.push(href);
}
}
}
return uniqHrefs;
});
if (uniqHrefs) {
uniqHrefs.forEach(function(href) {
var url_without_path = url.split("/").slice(0,3).join("/");
var urlFromHref = url_without_path + href;
if (browsedUrls.indexOf(urlFromHref) === -1 && urlsToBrowse.indexOf(urlFromHref) === -1) {
urlsToBrowse.push(urlFromHref);
}
});
}
}
page.close();
callback.apply();
});
}
/**
 * Drives the crawl one page at a time.
 *
 * While pages remain queued, takes the next one off the front of
 * `urlsToBrowse`, announces it as "open URL" and hands it to open(), passing
 * itself as the continuation. Once the queue is empty, prints every collected
 * resource URL in sorted order as "uniq URL" and exits with status 0.
 */
function crawl() {
    if (urlsToBrowse.length > 0) {
        var nextUrl = urlsToBrowse.shift();
        console.log("open " + nextUrl);
        open(nextUrl, crawl);
        return;
    }

    // Queue drained: report the collected resources and finish.
    uniqUrls.sort();
    for (var i = 0; i < uniqUrls.length; i++) {
        console.log("uniq " + uniqUrls[i]);
    }
    phantom.exit(0);
}
// Entry point. args[0] is the script name, so the start URL is args[1].
// `args` stays a script-level variable because open() compares against args[1].
var args = require('system').args;
if (args.length < 2) {
    console.log('Please specify a URL.');
    phantom.exit(1);
}
else {
    // Only start crawling when a URL was supplied. Without this else branch
    // the script would carry on after phantom.exit(1), queue `undefined` and
    // log "open undefined" before the process actually terminates.
    urlsToBrowse.push(args[1]);
    crawl();
}
@martincharlesworth
Copy link
Author

Developed using phantomjs version 1.9.7.

Usage:
phantomjs crawluniq.js HTTPS_URL

All pages opened will be output as:
open URL

Once the crawl is complete, all unique resources requested will be output as:
uniq URL

You can then grep for '^uniq http:' to find the Mixed Content, although it should be at the top of the uniq list anyway: the list is sorted alphabetically, so http: URLs come before https: ones.

@martincharlesworth
Copy link
Author

Updated to handle broken links and to remove the unnecessary jQuery injection.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment