Last active
September 10, 2018 06:52
-
-
Save martincharlesworth/a43c2b45b138049b5831 to your computer and use it in GitHub Desktop.
PhantomJS crawler written to detect Mixed Content
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Shared crawl state (plain `var`s: the script targets PhantomJS 1.9.x, which is ES5-only).
// Every distinct http(s) resource URL requested by any crawled page.
var uniqUrls = [];
// Queue of page URLs still waiting to be opened.
var urlsToBrowse = [];
// Page URLs that have already been opened (successfully or not).
var browsedUrls = [];
function open(url, callback) { | |
var page = require('webpage').create(); | |
page.settings.loadImages = true; | |
page.onResourceReceived = function (response) { | |
if (response.stage == "start" && response.url.substr(0, 4) === "http" && uniqUrls.indexOf(response.url) === -1) { | |
uniqUrls.push(response.url); | |
} | |
} | |
page.open(url, function(status) { | |
browsedUrls.push(url); | |
if (status !== "success") { | |
if (url === args[1]) { | |
console.log("Couldn't open " + url); | |
phantom.exit(1); | |
} | |
else { | |
console.log("fail " + url); | |
} | |
} | |
else { | |
var uniqHrefs = page.evaluate(function() { | |
var uniqHrefs = []; | |
var l = document.links; | |
for(var i=0; i<l.length; i++) { | |
var href = l[i].getAttribute("href"); | |
if (href && href.length > 1 && href.charAt(0) == '/' && href.charAt(1) != '/') { | |
href = href.replace(/\/$/, ''); | |
if (uniqHrefs.indexOf(href) === -1) { | |
uniqHrefs.push(href); | |
} | |
} | |
} | |
return uniqHrefs; | |
}); | |
if (uniqHrefs) { | |
uniqHrefs.forEach(function(href) { | |
var url_without_path = url.split("/").slice(0,3).join("/"); | |
var urlFromHref = url_without_path + href; | |
if (browsedUrls.indexOf(urlFromHref) === -1 && urlsToBrowse.indexOf(urlFromHref) === -1) { | |
urlsToBrowse.push(urlFromHref); | |
} | |
}); | |
} | |
} | |
page.close(); | |
callback.apply(); | |
}); | |
} | |
/**
 * Performs one step of the crawl.
 *
 * When `urlsToBrowse` is empty, prints every collected resource URL in
 * sorted order as "uniq URL" and requests exit code 0. Otherwise removes the
 * next URL from the front of the queue, prints "open URL", and opens it,
 * passing itself as the continuation so the next step runs when that page
 * is done.
 */
function crawl() {
    if (urlsToBrowse.length === 0) {
        uniqUrls.sort();
        uniqUrls.forEach(function (url) {
            console.log("uniq " + url);
        });
        phantom.exit(0);
        return;
    }

    var url = urlsToBrowse.shift();
    console.log("open " + url);
    open(url, crawl);
}
// Entry point: args[0] is the script name, args[1] the start URL.
var args = require('system').args;
if (args.length < 2) {
    console.log('Please specify a URL.');
    phantom.exit(1);
}
else {
    // Only start crawling when a URL was actually supplied; previously the
    // statements below ran even after exit(1) had been requested, queueing
    // `undefined` as the start URL.
    urlsToBrowse.push(args[1]);
    crawl();
}
Updated to handle broken links and remove the unnecessary jQuery injection.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Developed using phantomjs version 1.9.7.
Usage:
phantomjs crawluniq.js HTTPS_URL
All pages opened will be output as:
open URL
Once the crawl is complete, all unique resources requested will be output as:
uniq URL
You can then
grep '^uniq http\:'
to find the Mixed Content, but it should be at the top of the uniq list anyway, since the list is output in alphabetical order and "http:" sorts before "https:".