Last active
September 10, 2018 06:52
-
-
Save martincharlesworth/a43c2b45b138049b5831 to your computer and use it in GitHub Desktop.
PhantomJS crawler written to detect Mixed Content
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Shared crawl state, read and written by open() and crawl().
var uniqUrls = [],      // every distinct http(s) resource URL reported while loading pages
    urlsToBrowse = [],  // queue of page URLs that still have to be opened
    browsedUrls = [];   // page URLs that open() has already attempted
function open(url, callback) { | |
var page = require('webpage').create(); | |
page.settings.loadImages = true; | |
page.onResourceReceived = function (response) { | |
if (response.stage == "start" && response.url.substr(0, 4) === "http" && uniqUrls.indexOf(response.url) === -1) { | |
uniqUrls.push(response.url); | |
} | |
} | |
page.open(url, function(status) { | |
browsedUrls.push(url); | |
if (status !== "success") { | |
if (url === args[1]) { | |
console.log("Couldn't open " + url); | |
phantom.exit(1); | |
} | |
else { | |
console.log("fail " + url); | |
} | |
} | |
else { | |
var uniqHrefs = page.evaluate(function() { | |
var uniqHrefs = []; | |
var l = document.links; | |
for(var i=0; i<l.length; i++) { | |
var href = l[i].getAttribute("href"); | |
if (href && href.length > 1 && href.charAt(0) == '/' && href.charAt(1) != '/') { | |
href = href.replace(/\/$/, ''); | |
if (uniqHrefs.indexOf(href) === -1) { | |
uniqHrefs.push(href); | |
} | |
} | |
} | |
return uniqHrefs; | |
}); | |
if (uniqHrefs) { | |
uniqHrefs.forEach(function(href) { | |
var url_without_path = url.split("/").slice(0,3).join("/"); | |
var urlFromHref = url_without_path + href; | |
if (browsedUrls.indexOf(urlFromHref) === -1 && urlsToBrowse.indexOf(urlFromHref) === -1) { | |
urlsToBrowse.push(urlFromHref); | |
} | |
}); | |
} | |
} | |
page.close(); | |
callback.apply(); | |
}); | |
} | |
/**
 * Drives the crawl one step at a time.
 *
 * While `urlsToBrowse` still holds entries, the first one is removed and
 * handed to open(), with crawl itself as the completion callback. Once the
 * queue is empty, `uniqUrls` is sorted in place, each entry is printed with a
 * "uniq " prefix, and the script exits with status 0.
 */
function crawl() {
    if (urlsToBrowse.length > 0) {
        var nextUrl = urlsToBrowse.shift();
        console.log("open " + nextUrl);
        open(nextUrl, crawl);
        return;
    }

    // Queue drained: report every recorded resource URL, then finish.
    uniqUrls.sort();
    for (var i = 0; i < uniqUrls.length; i++) {
        console.log("uniq " + uniqUrls[i]);
    }
    phantom.exit(0);
}
// Script entry point. args[0] is the script name, so a length of 1 means no
// start URL was supplied on the command line.
var args = require('system').args;
if (args.length === 1) {
    console.log('Please specify a URL.');
    phantom.exit(1);
} else {
    // Start the crawl only when a URL was given: phantom.exit() is not
    // guaranteed to stop the running script at once, so falling through here
    // would queue `undefined` and begin crawling it.
    urlsToBrowse.push(args[1]);
    crawl();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Updated to handle broken links and to remove the unnecessary jQuery injection.