// PhantomJS (http://phantomjs.org/) based web crawler. Anton Ivanov [email protected] 2012
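//
// Usage sketch (assumptions: PhantomJS is on the PATH; the script filename and
// URL below are illustrative, not fixed by this gist):
//   phantomjs crawler.js http://example.com/ snapshot
// The second argument is optional; any non-empty value enables PNG snapshots.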
var fs = require('fs');
var system = require('system');

// CLI arguments: the base URL to crawl and an optional snapshot flag.
var arg_url = system.args[1] || '';
var snapshot = system.args[2] || '';
// Remove any stale sitemap. PhantomJS's fs module is synchronous: exists()
// returns a boolean and remove() deletes the file; there is no Node-style
// callback API or unlink().
if (fs.exists('sitemap.xml')) {
  fs.remove('sitemap.xml');
}
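// Accumulators for the sitemap: url_info holds <url> XML fragments and
// all_urls tracks which locations have already been recorded.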
var url_info = [];
var all_urls = [];
var sitemap_header = '<?xml version="1.0" encoding="UTF-8"?>' + "\n";
sitemap_header += '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' + "\n";
var sitemap_footer = "\n</urlset>";
(function(host) {
  function Crawler() {
    this.visitedURLs = {};
  }

  Crawler.webpage = require('webpage');
  Crawler.fs = require('fs');
  Crawler.baseUrl = arg_url;
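  // crawl() opens a URL, polls until the page marks itself ready, writes a
  // static HTML copy (and an optional PNG), updates the sitemap, then recurses
  // into the page's navigation links until the given depth is exhausted.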
  Crawler.prototype.crawl = function (url, depth, takeSnapshot, onSuccess, onFailure) {
    if (0 == depth || this.visitedURLs[url]) {
      return;
    }
    var self = this;
    var page = Crawler.webpage.create();
    page.viewportSize = {
      width: 1280,
      height: 1024
    };
    page.onResourceError = function(resourceError) {
      page.reason = resourceError.errorString;
      page.reason_url = resourceError.url;
    };
    page.open(url, function (status) {
      if ('fail' === status) {
        onFailure({
          url: url,
          status: status
        });
      } else {
        var delay, checker = (function() {
          console.log('Waiting for page load... ' + url);
          var documentHTML = page.evaluate(function () {
            // The site marks .main-wrapper with data-status="ready" once its
            // views have rendered; poll until that attribute appears.
            var body = document.querySelector('.main-wrapper');
            if (body && body.getAttribute('data-status') == 'ready') {
              // Return a string: DOM nodes do not survive page.evaluate().
              return document.body && document.body.innerHTML ? document.body.innerHTML : "";
            }
          });
          if (documentHTML) {
            clearInterval(delay);
            // Save content into a directory mirroring the URL structure.
            if (page.url.substring(0, Crawler.baseUrl.length) == Crawler.baseUrl) {
              var resource = page.url.substring(Crawler.baseUrl.length);
              resource = resource.replace(/^#[!]?/, '');
              if (resource.length == 0) {
                resource = "/index.html";
              } else if (resource.match(/\.html$/) === null) {
                resource = resource + ".html";
              }
              fs.write('content/static/' + resource, page.content, 'w');
              if (takeSnapshot) {
                page.render('tmp/' + resource + '.png');
              }
              if (all_urls.indexOf(page.url) == -1) {
                url_info.push("\t<url><loc>" + page.url + "</loc><changefreq>monthly</changefreq></url>");
                all_urls.push(page.url);
              }
              // Rewrite the sitemap after every page so it stays complete even
              // if the crawl is interrupted; there is no clean exit hook here.
              fs.write("sitemap.xml", sitemap_header + url_info.join("\n") + sitemap_footer, 'w');
            }
            // Queue this page's links for crawling at the next depth level.
            self.getAllURLs(page, depth - 1, onSuccess, onFailure);
            self.visitedURLs[url] = true;
            onSuccess({
              url: url,
              status: status,
              content: documentHTML
            });
          }
        });
        delay = setInterval(checker, 100);
      }
    });
  };
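  // getAllURLs() polls the rendered page for navigation anchors and hands the
  // collected hrefs to crawlURLs().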
  Crawler.prototype.getAllURLs = function(page, depth, onSuccess, onFailure) {
    var self = this;
    var delay, checker = (function() {
      console.log('Waiting for navigation links to load...');
      var links = page.evaluate(function () {
        // Try to find links in a couple of different menus on the site.
        // querySelector() returns an element or null, never undefined.
        var nav = document.querySelector('a.ng-binding, a.nav-link');
        if (nav !== null) {
          return [].map.call(document.querySelectorAll('a.ng-binding, a.nav-link'), function(link) {
            return link.getAttribute('href');
          });
        }
      });
      if (links) {
        clearInterval(delay);
        console.log('Found ' + links.length + ' links');
        self.crawlURLs(links, depth, onSuccess, onFailure);
      }
    });
    delay = setInterval(checker, 100);
  };
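  // crawlURLs() filters hrefs down to crawlable targets: non-empty hash-bang
  // routes are resolved against the base URL, absolute http(s) links pass
  // through, and the bare root route is dropped.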
  Crawler.prototype.crawlURLs = function(urls, depth, onSuccess, onFailure) {
    var self = this;
    var baseUrl = arg_url;
    urls.filter(function (url) {
      if (url.match(/^#[!]?\/[a-zA-Z0-9\-]+/) !== null) {
        // A non-empty hash-bang route, e.g. "#!/about".
        return Crawler.isTopLevelURL(baseUrl + url);
      } else if (url !== '#/' && url !== '#!/') {
        // Anything except the bare root route: keep it only if absolute.
        return Crawler.isTopLevelURL(url);
      } else {
        return false;
      }
    }).forEach(function (url) {
      // Hash routes are relative to the base URL; absolute links are used as-is.
      var target = Crawler.isTopLevelURL(url) ? url : baseUrl + url;
      console.log('Found: ' + target);
      self.crawl(target, depth, onSuccess, onFailure);
    });
  };
  Crawler.isTopLevelURL = function(url) {
    return 0 == url.indexOf("http");
  };

  host.Crawler = Crawler;
})(phantom);
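// Kick off the crawl at the URL given on the command line, up to 4 levels deep.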
new phantom.Crawler().crawl(arg_url, 4, snapshot,
  function onSuccess(page) {
    console.log("Loaded page. URL = " + page.url + " content length = " + page.content.length + " status = " + page.status);
  },
  function onFailure(page) {
    console.log("Could not load page. URL = " + page.url + " status = " + page.status);
  }
);
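// Note: the script never calls phantom.exit(), so the PhantomJS process keeps
// running; once output settles, stop it manually (e.g. Ctrl+C).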