Created
January 21, 2014 20:15
-
-
Save joseraya/8547524 to your computer and use it in GitHub Desktop.
A node script that crawls a web site and stores snapshots (taken with zombie.js) to the file system. Based on code from this article: http://www.ng-newsletter.com/posts/serious-angular-seo.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var Browser = require('zombie');
var fs = require('fs');
var path = require('path');
var url = require('url');
var $q = require('Q');
var saveDir = __dirname + '/_snapshots';
var scriptTagRegex = /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi; | |
var stripScriptTags = function(html) { | |
return html.replace(scriptTagRegex, ''); | |
} | |
var mkdirParent = function(dirPath, mode, callback) { | |
//Call the standard fs.mkdir | |
fs.mkdir(dirPath, mode, function(error) { | |
//When it fail in this way, do the custom steps | |
if (error && error.errno === 34) { | |
//Create all the parents recursively | |
fs.mkdirParent(path.dirname(dirPath), mode, callback); | |
//And then the directory | |
fs.mkdirParent(dirPath, mode, callback); | |
} | |
//Manually run the callback since we used our own callback to do all these | |
callback && callback(error); | |
}); | |
}; | |
var saveSnapshot = function(uri, body) { | |
var lastIdx = uri.lastIndexOf('#!/'); | |
if (lastIdx < 0) { | |
// If we're using html5mode | |
path = url.parse(uri).pathname; | |
} else { | |
// If we're using hashbang mode | |
path = | |
uri.substring(lastIdx + 2, uri.length); | |
} | |
if (path === '/') path = "/index.html"; | |
if (path.indexOf('.html') == -1) | |
path += ".html"; | |
var filename = saveDir + path; | |
console.log("Saving ", uri, " to ", filename); | |
var dirname = require("path").dirname(filename); | |
mkdirParent(dirname); | |
fs.open(filename, 'w', function(e, fd) { | |
if (e) return; | |
fs.write(fd, body); | |
}); | |
}; | |
var browserOpts = { | |
waitFor: "100ms", | |
loadCSS: false, | |
waitDuration: "100ms" | |
} | |
var browser = new Browser(browserOpts); | |
var crawlPage = function(idx, arr) { | |
// location = window.location | |
if (idx < arr.length) { | |
var uri = arr[idx]; | |
console.time("voy"); | |
var promise = browser.visit(uri) | |
.then(function() { | |
console.timeEnd("voy"); | |
var intervalId = setInterval(function() { | |
console.log("checking status") | |
var status = browser.body.getAttribute('data-status'); | |
console.log(status); | |
if (status === "ready") { | |
clearInterval(intervalId); | |
// Turn links into absolute links | |
// and save them, if we need to | |
// and we haven't already crawled them | |
var links = browser.queryAll('a'); | |
links.forEach(function(link) { | |
var href = link.getAttribute('href'); | |
var absUrl = url.resolve(uri, href); | |
link.setAttribute('href', absUrl); | |
if (arr.indexOf(absUrl) < 0) { | |
arr.push(absUrl); | |
} | |
}); | |
// Save | |
saveSnapshot(uri, browser.html()); | |
// Call again on the next iteration | |
crawlPage(idx+1, arr); | |
} | |
}, 500); | |
var d = $q.defer(); | |
}); | |
} | |
} | |
crawlPage(0, ["http://localhost:4000/#!/"]); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Are you able to get this to work?
I get the following error:
TypeError: Cannot use 'in' operator to search for 'compareDocumentPosition' in null
at /Users/ayoung/Sites/zombie/node_modules/zombie/node_modules/jsdom/node_modules/nwmatcher/src/nwmatcher-noqsa.js:267:43
at module.exports (/Users/ayoung/Sites/zombie/node_modules/zombie/node_modules/jsdom/node_modules/nwmatcher/src/nwmatcher-noqsa.js:37:7)
at addNwmatcher (/Users/ayoung/Sites/zombie/node_modules/zombie/node_modules/jsdom/lib/jsdom/selectors/index.js:5:27)
at HTMLDocument.dom.Document.querySelector (/Users/ayoung/Sites/zombie/node_modules/zombie/node_modules/jsdom/lib/jsdom/selectors/index.js:13:12)
at Browser.close [as body]
at null.<anonymous> (/Users/ayoung/Sites/zombie/fetch.js:78:38)
at wrapper [as _onTimeout]
at Timer.listOnTimeout [as ontimeout]