@joseraya
Created January 21, 2014 20:15
A node script that crawls a web site and stores snapshots (taken with zombie.js) to the file system. Based on code from this article: http://www.ng-newsletter.com/posts/serious-angular-seo.html
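The script below waits for the page to announce that rendering is finished by polling a data-status attribute on <body>, so the Angular app being crawled must set that attribute itself. A minimal page-side sketch of that contract, adapted from the approach in the article above; the module and event names here are assumptions, not part of this gist:

// In the Angular app (not in this script): flag the page as ready once
// the view has rendered, so the crawler's poll can detect it.
angular.module('app').run(function($rootScope) {
  // 'viewRendered' is a hypothetical event the app would emit when done
  $rootScope.$on('viewRendered', function() {
    angular.element(document.body).attr('data-status', 'ready');
  });
});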
var Browser = require('zombie'),
    url = require('url'),
    fs = require('fs'),
    path = require('path'),
    saveDir = __dirname + '/_snapshots';

// Matches whole <script> blocks (tags and contents) so they can be
// stripped from the saved snapshots.
var scriptTagRegex = /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi;
var stripScriptTags = function(html) {
  return html.replace(scriptTagRegex, '');
};
var mkdirParent = function(dirPath, mode, callback) {
  // Try the plain fs.mkdir first
  fs.mkdir(dirPath, mode, function(error) {
    if (error && error.code === 'ENOENT') {
      // The parent directory is missing: create it recursively,
      // then retry this directory
      mkdirParent(path.dirname(dirPath), mode, function() {
        mkdirParent(dirPath, mode, callback);
      });
      return;
    }
    // Report the result (including "already exists") to the caller
    callback && callback(error);
  });
};
var saveSnapshot = function(uri, body) {
  var lastIdx = uri.lastIndexOf('#!/');
  var pathname;
  if (lastIdx < 0) {
    // html5mode: use the URL's path as the file path
    pathname = url.parse(uri).pathname;
  } else {
    // hashbang mode: use everything after the '#!'
    pathname = uri.substring(lastIdx + 2);
  }
  if (pathname === '/') pathname = "/index.html";
  if (pathname.indexOf('.html') === -1) pathname += ".html";
  var filename = saveDir + pathname;
  console.log("Saving ", uri, " to ", filename);
  // Make sure the target directory exists before writing the snapshot
  mkdirParent(path.dirname(filename), '0755', function() {
    fs.writeFile(filename, body, function(error) {
      if (error) console.error("Could not save", filename, error);
    });
  });
};
var browserOpts = {
  waitFor: "100ms",
  loadCSS: false,
  waitDuration: "100ms"
};
var browser = new Browser(browserOpts);
var crawlPage = function(idx, arr) {
  if (idx < arr.length) {
    var uri = arr[idx];
    console.time("visit");
    browser.visit(uri)
      .then(function() {
        console.timeEnd("visit");
        // Poll until the app marks the page as rendered
        // (it is expected to set <body data-status="ready">)
        var intervalId = setInterval(function() {
          // Guard: the document/body may not exist yet if the page
          // is still loading or the visit failed
          if (!browser.document || !browser.document.body) return;
          var status = browser.body.getAttribute('data-status');
          console.log("checking status:", status);
          if (status === "ready") {
            clearInterval(intervalId);
            // Turn links into absolute links and queue any
            // URLs we haven't already crawled
            var links = browser.queryAll('a');
            links.forEach(function(link) {
              var href = link.getAttribute('href');
              if (!href) return;
              var absUrl = url.resolve(uri, href);
              link.setAttribute('href', absUrl);
              if (arr.indexOf(absUrl) < 0) {
                arr.push(absUrl);
              }
            });
            // Save the snapshot without its <script> tags
            saveSnapshot(uri, stripScriptTags(browser.html()));
            // Crawl the next URL in the queue
            crawlPage(idx + 1, arr);
          }
        }, 500);
      });
  }
};
crawlPage(0, ["http://localhost:4000/#!/"]);
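For reference, saveSnapshot maps crawled URLs to files like this (the /about and /contact routes are made-up examples, not part of the gist):

// http://localhost:4000/#!/        -> _snapshots/index.html
// http://localhost:4000/#!/about   -> _snapshots/about.html   (hashbang mode)
// http://localhost:4000/contact    -> _snapshots/contact.html (html5mode)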
ay13 commented Jun 23, 2014

Are you able to get this to work?

I get the following error:

TypeError: Cannot use 'in' operator to search for 'compareDocumentPosition' in null
    at /Users/ayoung/Sites/zombie/node_modules/zombie/node_modules/jsdom/node_modules/nwmatcher/src/nwmatcher-noqsa.js:267:43
    at module.exports (/Users/ayoung/Sites/zombie/node_modules/zombie/node_modules/jsdom/node_modules/nwmatcher/src/nwmatcher-noqsa.js:37:7)
    at addNwmatcher (/Users/ayoung/Sites/zombie/node_modules/zombie/node_modules/jsdom/lib/jsdom/selectors/index.js:5:27)
    at HTMLDocument.dom.Document.querySelector (/Users/ayoung/Sites/zombie/node_modules/zombie/node_modules/jsdom/lib/jsdom/selectors/index.js:13:12)
    at Browser.close [as body]
    at null.<anonymous> (/Users/ayoung/Sites/zombie/fetch.js:78:38)
    at wrapper [as _onTimeout]
    at Timer.listOnTimeout [as ontimeout]
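One plausible reading of that trace (an assumption, not confirmed in the thread): the data-status poll fires while browser.document is still null, for example before the visit has finished loading or after it failed, so dereferencing browser.body crashes inside jsdom. A guard at the top of the interval callback, like the one in the script above, avoids it:

if (!browser.document || !browser.document.body) return; // page not loaded yet
var status = browser.body.getAttribute('data-status');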
