Skip to content

Instantly share code, notes, and snippets.

@steadicat
Created August 9, 2010 22:36
Show Gist options
  • Save steadicat/516254 to your computer and use it in GitHub Desktop.
Save steadicat/516254 to your computer and use it in GitHub Desktop.
#!/usr/bin/env coffee
# The URL of the web site to make crawlable. The path of each incoming
# request is appended directly to this value.
URL: 'http://localhost'
# Time to wait for all the scripts to run, in ms
DELAY: 3000
# Script to run to clean up the markup before serializing.
# Receives the window whose document is about to be serialized.
# NOTE(review): relies on the crawled page defining `$` on the window
# (presumably jQuery) -- confirm for your site.
CLEANUP: (window) ->
  window.$('.off').remove()
# Port number to listen on for the GoogleBot (proxy this from port 80
# for requests that have _escaped_fragment_ in the query string).
PORT: 8002
fs: require 'fs'
sys: require 'sys'
http: require 'http'
parse: require('url').parse
jsdom: require('./contrib/jsdom/lib/jsdom').jsdom
# A single jsdom window/document pair, created once at startup. crawl()
# loads every fetched page into this same document.
window: jsdom().makeWindow()
document: window.document
# Write a message to the log, tagged as debug output.
DEBUG: (m) ->
  sys.log('[DEBUG] ' + m)
# Write a message to the log, tagged as informational output.
INFO: (m) ->
  sys.log('[INFO] ' + m)
# Log an error: its message on the first line, its stack trace on the
# following lines.
#
# err - the error to report; its `message` and `stack` properties are read.
ERROR: (err) ->
  # An explicit newline keeps the output independent of how this source
  # file happens to be indented.
  sys.log('[ERROR] ' + err.message + '\n' + err.stack)
# setTimeout with the arguments swapped, so the callback can be written
# last as a trailing function literal.
delay: (interval, f) ->
  setTimeout(f, interval)
# Fetch a URL with an HTTP GET and pass the complete response body on.
#
# url      - absolute URL string; its port (default 80), hostname, pathname
#            and query string are used to build the request.
# callback - called as callback(no, body) once the response has ended, or
#            as callback(err) if the client reports an error.
readSource: (url, callback) ->
  url: parse url
  target: url.pathname + (url.search or '')
  INFO('Performing GET http://' + url.hostname + target)
  client: http.createClient(url.port or 80, url.hostname)
  # Without a listener, an 'error' event would be thrown as an uncaught
  # exception and the callback's err argument would never be used.
  # NOTE(review): assumes connection errors are emitted on the client
  # object in the targeted node version -- confirm.
  client.on 'error', (err) ->
    callback err
  request: client.request('GET', target, { host: url.host })
  request.on 'response', (response) ->
    # Ask for decoded strings so that joining the chunks cannot split a
    # multi-byte character across two pieces.
    response.setEncoding 'utf8'
    body: []
    response.on 'data', (chunk) ->
      body.push chunk
    response.on 'end', ->
      callback no, body.join('')
  request.end()
# Load a page into the shared jsdom document, run its inline scripts, wait
# DELAY ms for them to settle, then hand the serialized markup to callback.
#
# path     - path of the page, appended to URL.
# frag     - fragment value to expose through window.location.href.
# callback - called as callback(html) with the document's outerHTML.
#
# NOTE(review): when the fetch fails the error is only logged and callback
# is never invoked, so the caller's HTTP response stays open.
crawl: (path, frag, callback) ->
  readSource "$URL$path", (err, data) ->
    return ERROR(err) if err
    document.innerHTML: data
    # NOTE(review): the log lines print "#!" but the location uses a bare
    # "#" -- confirm which form the crawled site expects.
    window.location.href: "$URL$path#$frag"
    # eval inline script tags
    # SECURITY: this executes whatever script text the configured URL
    # serves, inside this process. Only point URL at a site you control.
    for tag in document.getElementsByTagName('script')
      # Skip external scripts, and empty inline tags that have no text
      # node to read.
      if not tag.src and tag.children[0]
        source: tag.children[0]._nodeValue
        # One broken inline script must not prevent the page from being
        # served; log it and carry on with the next tag.
        try
          eval(source)
        catch e
          ERROR e
    delay DELAY, () ->
      CLEANUP(window) if CLEANUP?
      callback window.document.outerHTML
      INFO "Done crawling $URL$path#!$frag"
# Send a complete HTTP response.
#
# response - the server response object to write to.
# body     - response body string.
# code     - HTTP status code; defaults to 200.
# type     - Content-Type header value; defaults to UTF-8 HTML.
respond: (response, body, code, type) ->
  code: code or 200
  type: type or 'text/html; charset=utf-8'
  response.writeHead code, {
    # Content-Length counts bytes, not characters: body.length is too small
    # as soon as the markup contains a non-ASCII character.
    'Content-Length': Buffer.byteLength(body, 'utf8')
    'Content-Type': type
  }
  response.write body, 'utf8'
  response.end()
# HTTP request handler: answers requests carrying an _escaped_fragment_
# query parameter with the crawled markup, and everything else with a 404.
#
# request  - incoming server request; only its `url` is read.
# response - server response the result is written to.
requestListener: (request, response) ->
  url: parse(request.url, true)
  path: url.pathname
  # A request without any query string may leave url.query unset; treat
  # that the same as a missing fragment instead of throwing.
  query: url.query or {}
  frag: query._escaped_fragment_
  if not frag?
    DEBUG "Request missing fragment"
    respond response, '<html><h1>Not found. Make sure you have an <code>_escaped_fragment_</code> in the query string.</h1></html>', 404
    return
  INFO "Crawling $URL$path#!$frag"
  crawl path, frag, (body) ->
    respond response, body
# Start serving crawler requests.
http.createServer(requestListener).listen(PORT)
INFO "Waiting patiently for the GoogleBot on port $PORT"
# Log exceptions that nothing else caught instead of letting them take the
# server down.
process.on 'uncaughtException', (err) ->
  ERROR err
#!/usr/bin/env node
(function(){
var CLEANUP, DEBUG, DELAY, ERROR, LOG, URL, crawl, delay, document, fs, http, jsdom, parse, readSource, requestListener, respond, sys, window;
// The URL of the web site to make crawlable; request paths are appended to it.
URL = 'http://localhost';
// Time to wait for the page's scripts to run, in ms.
DELAY = 2000;
/**
 * Clean-up hook run on the window right before its document is serialized.
 *
 * @param {Object} window - The window holding the crawled page.
 * @returns {*} Whatever the `remove()` call returns.
 */
CLEANUP = function(window) {
  var hidden = window.$('.off');
  return hidden.remove();
};
fs = require('fs');
sys = require('sys');
http = require('http');
parse = require('url').parse;
jsdom = require('./contrib/jsdom/lib/jsdom').jsdom;
// A single jsdom window/document pair, created once at startup. crawl()
// loads every fetched page into this same document.
window = jsdom().makeWindow();
document = window.document;
/**
 * Writes a message to the log, tagged as debug output.
 *
 * @param {*} m - Message to log.
 * @returns {*} The result of `sys.log`.
 */
DEBUG = function(m) {
  var line = "[DEBUG] " + m;
  return sys.log(line);
};
/**
 * Writes a message to the log, tagged as regular log output.
 *
 * @param {*} m - Message to log.
 * @returns {*} The result of `sys.log`.
 */
LOG = function(m) {
  var line = "[LOG] " + m;
  return sys.log(line);
};
/**
 * Logs an error: its inspected form on the first line, its stack trace below.
 *
 * @param {Object} err - The error to report; its `stack` property is read.
 * @returns {*} The result of `sys.log`.
 */
ERROR = function(err) {
  var summary = "[ERROR] " + sys.inspect(err);
  return sys.log(summary + "\n" + err.stack);
};
/**
 * `setTimeout` with the arguments swapped.
 *
 * @param {number} interval - Delay in milliseconds.
 * @param {Function} f - Function to run after the delay.
 * @returns {*} The handle returned by `setTimeout`.
 */
delay = function(interval, f) {
  var timer = setTimeout(f, interval);
  return timer;
};
/**
 * Fetches a URL with an HTTP GET and passes the complete response body on.
 *
 * @param {string} url - Absolute URL; its port (default 80), hostname,
 *   pathname and query string are used to build the request.
 * @param {Function} callback - Called as `callback(false, body)` once the
 *   response has ended, or as `callback(err)` if the client reports an error.
 * @returns {*} The result of `request.end()`.
 */
readSource = function(url, callback) {
  var client, request, target;
  url = parse(url);
  target = "" + url.pathname + (url.search || '');
  DEBUG("Performing GET " + target);
  client = http.createClient(url.port || 80, url.hostname);
  // Without a listener, an 'error' event would be thrown as an uncaught
  // exception and the callback's err argument would never be used.
  // NOTE(review): assumes connection errors are emitted on the client object
  // in the targeted node version -- confirm.
  client.on('error', function(err) {
    return callback(err);
  });
  request = client.request('GET', target, {
    host: url.host
  });
  request.on('response', function(response) {
    var body;
    // Ask for decoded strings so that joining the chunks cannot split a
    // multi-byte character across two pieces.
    response.setEncoding('utf8');
    body = [];
    response.on('data', function(chunk) {
      return body.push(chunk);
    });
    return response.on('end', function() {
      return callback(false, body.join(''));
    });
  });
  return request.end();
};
/**
 * Loads a page into the shared jsdom document, runs its inline scripts,
 * waits DELAY ms for them to settle, then passes the serialized markup on.
 *
 * NOTE(review): when the fetch fails the error is only logged and `callback`
 * is never invoked, so the caller's HTTP response stays open.
 *
 * @param {string} path - Path of the page, appended to URL.
 * @param {string} frag - Fragment value exposed through window.location.href.
 * @param {Function} callback - Called as `callback(html)` with the document's
 *   outerHTML.
 * @returns {*} The result of `readSource`.
 */
crawl = function(path, frag, callback) {
  return readSource(("" + URL + path), function(err, data) {
    var _a, _b, _c, source, tag;
    if (err) {
      return ERROR(err);
    }
    document.innerHTML = data;
    // NOTE(review): the log line prints "#!" but the location uses a bare
    // "#" -- confirm which form the crawled site expects.
    window.location.href = ("" + URL + path + "#" + frag);
    // SECURITY: the loop below executes whatever script text the configured
    // URL serves, inside this process. Only point URL at a site you control.
    _b = document.getElementsByTagName('script');
    for (_a = 0, _c = _b.length; _a < _c; _a++) {
      tag = _b[_a];
      // Skip external scripts, and empty inline tags that have no text node.
      if (!tag.src && tag.children[0]) {
        source = tag.children[0]._nodeValue;
        // One broken inline script must not prevent the page from being
        // served; log it and carry on with the next tag.
        try {
          eval(source);
        } catch (e) {
          ERROR(e);
        }
      }
    }
    return delay(DELAY, function() {
      if (typeof CLEANUP !== "undefined" && CLEANUP !== null) {
        CLEANUP(window);
      }
      return callback(window.document.outerHTML);
    });
  });
};
/**
 * Sends a complete HTTP response.
 *
 * @param {Object} response - Server response object to write to.
 * @param {string} body - Response body.
 * @param {number} [code=200] - HTTP status code.
 * @param {string} [type='text/html; charset=utf-8'] - Content-Type value.
 * @returns {*} The result of `response.end()`.
 */
respond = function(response, body, code, type) {
  code = code || 200;
  type = type || 'text/html; charset=utf-8';
  response.writeHead(code, {
    // Content-Length counts bytes, not characters: body.length is too small
    // as soon as the markup contains a non-ASCII character.
    'Content-Length': Buffer.byteLength(body, 'utf8'),
    'Content-Type': type
  });
  response.write(body, 'utf8');
  return response.end();
};
/**
 * HTTP request handler: answers requests carrying an `_escaped_fragment_`
 * query parameter with the crawled markup, and everything else with a 404.
 *
 * @param {Object} request - Incoming server request; only `url` is read.
 * @param {Object} response - Server response the result is written to.
 * @returns {*} `null` for the 404 case, otherwise the result of `crawl`.
 */
requestListener = function(request, response) {
  var frag, path, query, url;
  url = parse(request.url, true);
  path = url.pathname;
  // A request without any query string may leave url.query unset; treat
  // that the same as a missing fragment instead of throwing.
  query = url.query || {};
  frag = query._escaped_fragment_;
  if (frag == null) {
    DEBUG("Request missing fragment");
    respond(response, '<html><h1>Not found. Make sure you have an <code>_escaped_fragment_</code> in the query string.</h1></html>', 404);
    return null;
  }
  LOG("Crawling url " + URL + path + "#!" + frag);
  return crawl(path, frag, function(body) {
    return respond(response, body);
  });
};
// Port number to listen on for the GoogleBot (proxy this from port 80 for
// requests that have _escaped_fragment_ in the query string).
var PORT = 8002;
http.createServer(requestListener).listen(PORT);
LOG("Waiting patiently for the GoogleBot on port " + PORT);
// Log exceptions that nothing else caught (for example one thrown later by
// a timer that an evaluated page script set up) instead of letting them
// take the server down.
process.on('uncaughtException', function(err) {
  return ERROR(err);
});
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment