Skip to content

Instantly share code, notes, and snippets.

@Yuffster
Created February 25, 2012 22:05
Show Gist options
  • Select an option

  • Save Yuffster/1911116 to your computer and use it in GitHub Desktop.

Select an option

Save Yuffster/1911116 to your computer and use it in GitHub Desktop.
/**
* INSTALLATION:
*
* All these modules are core modules except for redis. To install that, just
* type 'npm install redis'.
*
* To run, 'node app.js <port number>'.
*
* USAGE:
*
* Pass the URL you'd like to fetch the content of as 'url' within the GET
* parameters.
*
* Search through the content server-side to find the information you need,
* and strip out all HTML (or convert the "safe" HTML to Markdown and back
* again) before displaying it to avoid XSS/click-jacking/etc attacks.
*
* ///////////////////////////// /!\ WARNING /!\ //////////////////////////////
* // //
* // Do not under any circumstances output the content gained by this //
* // service directly to users, especially not in the browser! //
* // //
* // Doing so would expose your application to serious XSS/click-jacking //
* // security vulnerabilities. //
* // //
* ///////////////////////////// /!\ WARNING /!\ //////////////////////////////
*
*/
var redis = require("redis"),
db = redis.createClient(),
dns = require('dns'),
http = require("http"),
urlib = require("url"),
qstring = require('querystring'),
crypto = require('crypto');
/**
 * Simple publish/subscribe helper for holding onto callbacks until later.
 * We'll be using it to keep track of concurrent requests to the same URL.
 *
 * Subscribing to an event will store the callback function for later use,
 * while publishing an event will execute every callback function currently
 * attached to the given event.
 */
function Watcher() { this.callbacks = {}; };
/**
 * Registers `callback` to run when event `k` is published.
 */
Watcher.prototype.subscribe = function(k, callback) {
	if (!this.callbacks[k]) this.callbacks[k] = [];
	this.callbacks[k].push(callback);
};
/**
 * Invokes every callback subscribed to `evnt` with (error, data).
 */
Watcher.prototype.publish = function (evnt, error, data) {
	var subs = this.callbacks[evnt] || [];
	//Bug fix: the original loop compared the index against the array itself
	//rather than its length (so no callback ever fired), and it silently
	//dropped the error argument.
	for (var i=0; i<subs.length; i++) subs[i](error, data);
};
/**
 * Really simple pass-through waiter to ensure our callbacks only get called
 * once. If we called any given callback twice, we'd crash our server by
 * trying to write to a stream we've already closed.
 *
 * Returns a wrapper function; extra pass-through functions can be attached
 * via the wrapper's `.push(fn)` and run on every invocation.
 */
function Waiter(callback, bind) {
	var fired = false;
	var extras = [];
	var context = bind || this;
	var wrapped = function() {
		var args = arguments;
		//Pass-through functions run on every call, even repeats.
		for (var i = 0; i < extras.length; i++) {
			extras[i].apply(context, args);
		}
		if (fired) {
			//The main callback already ran; flag the repeat instead of
			//calling it again.
			console.warn("A callback has been called twice; check for race conditions.");
			console.log (new Error().stack);
		} else {
			callback.apply(context, args);
		}
		fired = true;
	};
	wrapped.push = function(f) { extras.push(f); };
	return wrapped;
};
//We'll store watchers for pending requests here, and delete them when a
//request is no longer pending.
//Maps cache key (sha1 hex of the normalized URL) -> Watcher holding the
//callbacks of concurrent requests waiting for that URL's data.
var pending = { };
/**
 * Parses and normalizes a given URL string, returning an object of options
 * to pass to the HTTP module.
 *
 * Normalization sorts query-string keys (and each key's values) so that
 * equivalent URLs produce identical cache keys.
 *
 * @param {string} url Absolute http:// URL.
 * @returns {{host:string, port:(number|string), path:string, query:string,
 *            method:string}}
 * @throws {Error} If the URL does not use the http: protocol.
 */
function parseURL(url) {
	//Parsed this by regex a few revisions back, but it was a naive
	//implementation for fun.
	var parsed = urlib.parse(url);
	if (parsed.protocol != 'http:') {
		//Bug fix: this previously invoked an undefined `callback` variable,
		//crashing with a ReferenceError instead of reporting the problem.
		throw new Error("URL must start with http://");
	}
	//Normalize the query string (for caching).
	if (parsed.query) {
		var params = parsed.query.split(/&/g), keys = [], kvs = {}, qstr = [],
			i, j, s, k, v, vs;
		for (i = 0; i < params.length; i++) {
			s = params[i].split('=');
			k = s[0];
			//Bare keys (no '=') are kept, flagged with a null value.
			v = (typeof(s[1]) != 'undefined') ? s[1] : null;
			if (!kvs[k]) kvs[k] = [];
			kvs[k].push(v);
			if (keys.indexOf(k) < 0) keys.push(k);
		}
		keys.sort();
		for (i = 0; i < keys.length; i++) {
			vs = kvs[keys[i]].sort();
			for (j = 0; j < vs.length; j++) {
				qstr.push((vs[j] != null) ? keys[i] + '=' + vs[j] : keys[i]);
			}
		}
		parsed.query = qstr.join('&');
	}
	return {
		host: parsed.host,
		port: parsed.port || 80,
		//Bug fix: only append '?' when there actually is a query string;
		//the original always appended it, even for bare paths.
		path: parsed.pathname + (parsed.query ? '?' + parsed.query : ''),
		//Expose the normalized query separately; getPage uses it when
		//building its cache key.
		query: parsed.query || '',
		method: 'GET'
	};
};
/**
 * When a request comes in, we'll hit getPage to find the fastest way of
 * getting the data to the user.
 *
 * If we don't have the data cached, we'll make a trip to the target URL to
 * grab the data and then cache it. If a user requests a page at the same time
 * we're fetching it for another user, that user will be subscribed to an event
 * which will be triggered when the data finishes downloading from the first
 * user's request.
 *
 * @param {string}   url      URL to fetch.
 * @param {function} callback callback(error, data)
 * @param {object}   [options] {cacheOnly: true} checks the cache/pending
 *                             requests only and never hits the network.
 */
function getPage(url, callback, options) {
	//Bug fix: the original declared `var options = parseURL(url)`, shadowing
	//this parameter and silently discarding the caller's cacheOnly flag.
	options = options || {};
	if (!url) return callback("URL not provided.");
	var reqOptions;
	try {
		reqOptions = parseURL(url);
	} catch (err) {
		//Report bad URLs through the callback instead of crashing the server.
		return callback(err.message || String(err));
	}
	var keyStr = (reqOptions.host + reqOptions.port + reqOptions.path +
		(reqOptions.query || ''));
	var key = crypto.createHash('sha1')
		.update(keyStr.toLowerCase())
		.digest('hex');
	//First thing we'll do is check the Redis cache to see if we already have
	//the data.
	//
	//We're using a hash here instead of a standard key just in case we want to
	//add more properties to each cached item in the future.
	db.hgetall(key, function(e, cached) {
		//If it's cached, just return it. Bug fix: hgetall replies with null
		//on a miss, so guard before reading .data.
		if (cached && cached.data) {
			callback(false, cached.data);
		//If another request for the same item is pending, wait for the pending
		//request to complete and then return it.
		} else if (pending[key]) {
			//Bug fix: subscribe takes (event, callback); the original passed
			//the callback as the event name.
			pending[key].subscribe(key, callback);
		//Otherwise, set the request to pending and make an HTTP call to grab
		//the data from the remote server -- unless the caller asked for a
		//cache-only lookup. (Bug fix: the old `options.cacheOnly!=false`
		//test read the shadowed variable and was effectively always true.)
		} else if (!options.cacheOnly) {
			pending[key] = new Watcher();
			fetchPage(reqOptions, function(e, d) {
				//Cache the data, unless there's an error.
				if (!e) {
					db.hset(key, "data", d);
					db.hset(key, "keystring", keyStr);
				}
				//Remove it from the pending hash first so subscribers
				//re-entering getPage see a consistent state, then publish
				//the data to anyone waiting for it.
				var watcher = pending[key];
				delete(pending[key]);
				//Bug fix: publish takes (event, error, data).
				if (watcher) watcher.publish(key, e, d);
				//Return the data for the current user.
				callback(e, d);
			});
		} else {
			callback('Not in local cache.');
		}
	});
}
/**
 * Fetches the remote page and returns the result via callback.
 *
 * @param options      Options object from parseURL() (host/port/path/method),
 *                     passed straight to http.get.
 * @param callback     callback(error, data) -- data is the UTF-8 body.
 * @param maxRedirects Remaining redirect budget; defaults to 5.
 */
function fetchPage(options, callback, maxRedirects) {
//Count will keep track of how many redirects we've been dealing with, so
//that recursive redirects don't crash things.
var redirects = maxRedirects || 5, req = false;
//30-second timeout.
//NOTE(review): if this fires, req.abort() triggers the 'error' handler
//below, which calls finish() -- so the callback can run twice here. The
//top-level Waiter absorbs the duplicate, but internal callers of fetchPage
//(via getPage) are not wrapped; verify this is acceptable.
var timeout = setTimeout(function() {
if (req) req.abort();
callback("The server took too long to respond.");
}, 30*1000);
//Follow a redirect: try the cache first (via getPage with cacheOnly), and
//fall back to fetching the new location directly, carrying the remaining
//redirect budget.
function redirect(location) {
redirects--;
if (redirects<1) callback("Too many redirects.");
//Stupid hack: getPage recurses through fetchPage.
else getPage(location, function(e,d) {
if (d) callback(e,d);
else fetchPage(parseURL(location), callback, redirects);
}, {cacheOnly:true});
clearTimeout(timeout);
};
//I'm tired of doing callback; clearTimeout.
function finish(e,d) {
callback(e,d);
clearTimeout(timeout);
}
//Resolve the host ourselves first so we can inspect the resulting address.
dns.lookup(options.host, function(e,d) {
if (e) return finish(e.message || e);
//Make sure we're not being directed via malicious DNS record to our
//own private network.
//
//I rolled the previously implemented local network check into this one
// as well.
//
//TODO: IPv6
//NOTE(review): string-prefix match on the resolved IPv4 address; covers
//127.0.0.1, 10/8, 172.16/12 and 192.168/16 but not, e.g., 0.0.0.0 or
//169.254/16 -- confirm the intended blocklist.
if (d.match(/^(127\.0\.0\.1|10\.|172\.1[6-9]|172\.2[0-9]|172\.3[0-1]|192\.168)/)) {
return finish("Cannot connect to private network.");
}
//A very rudimentary fetch method.
req = http.get(options, function(res) {
if (!res) return finish("Could not complete the request.");
//In the real-world, we'd want to make a table of all the HTTP
//status codes to return better errors than this.
//NOTE(review): the numeric status code is passed as the error value.
if (res.statusCode>=400) return finish(res.statusCode);
var data = "";
res.setEncoding('utf8');
//Any Location header is treated as a redirect; relative locations are
//resolved against the current host.
//NOTE(review): the redirected response body is never consumed or
//destroyed here -- confirm the socket is released.
var loc = res.headers.location;
if (loc) {
if (!loc.match(/^https?:\/\//i)) {
loc = 'http://'+options.host+'/'+loc.replace(/^\//,'');
} return redirect(loc);
};
//Accumulate the body and hand it back when the stream ends.
res.on('data', function (c) { data += c; });
res.on('end', function() { finish(false, data); });
});
req.on('error', function(e) { finish(e.message); });
});
};
//Takes the first argument to the node command line after the filename;
//defaults to 8000 if the argument is not a number.
var port = Number(process.argv[2]) || 8000;
//Every response goes through a Waiter so a racing callback can never write
//to a response stream that has already been closed.
http.createServer(function(req, res) {
	var target = urlib.parse(req.url, true).query.url;
	var respond = new Waiter(function(e, d) {
		res.end(JSON.stringify({
			error: e,
			data : d
		}));
	});
	getPage(target, respond);
}).listen(port);
console.log("Now listening on port "+port+'.');
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment