Created
February 25, 2012 22:05
-
-
Save Yuffster/1911116 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /** | |
| * INSTALLATION: | |
| * | |
| * All these modules are core modules except for redis. To install that, just | |
| * type 'npm install redis'. | |
| * | |
| * To run, 'node app.js <port number>'. | |
| * | |
| * USAGE: | |
| * | |
| * Pass the URL you'd like to fetch the content of as 'url' within the GET | |
| * parameters. | |
| * | |
| * Search through the content server-side to find the information you need, | |
| * and strip out all HTML (or convert the "safe" HTML to Markdown and back | |
| * again) before displaying it to avoid XSS/click-jacking/etc attacks. | |
| * | |
| * ///////////////////////////// /!\ WARNING /!\ ////////////////////////////// | |
| * // // | |
| * // Do not under any circumstances output the content gained by this // | |
| * // service directly to users, especially not in the browser! // | |
| * // // | |
| * // Doing so would expose your application to serious XSS/click-jacking // | |
| * // security vulnerabilities. // | |
| * // // | |
| * ///////////////////////////// /!\ WARNING /!\ ////////////////////////////// | |
| * | |
| */ | |
| var redis = require("redis"), | |
| db = redis.createClient(), | |
| dns = require('dns'), | |
| http = require("http"), | |
| urlib = require("url"), | |
| qstring = require('querystring'), | |
| crypto = require('crypto'); | |
/**
 * Simple class for holding onto callbacks until later. We'll be using it to
 * keep track of concurrent requests to the same URL.
 *
 * Subscribing to an event will store the callback function for later use,
 * while publishing an event will execute every callback function currently
 * attached to the given event, passing it (error, data).
 */
function Watcher() { this.callbacks = {}; }
Watcher.prototype.subscribe = function(evnt, callback) {
	if (!this.callbacks[evnt]) this.callbacks[evnt] = [];
	this.callbacks[evnt].push(callback);
};
Watcher.prototype.publish = function(evnt, error, data) {
	//Guard against events nobody subscribed to.
	var subs = this.callbacks[evnt];
	if (!subs) return;
	//BUGFIX: the loop previously compared `i` against the callbacks array
	//itself (not its length), so it never executed and subscribers were
	//never notified; it also dropped the `error` argument.
	for (var i = 0; i < subs.length; i++) subs[i](error, data);
};
/**
 * Really simple pass-through waiter to ensure our callbacks only get called
 * once. If we called any given callback twice, we'd crash our server by
 * trying to write to a stream we've already closed.
 *
 * Returns a function that forwards its arguments to `callback` on the first
 * invocation only; later invocations log a warning and a stack trace.
 * Functions added via the returned function's `.push(f)` run on EVERY
 * invocation, before the guarded callback.
 */
function Waiter(callback, bind) {
	var fired = false;
	var taps = [];
	var ctx = bind || this;
	var guarded = function() {
		var args = arguments;
		//Pass-through taps always run, even after the main callback is spent.
		for (var n = 0; n < taps.length; n++) taps[n].apply(ctx, args);
		if (fired) {
			console.warn("A callback has been called twice; check for race conditions.");
			console.log (new Error().stack);
		} else {
			callback.apply(ctx, args);
		}
		fired = true;
	};
	guarded.push = function(f) { taps.push(f); };
	return guarded;
}
//We'll store watchers for pending requests here, and delete them when a
//request is no longer pending. Keys are the SHA-1 cache keys computed in
//getPage; values are Watcher instances that concurrent callers subscribe to.
var pending = { };
/**
 * Parses and normalizes a given URL string, returning an object of options
 * to pass to the HTTP module.
 *
 * The query string is normalized for caching: keys are sorted, and values
 * within a repeated key are sorted, so parameter order never defeats the
 * cache.
 *
 * @param {String} url  An absolute http:// URL.
 * @returns {Object} {host, port, path, method} suitable for http.get().
 * @throws {Error} If the URL does not use the http: protocol.
 */
function parseURL(url) {
	//Parsed this by regex a few revisions back, but it was a naive
	//implementation for fun.
	var parsed = urlib.parse(url);
	if (parsed.protocol != 'http:') {
		//BUGFIX: this branch previously called an undefined `callback`,
		//crashing with a ReferenceError instead of reporting the problem.
		throw new Error("URL must start with http://");
	}
	//Normalize the query string (for caching).
	if (parsed.query) {
		var params = parsed.query.split(/&/g), keys = [], kvs = {}, qstr = [],
			i, j, s, k, v, vs;
		for (i = 0; i < params.length; i++) {
			s = params[i].split('=');
			k = s[0];
			//Bare keys (no '=') are kept, flagged with a null value.
			v = (typeof(s[1]) != 'undefined') ? s[1] : null;
			if (!kvs[k]) kvs[k] = [];
			kvs[k].push(v);
			if (keys.indexOf(k) < 0) keys.push(k);
		}
		keys.sort();
		for (i = 0; i < keys.length; i++) {
			vs = kvs[keys[i]].sort();
			for (j = 0; j < vs.length; j++) {
				qstr.push((vs[j] != null) ? keys[i] + '=' + vs[j] : keys[i]);
			}
		}
		parsed.query = qstr.join('&');
	}
	return {
		//BUGFIX: use `hostname` — `parsed.host` includes any ":port" suffix,
		//which corrupts the HTTP `host` option and duplicates the port in
		//the cache key.
		host: parsed.hostname,
		port: parsed.port || 80,
		//Only append '?' when there is an actual query string.
		path: parsed.pathname + (parsed.query ? '?' + parsed.query : ''),
		method: 'GET'
	};
}
/**
 * When a request comes in, we'll hit getPage to find the fastest way of
 * getting the data to the user.
 *
 * If we don't have the data cached, we'll make a trip to the target URL to
 * grab the data and then cache it. If a user requests a page at the same time
 * we're fetching it for another user, that user will be subscribed to an event
 * which will be triggered when the data finishes downloading from the first
 * user's request.
 *
 * @param {String} url  URL to fetch.
 * @param {Function} callback  Called as callback(error, data).
 * @param {Object} [options]  {cacheOnly: true} to skip remote fetching.
 */
function getPage(url, callback, options) {
	if (!url) return callback("URL not provided.");
	//BUGFIX: the parsed URL used to be assigned to a `var options`,
	//shadowing the parameter above — so the cacheOnly flag was never seen.
	var cacheOnly = options && options.cacheOnly;
	var reqOptions;
	try {
		reqOptions = parseURL(url);
	} catch (err) {
		//Bad URLs (non-http) become a normal error response instead of an
		//uncaught exception.
		return callback(err.message || String(err));
	}
	var keyStr = reqOptions.host + reqOptions.port + reqOptions.path;
	var key = crypto.createHash('sha1')
		.update(keyStr.toLowerCase())
		.digest('hex');
	//First thing we'll do is check the Redis cache to see if we already have
	//the data.
	//
	//We're using a hash here instead of a standard key just in case we want to
	//add more properties to each cached item in the future.
	db.hgetall(key, function(e, cached) {
		//BUGFIX: hgetall yields null for a missing key; the old code
		//dereferenced `cached.data` unconditionally and crashed.
		if (!e && cached && cached.data) {
			//If it's cached, just return it.
			callback(false, cached.data);
		} else if (pending[key]) {
			//If another request for the same item is pending, wait for the
			//pending request to complete and then return it.
			//BUGFIX: subscribe takes (event, callback); it used to be called
			//with the callback alone, storing `undefined`.
			pending[key].subscribe('done', callback);
		} else if (!cacheOnly) {
			//Otherwise, set the request to pending and make an HTTP call to
			//grab the data from the remote server.
			pending[key] = new Watcher();
			fetchPage(reqOptions, function(e, d) {
				//Cache the data, unless there's an error.
				if (!e) {
					db.hset(key, "data", d);
					db.hset(key, "keystring", keyStr);
				}
				//Publish the data to anyone waiting for it.
				if (pending[key]) pending[key].publish('done', e, d);
				//Remove it from the pending request hash.
				delete pending[key];
				//Return the data for the current user.
				callback(e, d);
			});
		} else {
			callback('Not in local cache.');
		}
	});
}
/**
 * Fetches the remote page and returns the result via callback.
 *
 * Resolves the host via DNS first to reject targets that resolve to private
 * network ranges (SSRF guard), then issues a plain HTTP GET. Redirects
 * (detected via the Location header) recurse through getPage so a redirect
 * target can be served from cache.
 *
 * @param {Object} options  Request options from parseURL() (host/port/path).
 * @param {Function} callback  Called as callback(error, data).
 * @param {Number} [maxRedirects]  Remaining redirect budget; defaults to 5.
 */
function fetchPage(options, callback, maxRedirects) {
	//Count will keep track of how many redirects we've been dealing with, so
	//that recursive redirects don't crash things.
	var redirects = maxRedirects || 5, req = false;
	//30-second timeout.
	//NOTE(review): if the timeout fires and the response later arrives,
	//callback may run twice; the server handler wraps callbacks in Waiter,
	//but direct callers of fetchPage are not protected — confirm.
	var timeout = setTimeout(function() {
		if (req) req.abort();
		callback("The server took too long to respond.");
	}, 30*1000);
	//Follow a Location header: decrement the budget, then try the cache
	//(via getPage with cacheOnly) before fetching the target directly.
	function redirect(location) {
		redirects--;
		if (redirects<1) callback("Too many redirects.");
		//Stupid hack: getPage recurses through fetchPage.
		else getPage(location, function(e,d) {
			if (d) callback(e,d);
			else fetchPage(parseURL(location), callback, redirects);
		}, {cacheOnly:true});
		clearTimeout(timeout);
	};
	//I'm tired of doing callback; clearTimeout.
	function finish(e,d) {
		callback(e,d);
		clearTimeout(timeout);
	}
	dns.lookup(options.host, function(e,d) {
		if (e) return finish(e.message || e);
		//Make sure we're not being directed via malicious DNS record to our
		//own private network.
		//
		//I rolled the previously implemented local network check into this one
		// as well.
		//
		//TODO: IPv6
		//NOTE(review): `172\.1[6-9]` with no trailing dot also matches
		//public ranges like 172.160.x.x — the pattern likely needs anchored
		//octets; verify the intended CIDR blocks.
		if (d.match(/^(127\.0\.0\.1|10\.|172\.1[6-9]|172\.2[0-9]|172\.3[0-1]|192\.168)/)) {
			return finish("Cannot connect to private network.");
		}
		//A very rudimentary fetch method.
		req = http.get(options, function(res) {
			if (!res) return finish("Could not complete the request.");
			//In the real-world, we'd want to make a table of all the HTTP
			//status codes to return better errors than this.
			if (res.statusCode>=400) return finish(res.statusCode);
			var data = "";
			res.setEncoding('utf8');
			//Any Location header triggers a redirect, regardless of status
			//code; relative locations are resolved against the current host.
			var loc = res.headers.location;
			if (loc) {
				if (!loc.match(/^https?:\/\//i)) {
					loc = 'http://'+options.host+'/'+loc.replace(/^\//,'');
				} return redirect(loc);
			};
			//Buffer the body and hand the whole thing back at once.
			res.on('data', function (c) { data += c; });
			res.on('end', function() { finish(false, data); });
		});
		req.on('error', function(e) { finish(e.message); });
	});
};
//Takes the first argument to the node command line after the filename;
//defaults to 8000 if the argument is not a number.
var port = Number(process.argv[2]) || 8000;
//Every request: pull the 'url' GET parameter, fetch (or serve from cache),
//and reply with a JSON envelope. Waiter guards against the callback firing
//twice and writing to an already-closed response stream.
var server = http.createServer(function(req, res) {
	var target = urlib.parse(req.url, true).query.url;
	var respond = new Waiter(function(e, d) {
		res.end(JSON.stringify({
			error: e,
			data : d
		}));
	});
	getPage(target, respond);
});
server.listen(port);
console.log("Now listening on port "+port+'.');
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment