Skip to content

Instantly share code, notes, and snippets.

@Yuffster
Created February 25, 2012 22:05
Show Gist options
  • Select an option

  • Save Yuffster/1911116 to your computer and use it in GitHub Desktop.

Select an option

Save Yuffster/1911116 to your computer and use it in GitHub Desktop.
/**
* INSTALLATION:
*
* All these modules are core modules except for redis. To install that, just
* type 'npm install redis'.
*
* To run, 'node app.js <port number>'.
*
* USAGE:
*
* Pass the URL you'd like to fetch the content of as 'url' within the GET
* parameters.
*
* Search through the content server-side to find the information you need,
* and strip out all HTML (or convert the "safe" HTML to Markdown and back
* again) before displaying it to avoid XSS/click-jacking/etc attacks.
*
* ///////////////////////////// /!\ WARNING /!\ //////////////////////////////
* // //
* // Do not under any circumstances output the content gained by this //
* // service directly to users, especially not in the browser! //
* // //
* // Doing so would expose your application to serious XSS/click-jacking //
* // security vulnerabilities. //
* // //
* ///////////////////////////// /!\ WARNING /!\ //////////////////////////////
*
*/
var redis = require("redis"),
db = redis.createClient(),
dns = require('dns'),
http = require("http"),
urlib = require("url"),
qstring = require('querystring'),
crypto = require('crypto');
/**
 * Simple publish/subscribe helper for holding onto callbacks until later.
 * We'll be using it to keep track of concurrent requests to the same URL.
 *
 * Subscribing to an event will store the callback function for later use,
 * while publishing an event will execute every callback function currently
 * attached to the given event.
 */
function Watcher() { this.callbacks = {}; };
/**
 * Registers `callback` to run when event `k` is published.
 */
Watcher.prototype.subscribe = function(k, callback) {
	if (!this.callbacks[k]) this.callbacks[k] = [];
	this.callbacks[k].push(callback);
};
/**
 * Invokes every callback subscribed to `evnt` with (error, data).
 */
Watcher.prototype.publish = function (evnt, error, data) {
	var subs = this.callbacks[evnt] || [];
	//Bug fix: the original loop compared the index against the array itself
	//rather than its length (so no callback ever fired), and it silently
	//dropped the error argument.
	for (var i=0; i<subs.length; i++) subs[i](error, data);
};
/**
 * Really simple pass-through waiter to ensure our callbacks only get called
 * once. If we called any given callback twice, we'd crash our server by
 * trying to write to a stream we've already closed.
 *
 * Returns a wrapper function; extra pass-through functions can be attached
 * via the wrapper's `.push(fn)` and run on every invocation.
 */
function Waiter(callback, bind) {
	var fired = false;
	var extras = [];
	var context = bind || this;
	var wrapped = function() {
		var args = arguments;
		//Pass-through functions run on every call, even repeats.
		for (var i = 0; i < extras.length; i++) {
			extras[i].apply(context, args);
		}
		if (fired) {
			//The main callback already ran; flag the repeat instead of
			//calling it again.
			console.warn("A callback has been called twice; check for race conditions.");
			console.log (new Error().stack);
		} else {
			callback.apply(context, args);
		}
		fired = true;
	};
	wrapped.push = function(f) { extras.push(f); };
	return wrapped;
};
//We'll store watchers for pending requests here, and delete them when a
//request is no longer pending.
//Maps cache key (sha1 hex of the normalized URL) -> Watcher holding the
//callbacks of concurrent requests waiting for that URL's data.
var pending = { };
/**
 * Parses and normalizes a given URL string, returning an object of options
 * to pass to the HTTP module.
 *
 * Normalization sorts query-string keys (and each key's values) so that
 * equivalent URLs produce identical cache keys.
 *
 * @param {string} url Absolute http:// URL.
 * @returns {{host:string, port:(number|string), path:string, query:string,
 *            method:string}}
 * @throws {Error} If the URL does not use the http: protocol.
 */
function parseURL(url) {
	//Parsed this by regex a few revisions back, but it was a naive
	//implementation for fun.
	var parsed = urlib.parse(url);
	if (parsed.protocol != 'http:') {
		//Bug fix: this previously invoked an undefined `callback` variable,
		//crashing with a ReferenceError instead of reporting the problem.
		throw new Error("URL must start with http://");
	}
	//Normalize the query string (for caching).
	if (parsed.query) {
		var params = parsed.query.split(/&/g), keys = [], kvs = {}, qstr = [],
			i, j, s, k, v, vs;
		for (i = 0; i < params.length; i++) {
			s = params[i].split('=');
			k = s[0];
			//Bare keys (no '=') are kept, flagged with a null value.
			v = (typeof(s[1]) != 'undefined') ? s[1] : null;
			if (!kvs[k]) kvs[k] = [];
			kvs[k].push(v);
			if (keys.indexOf(k) < 0) keys.push(k);
		}
		keys.sort();
		for (i = 0; i < keys.length; i++) {
			vs = kvs[keys[i]].sort();
			for (j = 0; j < vs.length; j++) {
				qstr.push((vs[j] != null) ? keys[i] + '=' + vs[j] : keys[i]);
			}
		}
		parsed.query = qstr.join('&');
	}
	return {
		host: parsed.host,
		port: parsed.port || 80,
		//Bug fix: only append '?' when there actually is a query string;
		//the original always appended it, even for bare paths.
		path: parsed.pathname + (parsed.query ? '?' + parsed.query : ''),
		//Expose the normalized query separately; getPage uses it when
		//building its cache key.
		query: parsed.query || '',
		method: 'GET'
	};
};
/**
 * When a request comes in, we'll hit getPage to find the fastest way of
 * getting the data to the user.
 *
 * If we don't have the data cached, we'll make a trip to the target URL to
 * grab the data and then cache it. If a user requests a page at the same time
 * we're fetching it for another user, that user will be subscribed to an event
 * which will be triggered when the data finishes downloading from the first
 * user's request.
 *
 * @param {string}   url      URL to fetch.
 * @param {function} callback callback(error, data)
 * @param {object}   [options] {cacheOnly: true} checks the cache/pending
 *                             requests only and never hits the network.
 */
function getPage(url, callback, options) {
	//Bug fix: the original declared `var options = parseURL(url)`, shadowing
	//this parameter and silently discarding the caller's cacheOnly flag.
	options = options || {};
	if (!url) return callback("URL not provided.");
	var reqOptions;
	try {
		reqOptions = parseURL(url);
	} catch (err) {
		//Report bad URLs through the callback instead of crashing the server.
		return callback(err.message || String(err));
	}
	var keyStr = (reqOptions.host + reqOptions.port + reqOptions.path +
		(reqOptions.query || ''));
	var key = crypto.createHash('sha1')
		.update(keyStr.toLowerCase())
		.digest('hex');
	//First thing we'll do is check the Redis cache to see if we already have
	//the data.
	//
	//We're using a hash here instead of a standard key just in case we want to
	//add more properties to each cached item in the future.
	db.hgetall(key, function(e, cached) {
		//If it's cached, just return it. Bug fix: hgetall replies with null
		//on a miss, so guard before reading .data.
		if (cached && cached.data) {
			callback(false, cached.data);
		//If another request for the same item is pending, wait for the pending
		//request to complete and then return it.
		} else if (pending[key]) {
			//Bug fix: subscribe takes (event, callback); the original passed
			//the callback as the event name.
			pending[key].subscribe(key, callback);
		//Otherwise, set the request to pending and make an HTTP call to grab
		//the data from the remote server -- unless the caller asked for a
		//cache-only lookup. (Bug fix: the old `options.cacheOnly!=false`
		//test read the shadowed variable and was effectively always true.)
		} else if (!options.cacheOnly) {
			pending[key] = new Watcher();
			fetchPage(reqOptions, function(e, d) {
				//Cache the data, unless there's an error.
				if (!e) {
					db.hset(key, "data", d);
					db.hset(key, "keystring", keyStr);
				}
				//Remove it from the pending hash first so subscribers
				//re-entering getPage see a consistent state, then publish
				//the data to anyone waiting for it.
				var watcher = pending[key];
				delete(pending[key]);
				//Bug fix: publish takes (event, error, data).
				if (watcher) watcher.publish(key, e, d);
				//Return the data for the current user.
				callback(e, d);
			});
		} else {
			callback('Not in local cache.');
		}
	});
}
/**
 * Fetches the remote page and returns the result via callback.
 *
 * @param options      Options object from parseURL() (host/port/path/method),
 *                     passed straight to http.get.
 * @param callback     callback(error, data) -- data is the UTF-8 body.
 * @param maxRedirects Remaining redirect budget; defaults to 5.
 */
function fetchPage(options, callback, maxRedirects) {
//Count will keep track of how many redirects we've been dealing with, so
//that recursive redirects don't crash things.
var redirects = maxRedirects || 5, req = false;
//30-second timeout.
//NOTE(review): if this fires, req.abort() triggers the 'error' handler
//below, which calls finish() -- so the callback can run twice here. The
//top-level Waiter absorbs the duplicate, but internal callers of fetchPage
//(via getPage) are not wrapped; verify this is acceptable.
var timeout = setTimeout(function() {
if (req) req.abort();
callback("The server took too long to respond.");
}, 30*1000);
//Follow a redirect: try the cache first (via getPage with cacheOnly), and
//fall back to fetching the new location directly, carrying the remaining
//redirect budget.
function redirect(location) {
redirects--;
if (redirects<1) callback("Too many redirects.");
//Stupid hack: getPage recurses through fetchPage.
else getPage(location, function(e,d) {
if (d) callback(e,d);
else fetchPage(parseURL(location), callback, redirects);
}, {cacheOnly:true});
clearTimeout(timeout);
};
//I'm tired of doing callback; clearTimeout.
function finish(e,d) {
callback(e,d);
clearTimeout(timeout);
}
//Resolve the host ourselves first so we can inspect the resulting address.
dns.lookup(options.host, function(e,d) {
if (e) return finish(e.message || e);
//Make sure we're not being directed via malicious DNS record to our
//own private network.
//
//I rolled the previously implemented local network check into this one
// as well.
//
//TODO: IPv6
//NOTE(review): string-prefix match on the resolved IPv4 address; covers
//127.0.0.1, 10/8, 172.16/12 and 192.168/16 but not, e.g., 0.0.0.0 or
//169.254/16 -- confirm the intended blocklist.
if (d.match(/^(127\.0\.0\.1|10\.|172\.1[6-9]|172\.2[0-9]|172\.3[0-1]|192\.168)/)) {
return finish("Cannot connect to private network.");
}
//A very rudimentary fetch method.
req = http.get(options, function(res) {
if (!res) return finish("Could not complete the request.");
//In the real-world, we'd want to make a table of all the HTTP
//status codes to return better errors than this.
//NOTE(review): the numeric status code is passed as the error value.
if (res.statusCode>=400) return finish(res.statusCode);
var data = "";
res.setEncoding('utf8');
//Any Location header is treated as a redirect; relative locations are
//resolved against the current host.
//NOTE(review): the redirected response body is never consumed or
//destroyed here -- confirm the socket is released.
var loc = res.headers.location;
if (loc) {
if (!loc.match(/^https?:\/\//i)) {
loc = 'http://'+options.host+'/'+loc.replace(/^\//,'');
} return redirect(loc);
};
//Accumulate the body and hand it back when the stream ends.
res.on('data', function (c) { data += c; });
res.on('end', function() { finish(false, data); });
});
req.on('error', function(e) { finish(e.message); });
});
};
//Takes the first argument to the node command line after the filename;
//defaults to 8000 if the argument is not a number.
var port = Number(process.argv[2]) || 8000;
//Every response goes through a Waiter so a racing callback can never write
//to a response stream that has already been closed.
http.createServer(function(req, res) {
	var target = urlib.parse(req.url, true).query.url;
	var respond = new Waiter(function(e, d) {
		res.end(JSON.stringify({
			error: e,
			data : d
		}));
	});
	getPage(target, respond);
}).listen(port);
console.log("Now listening on port "+port+'.');
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment