cmoore4 · May 29, 2011 20:51
diff --git a/gistfile1.js b/gistfile1.js
 // This is the library that'll handle all of our input tracking and job dispatching
 var nodeio = require('node.io');

 // base_url is the page the crawl starts from.
 // links holds every URL seen so far (the seed plus each <a href> discovered);
 // it is passed to the job as its input array and is used to avoid queueing
 // the same URL twice.
 // crawled_links holds the pages that were fetched and parsed successfully.
 var base_url = 'http://reddit.com';
 var links = [base_url];
 var crawled_links = [];

 // Number of pages fetched and parsed successfully.
 var count = 0;

 var methods = {
 	// Input takes an array whose elements are fed to run() one at a time.
 	input: links,

 	/**
 	 * Fetches one page, queues every previously unseen <a href> value found
 	 * on it, and records the page as crawled.
 	 *
 	 * On a fetch error the link is retried when the error stringifies to
 	 * 'timeout' and skipped otherwise.
 	 *
 	 * NOTE(review): href values are queued exactly as written in the page, so
 	 * relative links and non-HTTP schemes are not resolved or filtered here.
 	 *
 	 * @param {string} link - URL of the page to fetch.
 	 */
 	run: function(link){
 		var self = this;
 		self.getHtml(link, function(err, $, data, headers){
 			if (err){
 				console.log('Error scraping page ' + link);
 				console.log(err);

 				// "retry" puts the link back in the input queue
 				if (err.toString() === 'timeout'){
 					self.retry();
 				} else {
 					self.skip();
 				}
 				return;
 			}

 			// jQuery collections expose each(), not foreach(); the callback
 			// receives (index, rawElement), so the element is re-wrapped with $.
 			$('a[href]').each(function(idx, el){
 				var url = $(el).attr('href');

 				// An empty href cannot be fetched; ignore it.
 				if (!url){
 					return;
 				}

 				if (links.indexOf(url) === -1 && crawled_links.indexOf(url) === -1){
 					// We add the url to links array to keep track of it
 					// self.add() is from Node.io, and adds the link to the input queue
 					links.push(url);
 					self.add(url);
 					console.log(link + ': Added ' + url);
 				}
 			});

 			// Update the bookkeeping before emitting so that anything triggered
 			// by emit() already sees this page counted.
 			crawled_links.push(link);
 			count += 1;

 			// Use the captured job reference rather than relying on how the
 			// callback's `this` happens to be bound.
 			self.emit();
 		});
 	},

 	/**
 	 * Reports how many pages were read, then signals completion.
 	 *
 	 * @param {Function} callback - invoked with no arguments once the summary
 	 *   has been logged.
 	 */
 	complete: function(callback){
 		console.log('Read in ' + count + ' pages.');
 		// Node.io requires this, but is unnecessary for now
 		callback();
 	}
 };

 // Job-level settings handed to Node.io.
 var jobOptions = {
 	timeout: 12, // each thread gets a 12 second timeout
 	jsdom: true, // parse pages with jQuery rather than htmlParser
 	max: 5 // run 5 threads in this job
 };

 // Build the job from the settings and the methods object defined above, and
 // expose it as this module's `job` export.
 exports.job = new nodeio.Job(jobOptions, methods);
	// This is the library that'll handle all of our input tracking and job dispatching
	var nodeio = require('node.io');

	// base_url is the page the crawl starts from.
	// links holds every URL seen so far (the seed plus each <a href> discovered);
	// it is passed to the job as its input array and is used to avoid queueing
	// the same URL twice.
	// crawled_links holds the pages that were fetched and parsed successfully.
	var base_url = 'http://reddit.com';
	var links = [base_url];
	var crawled_links = [];

	// Number of pages fetched and parsed successfully.
	var count = 0;

	var methods = {
		// Input takes an array whose elements are fed to run() one at a time.
		input: links,

		/**
		 * Fetches one page, queues every previously unseen <a href> value found
		 * on it, and records the page as crawled.
		 *
		 * On a fetch error the link is retried when the error stringifies to
		 * 'timeout' and skipped otherwise.
		 *
		 * NOTE(review): href values are queued exactly as written in the page, so
		 * relative links and non-HTTP schemes are not resolved or filtered here.
		 *
		 * @param {string} link - URL of the page to fetch.
		 */
		run: function(link){
			var self = this;
			self.getHtml(link, function(err, $, data, headers){
				if (err){
					console.log('Error scraping page ' + link);
					console.log(err);

					// "retry" puts the link back in the input queue
					if (err.toString() === 'timeout'){
						self.retry();
					} else {
						self.skip();
					}
					return;
				}

				// jQuery collections expose each(), not foreach(); the callback
				// receives (index, rawElement), so the element is re-wrapped with $.
				$('a[href]').each(function(idx, el){
					var url = $(el).attr('href');

					// An empty href cannot be fetched; ignore it.
					if (!url){
						return;
					}

					if (links.indexOf(url) === -1 && crawled_links.indexOf(url) === -1){
						// We add the url to links array to keep track of it
						// self.add() is from Node.io, and adds the link to the input queue
						links.push(url);
						self.add(url);
						console.log(link + ': Added ' + url);
					}
				});

				// Update the bookkeeping before emitting so that anything triggered
				// by emit() already sees this page counted.
				crawled_links.push(link);
				count += 1;

				// Use the captured job reference rather than relying on how the
				// callback's `this` happens to be bound.
				self.emit();
			});
		},

		/**
		 * Reports how many pages were read, then signals completion.
		 *
		 * @param {Function} callback - invoked with no arguments once the summary
		 *   has been logged.
		 */
		complete: function(callback){
			console.log('Read in ' + count + ' pages.');
			// Node.io requires this, but is unnecessary for now
			callback();
		}
	};

	// Job-level settings handed to Node.io.
	var jobOptions = {
		timeout: 12, // each thread gets a 12 second timeout
		jsdom: true, // parse pages with jQuery rather than htmlParser
		max: 5 // run 5 threads in this job
	};

	// Build the job from the settings and the methods object defined above, and
	// expose it as this module's `job` export.
	exports.job = new nodeio.Job(jobOptions, methods);