Created
May 29, 2011 20:51
-
-
Save cmoore4/998126 to your computer and use it in GitHub Desktop.
Node.io Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This is the library that'll handle all of our input tracking and job dispatching
var nodeio = require('node.io');
// Node's built-in URL helpers, used to resolve relative links found on a page
var urlLib = require('url');
// base_url is the site the crawl starts from.
// links holds every URL queued so far; it is seeded with base_url and is also
// handed to Node.io as the job's input array.
// crawled_links is meant to hold the pages that have already been scraped.
var base_url = 'http://reddit.com',
    links = [base_url],
    crawled_links = [];
// Number of pages processed without error; incremented by run() and reported
// by complete().
var count = 0;
// Job definition handed to Node.io: where the input comes from, how a single
// input item is processed, and what happens once the queue is drained.
var methods = {
    // Input takes an array that will be fed to jobs as "run"
    input: links,

    /**
     * Fetch one page and queue every link on it that has not been seen yet.
     *
     * @param {string} link - URL of the page to scrape.
     */
    run: function(link){
        var self = this;
        self.getHtml(link, function(err, $, data, headers){
            if (err){
                console.log('Error scraping page ' + link);
                console.log(err);
                // "retry" puts the link back in the input queue
                if (String(err) === 'timeout'){
                    self.retry();
                } else {
                    self.skip();
                }
                return;
            }

            // Record the page as scraped so the duplicate check below has
            // something to compare against.
            crawled_links.push(link);

            // jQuery's iterator is .each(), and it passes (index, element).
            $('a[href]').each(function(idx, el){
                var href = $(el).attr('href');
                if (!href){
                    return;
                }
                // hrefs are frequently relative ("/r/pics"); resolve them
                // against the page they were found on before queueing.
                var target = urlLib.resolve(link, href);
                // Skip javascript:, mailto: and other non-fetchable schemes.
                if (!/^https?:\/\//i.test(target)){
                    return;
                }
                if (links.indexOf(target) === -1 && crawled_links.indexOf(target) === -1){
                    // We add the url to links array to keep track of it
                    // self.add() is from Node.io, and adds the link to the input array
                    links.push(target);
                    self.add(target);
                    console.log(link + ': Added ' + target);
                }
            });

            // Count the page before emitting so the total is already up to
            // date if emitting leads straight into complete().
            count += 1;
            // Emit through the captured job reference; `this` inside this
            // callback is not guaranteed to be the job.
            self.emit();
        });
    },

    /**
     * Called once when the job is finished; reports how many pages were read.
     *
     * @param {Function} callback - Node.io continuation; must be invoked.
     */
    complete: function(callback){
        console.log('Read in ' + count + ' pages.');
        // Node.io requires this, but is unnecessary for now
        callback();
    }
};
// Here's the actual call to start the Node.io job: | |
exports.job = new nodeio.Job({ | |
timeout: 12, // 12 secodn timeout per thread | |
jsdom: true, // use jQuery instead of htmlParser | |
max: 5 //5 threads in this job | |
}, methods); // methods is our big object above. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment