Created
May 6, 2015 05:19
-
-
Save sibnerian/55bc9a8ab6b4e0d02b92 to your computer and use it in GitHub Desktop.
HighlandJS batched scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var _ = require('highland');
var cheerio = require('cheerio');
var request = require('request');
/**
 * Thin wrapper around the `request` library that exposes a two-argument,
 * node.js style callback, which is the shape `_.wrapCallback` expects.
 *
 * @param {string} url - Address to fetch.
 * @param {function(*, *)} cb - Called as `cb(err, data)`, where `data` is the
 *   third argument `request` hands to its own callback; the response object
 *   (second argument) is intentionally dropped.
 */
var req = function (url, cb) {
  request(url, function (err, res, data) {
    cb(err, data);
  });
};
/**
 * Timeout to demonstrate functionality: hands `data` back unchanged after a
 * three second pause, so there is a visible gap in between batches.
 *
 * @param {*} data - Value to pass through untouched.
 * @param {function(null, *)} cb - Node.js style callback, invoked as
 *   `cb(null, data)` once the delay has elapsed.
 */
var waititout = function (data, cb) {
  setTimeout(function () {
    cb(null, data);
  }, 3000);
};
// Scraping pipeline: fetch a fixed list of pages in batches of two and log
// each page's <title>.
// Create a stream with all these websites...
_([
  'https://github.com/isibner',
  'https://github.com/cheeriojs/cheerio',
  'https://github.com/caolan/highland/issues/246',
  'http://highlandjs.org/'
])
  // ...batch them two at a time, and send the length-2 arrays down the pipeline...
  .batch(2)
  // ... wait 3 seconds per array (for demonstration purposes)...
  .map(_.wrapCallback(waititout))
  // ...flatten the arrays so we consider one URL at a time...
  .flatten()
  // ...GET the URL, and send the body down the pipeline...
  .map(_.wrapCallback(req))
  // ...process those URLs one at a time (necessary hint to Highland)...
  .series()
  // ...log a failed request here so it does not reach each() unhandled and
  // stop the remaining URLs from being processed...
  .errors(function (err) {
    console.error(err);
  })
  // ... and finally get the title of the page with Cheerio and log it!
  .map(function (page) {
    var $ = cheerio.load(page);
    return $('title').text();
  })
  .each(_.log);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment