Skip to content

Instantly share code, notes, and snippets.

@ishiduca
Created October 16, 2014 11:35
Show Gist options
  • Save ishiduca/cdaf07b5cd6b52b153a1 to your computer and use it in GitHub Desktop.
Save ishiduca/cdaf07b5cd6b52b153a1 to your computer and use it in GitHub Desktop.
crawler.js
var path = require('path')
var fs = require('fs')
var OpmlParser = require('opmlparser')
var semaphore = require('semaphore')
var request = require('request')
var FeedParser = require('feedparser')
// ---- Tunables ---------------------------------------------------------

// OPML file to crawl, resolved relative to this script's directory.
var export_xml = path.join(__dirname, './export.xml')
// Capacity handed to the semaphore that guards feed fetches.
var lock_capacity = 3
// Per-request timeout (ms) passed to request.get.
var timeout = 5000
// Delay (ms) before a semaphore slot is handed back.
var interval = 800

// ---- Pipeline objects -------------------------------------------------

// Limits how many fetch callbacks may hold a slot at once.
var sem = semaphore(lock_capacity)
// Receives the OPML stream and emits one outline object per entry read.
var opmlps = new OpmlParser()
// Raw byte stream of the OPML file; piped into the parser further down.
var rs = fs.createReadStream(export_xml)
/**
 * Fetch one feed and print the title (or description) of each of its items.
 *
 * The fetch waits for a free semaphore slot before starting. The slot is
 * handed back exactly once, `interval` ms after the request either produced
 * a response or failed, so a dead host can never hold a slot forever.
 *
 * @param {string} xmlurl - URL of the feed, taken from an OPML outline.
 */
function fetchFeed (xmlurl) {
  sem.take(function () {
    console.log('[fetch > %s]', xmlurl)

    var req = request.get({uri: xmlurl, timeout: timeout})
    var feedparser = new FeedParser()
    var released = false

    // Both the 'response' and the 'error' path call this, and the non-200
    // branch below triggers 'error' after 'response'; the flag keeps the
    // semaphore from being released twice for one take().
    function release () {
      if (released) return
      released = true
      setTimeout(function () { sem.leave() }, interval)
    }

    req.on('error', function (err) {
      console.log('[%s, xmlurl: %s]', err.toString(), xmlurl)
      // Requests that fail before any response (timeout, DNS failure,
      // connection refused) must still give their slot back.
      release()
    })

    feedparser.on('error', function (err) {
      console.log('[%s, xmlurl: %s]', err.toString(), xmlurl)
    })

    req.on('response', function (response) {
      console.log('[get < %s]', xmlurl)
      release()

      if (response.statusCode !== 200) {
        // Report through the request's own 'error' handler so every failure
        // is logged in the same format; the body is not parsed.
        return req.emit('error'
          , new Error('BadURIError: ' + response.statusCode
            + ', location: ' + response.request.uri.href))
      }

      req.pipe(feedparser)
    })

    feedparser.on('readable', function () {
      var item
      while (null !== (item = feedparser.read())) {
        console.log(item.title || item.description)
      }
    })
  })
}

// Drain every outline the OPML parser has buffered and queue a fetch for
// each one that carries a feed URL.
opmlps.on('readable', function () {
  var outline
  while (null !== (outline = opmlps.read())) {
    // Outlines without an xmlurl are skipped with `continue`, not `return`,
    // so the outlines buffered after them are still read.
    if (! outline.xmlurl) continue
    fetchFeed(outline.xmlurl)
  }
})

rs.pipe(opmlps)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment