Skip to content

Instantly share code, notes, and snippets.

@ishiduca
Last active September 15, 2015 05:14
Show Gist options
  • Save ishiduca/30a3a2ca043d31519a65 to your computer and use it in GitHub Desktop.
Save ishiduca/30a3a2ca043d31519a65 to your computer and use it in GitHub Desktop.
node web scraper - 【とらのあなWebSite】とらの瞬間風速
{
"dependencies": {
"destroy": "^1.0.3",
"JSONStream": "^1.0.4",
"hyperquest": "^1.2.0",
"iconv-lite": "^0.4.11",
"through2": "^2.0.0",
"trumpet": "^1.7.1"
}
}
'use strict'
var hyperquest = require('hyperquest')
var iconv = require('iconv-lite')
var trumpet = require('trumpet')
var through = require('through2')
var JSONStream = require('JSONStream')
var destroy = require('destroy')
// Page to scrape and the CSS selector for the anchors whose href/text are extracted.
var uri = 'http://www.toranoana.jp/mailorder/watch/wi_gen_all.html'
var selector = '.Main table.table_index_Frame_2 tr td table.whole_w_left tr td table.watch tr td.watch1 a'

// Object-mode stream that receives one { href, text } record per matched anchor.
var rs = through.obj()
// trumpet instance that the HTTP response body is piped into.
var tr = trumpet()

tr.selectAll(selector, collectAnchor)

// Close the record stream once trumpet emits 'end'.
tr.once('end', rs.end.bind(rs))

// Listeners are registered before the response is piped into trumpet.
var request = hyperquest(uri)
request.once('error', onError)
request.once('response', onResponse)
request.pipe(tr)

// Serialize every record with JSONStream and print the result to stdout.
var serializer = JSONStream.stringify()
rs.pipe(serializer)
serializer.pipe(process.stdout)

/**
 * Handles one element matched by `selector`: decodes the element's read
 * stream from cp932, buffers the decoded chunks, and once that stream is
 * exhausted writes a { href, text } record to `rs`.
 *
 * @param {Object} anchor - element handle passed by `tr.selectAll`
 */
function collectAnchor (anchor) {
  var chunks = []

  // Transform step: keep every chunk, forward nothing downstream.
  function gather (chunk, enc, next) {
    chunks.push(chunk)
    next()
  }

  // Flush step: assemble the text, look up the href, then emit the record.
  function emitRecord (finished) {
    var text
    if (Buffer.isBuffer(chunks[0])) {
      text = String(Buffer.concat(chunks))
    } else {
      // Also covers the case where no chunk arrived at all (yields '').
      text = chunks.join('')
    }

    anchor.getAttribute('href', function (href) {
      rs.write({
        href: href,
        text: text
      })
      finished()
    })
  }

  anchor.createReadStream()
    .pipe(iconv.decodeStream('cp932'))
    .pipe(through.obj(gather, emitRecord))
}
/**
 * Reports a failure and marks the process as failed.
 *
 * The error is printed to stderr, and the exit status is set to 1 so that
 * shells and calling scripts can tell a failed run from a successful one.
 * The process is not terminated here; it exits on its own once pending
 * work has drained.
 *
 * @param {Error} err - the error to report
 * @returns {undefined}
 */
function onError (err) {
  console.error(err)
  // Printing alone would leave the exit status at 0 even though nothing
  // useful was written to stdout.
  process.exitCode = 1
}
/**
 * 'response' handler for the page request.
 *
 * A 200 response is left alone. For any other status the response is
 * destroyed and an error of the form "<status>: <uri>" is handed to onError.
 *
 * @param {Object} res - response object; only `statusCode` is read here
 * @returns {*} whatever onError returns for a non-200 status, otherwise undefined
 */
function onResponse (res) {
  var status = res.statusCode
  if (status === 200) return

  destroy(res)
  return onError(new Error(status + ': ' + uri))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment