Skip to content

Instantly share code, notes, and snippets.

@max-mapper
Last active August 1, 2018 16:31
Show Gist options
  • Save max-mapper/a32ddd0e10e90ff98a3ac7c5f8557b53 to your computer and use it in GitHub Desktop.
Save max-mapper/a32ddd0e10e90ff98a3ac7c5f8557b53 to your computer and use it in GitHub Desktop.
streaming merge sort of two line-delimited files (CSV and JSON Lines)
// Pipe the output of the above script into this one; it converts that output into a smaller CSV.
var split = require('split2')
var through = require('through2')
// Line splitter that stdin is piped through further down.
var splitter = split()
// CSV header row; every record row printed later follows these two columns.
console.log('doi,url')
// Formats one CSV field: the value is wrapped in double quotes and any
// embedded double quote is doubled, so a quote inside a DOI or URL cannot
// terminate the field early and corrupt the row.
function csvField (value) {
  return '"' + String(value).replace(/"/g, '""') + '"'
}

// Transform run once per input line. Parses the line as JSON and prints one
// CSV row to stdout:
//   - record without a `data` property but with a `doi`  -> "doi",
//   - record whose `data` list has an entry of type 'URL' -> "doi","url"
//     (only the first such entry is used)
// Anything else is reported on stderr and skipped. Nothing is pushed
// downstream; `next()` is always called so the stream keeps flowing.
var each = through(function (buf, enc, next) {
  var record
  try {
    record = JSON.parse(buf)
  } catch (e) {
    console.error('ERROR', buf.toString())
    return next()
  }

  // The literal `null` is valid JSON, so guard before reading properties.
  if (record === null || typeof record.data === 'undefined') {
    if (record !== null && record.doi) console.log(csvField(record.doi) + ',')
    else console.error('NO DOI/DATA', record)
    return next()
  }

  // `data` may be present but not a list (e.g. null); treat that as "no URL".
  var entries = Array.isArray(record.data) ? record.data : []
  for (var i = 0; i < entries.length; i++) {
    var entry = entries[i]
    // Skip null / primitive entries instead of crashing on `.type`.
    if (entry && entry.type === 'URL') {
      console.log(csvField(record.doi) + ',' + csvField(entry.data))
      return next()
    }
  }

  console.error('NO URL', record)
  next()
})
// stdin -> one chunk per line -> per-line JSON-to-CSV conversion.
var stdinLines = process.stdin.pipe(splitter)
stdinLines.pipe(each)
var fs = require('fs')
var split = require('split2')
var iterate = require('stream-iterate')
var csv = require('csv-parser')
var ndjson = require('ndjson')
// Raw inputs: a tab-separated file and a JSON-lines file.
var tsv = fs.createReadStream('../doi.tsv')
var log = fs.createReadStream('../dois.json')

// pipe() returns its destination, so each parser is wired to its file and
// kept under its own name in a single step.
var parser = tsv.pipe(csv({separator: '\t'}))
var json = log.pipe(ndjson.parse())

// Pull-style readers over the two parsed streams.
var read1 = iterate(parser)
var read2 = iterate(json)

// Start walking both streams.
loop()
// Walks both readers in lock-step. On each round it takes the current item
// from `read1` and `read2`, copies `headers` and `data` from the second item
// onto the first, prints the merged record as one JSON line on stdout, then
// advances both readers and recurses.
//
// Iteration stops as soon as either reader has no more items; any items left
// in the other reader are not printed. Read errors are rethrown.
function loop () {
  read1(function (err, data1, next1) {
    if (err) throw err
    // stream-iterate calls back with a null item once its stream has ended;
    // without this guard the property writes below throw a TypeError after
    // the last record.
    if (data1 == null) return
    read2(function (err, data2, next2) {
      if (err) throw err
      if (data2 == null) return
      // merge the second record's fields into the first and emit it
      data1.headers = data2.headers
      data1.data = data2.data
      console.log(JSON.stringify(data1))
      // consume both current items, then move on to the next pair
      next1()
      next2()
      loop()
    })
  })
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment