Last active
August 1, 2018 16:31
-
-
Save max-mapper/a32ddd0e10e90ff98a3ac7c5f8557b53 to your computer and use it in GitHub Desktop.
streaming merge sort of two line delimited files (csv and json lines)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Pipe the output of the above script into this one; it converts that output to a smaller CSV.
var split = require('split2') | |
var through = require('through2') | |
// Stream that breaks stdin into individual lines for the per-line handler.
var splitter = split()

// Emit the CSV header row before any data rows are written.
console.log('doi,url')
// Wraps a value in double quotes for CSV output. Embedded double quotes are
// doubled so that a value containing `"` cannot terminate its field early.
function csvQuote (value) {
  return '"' + String(value).replace(/"/g, '""') + '"'
}

// Transform applied to every input line. Each line is parsed as JSON and, when
// possible, printed to stdout as one `doi,url` CSV row:
//   - record with a `data` array containing an entry of type 'URL':
//       "<doi>","<that entry's data>"   (only the first URL entry is used)
//   - record with a doi but no `data` array:
//       "<doi>",                        (empty url column)
// Everything else (unparseable line, non-object JSON, no doi and no data, no
// URL entry) is reported on stderr and skipped, so one bad line never stops
// the stream. `next` is called exactly once per line.
var each = through(function (buf, enc, next) {
  var record
  try {
    record = JSON.parse(buf)
  } catch (e) {
    console.error('ERROR', buf.toString())
    return next()
  }

  // JSON.parse also accepts `null`, numbers, strings and booleans; none of
  // those can carry a doi, and reading a property of null would throw.
  if (record === null || typeof record !== 'object') {
    console.error('NO DOI/DATA', record)
    return next()
  }

  // A missing or non-array `data` (e.g. `"data": null`) means there is no
  // URL to look up; fall back to a doi-only row when a doi exists.
  if (!Array.isArray(record.data)) {
    if (record.doi) console.log(csvQuote(record.doi) + ',')
    else console.error('NO DOI/DATA', record)
    return next()
  }

  for (var i = 0; i < record.data.length; i++) {
    var entry = record.data[i]
    // Skip null/empty entries instead of crashing on `entry.type`.
    if (entry && entry.type === 'URL') {
      console.log(csvQuote(record.doi) + ',' + csvQuote(entry.data))
      return next()
    }
  }

  console.error('NO URL', record)
  next()
})
// Feed stdin through the line splitter, then hand each line to the CSV converter.
process.stdin
  .pipe(splitter)
  .pipe(each)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var fs = require('fs') | |
var split = require('split2') | |
var iterate = require('stream-iterate') | |
var csv = require('csv-parser') | |
var ndjson = require('ndjson') | |
// Open both input files up front.
var tsv = fs.createReadStream('../doi.tsv')
var log = fs.createReadStream('../dois.json')

// One parser per input: tab-separated rows and newline-delimited JSON records.
var parser = csv({separator: '\t'})
var json = ndjson.parse()

// pipe() returns its destination, so each iterator wraps the parser's output.
var read1 = iterate(tsv.pipe(parser))
var read2 = iterate(log.pipe(json))

// Start pulling items from both sources.
loop()
// Walks both sources in lockstep. Each round takes one item from `read1` and
// one from `read2`, copies the `headers` and `data` fields of the second item
// onto the first, prints the merged item to stdout as a single line of JSON,
// then advances both sources and starts the next round.
//
// Read errors from either source are thrown. The loop stops quietly as soon
// as either source has no item left to pair up.
function loop () {
  read1(function (err, data1, next1) {
    if (err) throw err
    // NOTE(review): stream-iterate appears to signal end-of-stream with a null
    // item - confirm against its README. Either way, a missing item cannot be
    // merged, and touching its properties would throw a TypeError.
    if (data1 == null) return

    read2(function (err2, data2, next2) {
      if (err2) throw err2
      // Second source ran out first: nothing left to merge with.
      if (data2 == null) return

      data1.headers = data2.headers
      data1.data = data2.data
      console.log(JSON.stringify(data1))

      // Release both current items, then go around again.
      next1()
      next2()
      loop()
    })
  })
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment