fast loading of a large dataset into leveldb
// data comes from here: http://stat-computing.org/dataexpo/2009/the-data.html
// download 1994.csv.bz2 and unpack by running: cat 1994.csv.bz2 | bzip2 -d > 1994.csv
// 1994.csv should be ~5.2 million lines and 500MB
// importing all rows into leveldb took ~50 seconds on my machine
// there are two main techniques at work here:
// 1: never create JS objects; leave the data as binary the entire time (binary-split does this)
// 2: group lines into 16 MB batches to take advantage of leveldb's batch API (byte-stream does this)
var level = require('level')
var byteStream = require('byte-stream')
var split = require('binary-split')
var fs = require('fs')

var count = 0
var wbs = 1024 * 1024 * 16 // 16 MB write buffer

var db = level('data.db', {writeBufferSize: wbs}, function () {
  var batcher = byteStream(wbs)
  fs.createReadStream('1994.csv')
    .pipe(split())   // split into lines, emitted as raw buffers (no string decoding)
    .pipe(batcher)   // accumulate lines until the 16 MB limit, then emit them as one array
    .on('data', function (lines) {
      var batch = db.batch()
      for (var i = 0; i < lines.length; i++) {
        batch.put(count, lines[i]) // key is the row number, value is the raw line
        count++
      }
      // flush the batch, then ask byte-stream for the next one
      batch.write(batcher.next.bind(batcher))
    })
})
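
To verify the import, you can stream the keys back out and count them. This is a minimal sketch, not part of the original gist; it assumes the import above has finished and relies only on level's standard createKeyStream API, which emits keys without reading or decoding values:

var level = require('level')

var db = level('data.db')
var rows = 0

db.createKeyStream() // keys only, so the 500MB of values is never touched
  .on('data', function () { rows++ })
  .on('end', function () {
    console.log('rows in data.db: ' + rows) // should be ~5.2 million
    db.close()
  })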