Skip to content

Instantly share code, notes, and snippets.

@iursevla
Last active May 22, 2017 20:59
Show Gist options
  • Save iursevla/ed1c4c6a0a5b51733277e47e9adc5b8c to your computer and use it in GitHub Desktop.
Save iursevla/ed1c4c6a0a5b51733277e47e9adc5b8c to your computer and use it in GitHub Desktop.
CSV Parser
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta http-equiv="X-UA-Compatible" content="ie=edge">
    <title>CSV Parser</title>
    <!-- Presumably provides the global startParser() that the button below calls. -->
    <script src="CSVParser.js"></script>
</head>
<body>
    <!-- The parser falls back to this input (id="file") when no file option is passed. -->
    <input type="file" id="file" name="files" accept=".csv">
    <button id="submit" onclick="startParser();">Ok</button>
    <br>
    Choose CSV file to Parse
    <!-- Textual progress; the parser overwrites the content of this element. -->
    <div id="progress">
    </div>
    <!-- Progress bar whose width the parser sets through the "determinate" class.
         It lives outside #progress so that rewriting the text does not remove it. -->
    <div class="determinate" style="width: 0%"></div>
</body>
</html>
/**
 * Reads a CSV file chunk by chunk with a FileReader and hands complete rows to
 * a pool of Web Workers ('./worker.js'). Every worker reply is appended to
 * `this.data` in arrival order.
 *
 * NOTE(review): chunks are cut at byte offsets, so a multi-byte character that
 * straddles a chunk boundary would presumably be decoded incorrectly - confirm
 * before using on non-ASCII files.
 */
class CSVParser {
    /**
     * Loads the options and immediately starts reading the file.
     * @param {Object} [userOptions] See loadOptions for the recognised keys.
     * @throws {Error} When the options are invalid (see loadOptions), or when the
     *   file has no header row and no headerIndeces were supplied.
     */
    constructor(userOptions = {}) {
        this.dataWorkers = null; //Array of Workers
        this.indexWorker = 0; //Row batches posted so far; also selects the next worker (round-robin)
        this.replies = 0; //How many row batches the workers answered
        this.isDone = false; //Whether or not the Parser is done reading the file
        this.startByte = 0; //Starting point of the next read request
        this.numRows = 0; //Kept for compatibility; not updated by this class
        this.data = [];
        this.remainingRows = ''; //Text after the last row delimiter seen so far (an incomplete row)
        this.options = this.loadOptions(userOptions);
        this.reader = this.createFileReader();
        //Informational estimate only; shutdown relies on the batches actually posted
        this.numChunks = Math.ceil(this.options.file.size / this.options.chunkSize);

        const header = this.options.header;
        if (header.fileHasHeader) {
            //chunkLoaded treats the first chunk as the header whenever the file has one,
            //so the first read has to be the header-sized one
            this.readNextChunk(header.headerBytesSize);
        }
        else {
            //Without a header row the columns can only be selected by index
            if (!header.headerIndeces || header.headerIndeces.length === 0)
                throw new Error(this.HEADEROPTIONSERROR);
            this.startWorkers(this.options);
            this.readNextChunk(this.options.chunkSize);
        }
    }

    /**
     * Merges the user options with the defaults and validates them.
     * Also defines the error messages used by the other methods.
     * @param {Object} [userOptions] chunkSize, delimiter, rowDelimiter, file, numWorkers,
     *   fileHasHeader, headerBytesSize, headerKeys, headerIndeces.
     * @returns {Object} The resolved options.
     * @throws {Error} When no file is available or no headerKeys were given.
     */
    loadOptions(userOptions = {}) {
        //Error vars
        this.FILEERROR = "ES6CSVParser => CSV file to parse is undefined.";
        this.FIRSTROWERROR = "Didn't read enough for the first row. Change header size to a higher value.";
        this.HEADEROPTIONSERROR = "No header indeces provided.";
        this.NOKEYS = "Not all given keys are present in the header";

        let file = userOptions.file;
        if (!file) { //Fall back to <input id="file"..> without crashing when it is missing
            const input = document.getElementById("file");
            file = input && input.files ? input.files[0] : undefined;
        }

        const opts = {
            chunkSize: userOptions.chunkSize || 1024 * 1024 * 10, //UserDef or 10MB
            delimiter: userOptions.delimiter || ',', //UserDef or comma
            rowDelimiter: userOptions.rowDelimiter || '\n', //UserDef or \n http://stackoverflow.com/a/1552782/
            file, //UserDef or <input id="file"..>
            numWorkers: userOptions.numWorkers || 2, //Worker threads
            header: {
                fileHasHeader: userOptions.fileHasHeader !== undefined ? userOptions.fileHasHeader : true, //Whether the first row is a header
                headerBytesSize: userOptions.headerBytesSize || 1024, //Size of the first read when the file has a header
                headerKeys: userOptions.headerKeys, //Which headers the worker which treats data should use
                headerIndeces: userOptions.headerIndeces //Header indeces to use
            }
        };
        if (opts.file === undefined)
            throw new Error(this.FILEERROR);
        if (!opts.header.headerKeys || opts.header.headerKeys.length === 0) //Should always have keys
            throw new Error(this.HEADEROPTIONSERROR);
        return opts;
    }

    /**
     * Creates the FileReader and wires its events to this parser.
     * @returns {FileReader}
     */
    createFileReader() {
        const reader = new FileReader();
        reader.onload = (e) => { this.chunkLoaded(e); };
        reader.onerror = (e) => { console.error(e); };
        return reader;
    }

    /**
     * Requests the next slice of the file (header or body) as text.
     * @param {number} chunkSize Number of bytes to request from `startByte`.
     */
    readNextChunk(chunkSize) {
        const file = this.options.file;
        const start = Math.min(this.startByte, file.size);
        //The end offset of slice() is exclusive, so clamp to file.size (not size - 1)
        //or the final byte of the file would never be read
        const end = Math.min(this.startByte + chunkSize, file.size);
        this.reader.readAsText(file.slice(start, end));
    }

    /**
     * FileReader `load` handler: processes the chunk, updates the progress and
     * either schedules the next read or finishes the file.
     * @param {ProgressEvent} e Event whose `target.result` holds the chunk text.
     */
    chunkLoaded(e) {
        const opts = this.options;
        if (this.startByte === 0 && opts.header.fileHasHeader) {
            this.startByte += opts.header.headerBytesSize;
            this.processHeader(e.target.result);
        }
        else {
            this.startByte += opts.chunkSize;
            this.processBodyChunk(e.target.result);
        }
        this.updateProgress();
        if (this.startByte >= opts.file.size) {
            //Flush before flagging completion so a reply cannot shut the workers
            //down while rows are still being posted
            this.flushRemainingRows();
            this.isDone = true;
            this.finishIfComplete();
        }
        else
            this.readNextChunk(opts.chunkSize);
    }

    /**
     * Handles the first chunk of a file that has a header row: resolves the
     * column indices, starts the workers and keeps the text after the header
     * row for the next chunk.
     * @param {string} res Text of the header chunk.
     * @throws {Error} When the chunk holds no complete row or a key is missing.
     */
    processHeader(res) {
        const opts = this.options;
        const header = opts.header;
        const rows = res.split(opts.rowDelimiter);
        if (rows.length < 2) //Couldnt reach any row delimiter so it didnt read one full row
            throw new Error(this.FIRSTROWERROR);

        //Indices supplied by the caller are used as they are; otherwise look the keys up
        if (!header.headerIndeces || header.headerIndeces.length === 0) {
            const keysRead = rows[0].split(opts.delimiter);
            const lastKey = keysRead[keysRead.length - 1];
            if (this.endsWithImp(lastKey.charAt(lastKey.length - 1))) //Drop a trailing \r or \n
                keysRead[keysRead.length - 1] = lastKey.slice(0, -1);
            const indices = [];
            for (const key of header.headerKeys) {
                const pos = keysRead.indexOf(key);
                if (pos < 0) //Didnt found all given keys
                    throw new Error(this.NOKEYS);
                indices.push(pos);
            }
            header.headerIndeces = indices;
        }
        this.startWorkers(opts); //Send info to read workers
        //Everything after the header row belongs to the body
        this.remainingRows = rows.slice(1).join(opts.rowDelimiter);
    }

    /**
     * Handles a body chunk: posts every complete row gathered so far to a worker
     * and keeps the trailing incomplete row for the next chunk.
     * @param {string} rows Text of the chunk.
     */
    processBodyChunk(rows) {
        const rowDelimiter = this.options.rowDelimiter;
        const pending = this.remainingRows + rows;
        const cut = pending.lastIndexOf(rowDelimiter);
        if (cut === -1) { //No complete row yet, keep accumulating
            this.remainingRows = pending;
            return;
        }
        const complete = pending.slice(0, cut + rowDelimiter.length);
        //The text ends with a row delimiter, so the worker's split yields an
        //empty last entry; ignoreLastRow makes the worker drop it
        this.dispatchRows({ rows: complete, ignoreLastRow: true });
        this.remainingRows = pending.slice(cut + rowDelimiter.length);
    }

    /**
     * Posts whatever is still buffered once the whole file was read, so a last
     * row without a trailing row delimiter is not lost.
     */
    flushRemainingRows() {
        this.processBodyChunk(''); //Complete rows left over from the header chunk
        const leftover = this.remainingRows;
        this.remainingRows = '';
        if (leftover.trim() !== '')
            this.dispatchRows({ rows: leftover });
    }

    /**
     * Posts one batch of rows to the next worker (round-robin).
     * @param {Object} message Payload for the worker ({ rows, ignoreLastRow }).
     */
    dispatchRows(message) {
        //Use the number of workers actually started, which can be lower than options.numWorkers
        const worker = this.dataWorkers[this.indexWorker % this.dataWorkers.length];
        this.indexWorker++;
        worker.postMessage(message);
    }

    /**
     * Verifies if the char is equal to rowDelimiter or Line Feed (\n) or Carriage Return (\r).
     * @param {string} lastChar Character to test.
     * @returns {boolean}
     */
    endsWithImp(lastChar) {
        const code = lastChar.charCodeAt(0);
        return lastChar === this.options.rowDelimiter || code === 10 || code === 13;
    }

    /**
     * Starts the data processing workers and sends each one its options.
     * @param {Object} options The resolved parser options.
     */
    startWorkers(options) {
        const workerOptions = {
            chunkSize: options.chunkSize, //UserDef or 10MB
            delimiter: options.delimiter, //UserDef or comma
            rowDelimiter: options.rowDelimiter, //UserDef or \n
            headerKeys: options.header.headerKeys, //Which headers the worker which treats data should use
            headerIndeces: options.header.headerIndeces
        };
        //If the size of the file is smaller than the chunk only 1 worker is needed
        const numWorkers = options.chunkSize >= options.file.size ? 1 : options.numWorkers;
        this.dataWorkers = new Array(numWorkers);
        for (let i = 0; i < numWorkers; i++) { // Create n workers
            workerOptions.index = i;
            const worker = new Worker('./worker.js');
            worker.onmessage = (e) => { this.receiveWorkersData(e.data); };
            worker.postMessage(workerOptions); //First message carries the options
            this.dataWorkers[i] = worker;
        }
    }

    /**
     * Stores one worker reply and shuts the workers down after the last one.
     * @param {*} data Payload posted by the worker.
     */
    receiveWorkersData(data) {
        this.data.push(data);
        this.replies++;
        this.finishIfComplete();
    }

    /**
     * Terminates the workers once the file was fully read and every posted
     * batch was answered.
     */
    finishIfComplete() {
        if (this.isDone && this.replies >= this.indexWorker)
            this.terminateWorkers();
    }

    /**
     * Shuts the data processing workers down. Safe to call more than once.
     */
    terminateWorkers() {
        if (!this.dataWorkers)
            return;
        for (const w of this.dataWorkers)
            w.terminate();
        delete this.dataWorkers;
    }

    /**
     * Updates the element with id=progress with the percentage already read and,
     * when present, the width of the first element with class "determinate".
     */
    updateProgress() {
        const size = this.options.file.size;
        const progress = size > 0 ? Math.min(100, Math.round((this.startByte / size) * 100)) : 100;
        const label = document.getElementById('progress');
        if (label)
            label.textContent = "File Reading Progress: " + progress + "%";
        //The bar is optional: a page without it must not break the read loop
        const bar = document.getElementsByClassName('determinate')[0];
        if (bar)
            bar.style.width = progress + "%";
    }
}
//Parser created by the most recent call to startParser (null until then)
var parser = null;

/**
 * Creates a CSVParser with this page's fixed settings and stores it in `parser`.
 * No file is passed, so the parser picks the file on its own.
 */
function startParser() {
    const MEGABYTE = 1024 * 1024;
    const columns = ['Block', 'IUCR', 'Longitude', 'Year', 'Latitude'];
    const settings = {
        chunkSize: 20 * MEGABYTE,
        headerKeys: columns,
        numWorkers: 4
    };
    parser = new CSVParser(settings);
}
let numMsgsReceived = 0; //Becomes 1 once the options message was received
let options = null; //Options sent by the main thread in the first message
let headerIndeces = []; //Indeces of the columns to keep from every row

/**
 * Receives messages from the main thread. The first message carries the
 * options; every later one carries a batch of raw rows in `rows`, optionally
 * with `ignoreLastRow` set when the last row of the batch must be dropped.
 *
 * Registered on `self` rather than top-level `this`: `this` is undefined at the
 * top level of a module worker, and `self` matches the `self.postMessage` reply.
 */
self.onmessage = function (e) {
    if (numMsgsReceived === 0) { //Save options
        options = e.data;
        headerIndeces = options.headerIndeces;
        numMsgsReceived++;
        return;
    }
    //Rows are split on \n or \r\n regardless of options.rowDelimiter
    const rows = e.data.rows.split(/\r?\n/);
    const numRows = e.data.ignoreLastRow ? rows.length - 1 : rows.length;
    pushRows(rows, numRows);
};
/**
 * Turns raw CSV lines into arrays holding only the selected columns and posts
 * the whole batch back to the main thread in a single message.
 *
 * Reads the module-level `options.delimiter` and `headerIndeces`.
 * Cells that look numeric are converted to numbers; empty or blank cells stay
 * strings (unary plus would turn them into 0), and a missing cell stays
 * undefined. Blank lines, such as the one after a trailing newline, are skipped.
 *
 * @param {string[]} rows Raw lines of the batch.
 * @param {number} numRows How many leading entries of `rows` to process.
 */
function pushRows(rows, numRows) {
    const parsedRows = [];
    for (let i = 0; i < numRows; i++) {
        const line = rows[i];
        if (!line) //Blank (or missing) line carries no data
            continue;
        const values = line.split(options.delimiter);
        const row = [];
        for (const idx of headerIndeces) {
            const raw = values[idx];
            const asNumber = Number(raw);
            const isNumeric = raw !== undefined && raw.trim() !== '' && !Number.isNaN(asNumber);
            row.push(isNumeric ? asNumber : raw);
        }
        parsedRows.push(row);
    }
    self.postMessage(parsedRows);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment