Last active
May 22, 2017 20:59
-
-
Save iursevla/ed1c4c6a0a5b51733277e47e9adc5b8c to your computer and use it in GitHub Desktop.
CSV Parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta http-equiv="X-UA-Compatible" content="ie=edge">
    <title>CSV Parser</title>
    <!-- Defines CSVParser and the global startParser() used by the button below. -->
    <script src="CSVParser.js"></script>
</head>
<body>
    <!-- CSVParser falls back to the first file of the element with id="file"
         when no file is passed in its options. -->
    <input type="file" id="file" name="files" accept=".csv">
    <button type="button" id="submit" onclick="startParser();">Ok</button>
    <br>
    <label for="file">Choose CSV file to Parse</label>
    <!-- Textual progress ("File Reading Progress: N%") is written here. -->
    <div id="progress">
    </div>
    <!-- Progress bar: the parser sets the width of the first ".determinate"
         element, so one must exist on the page. Inline styles keep it visible
         without an external stylesheet. -->
    <div class="progress" style="width: 100%; height: 4px; background: #e0e0e0;">
        <div class="determinate" style="width: 0; height: 100%; background: #26a69a;"></div>
    </div>
</body>
</html>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class CSVParser { | |
constructor(userOptions = {}) { | |
this.dataWorkers = null; //Array of Workers | |
this.indexWorker = 0; //Id of the worker to send the next chunk | |
this.replies = 0; //How many workers replied back with their results | |
this.isDone = false; //Whether or not the Parser is done reading the file | |
this.startByte = 0; //Starting point of the read request | |
this.numRows = 0; //Number of rows that were read from the file | |
this.data = []; | |
this.options = this.loadOptions(userOptions); | |
this.reader = this.createFileReader(); | |
this.numChunks = Math.round(this.options.file.size / this.options.chunkSize); | |
if (this.options.header.fileHasHeader && !this.options.header.headerIndeces) | |
this.readNextChunk(this.options.header.headerBytesSize); | |
else | |
this.readNextChunk(this.options.chunkSize); | |
} | |
//Should load all options and act accordingly | |
loadOptions(userOptions = {}) { | |
//Error vars | |
this.FILEERROR = "ES6CSVParser => CSV file to parse is undefined."; | |
this.FIRSTROWERROR = "Didn't read enough for the first row. Change header size to a higher value."; | |
this.HEADEROPTIONSERROR = "No header indeces provided."; | |
this.NOKEYS = "Not all given keys are present in the header"; | |
let opts = { //Load Options | |
chunkSize: userOptions.chunkSize || 1024 * 1024 * 10, //UserDef or 10MB | |
delimiter: userOptions.delimiter || ',', //UserDef or comma | |
rowDelimiter: userOptions.rowDelimiter || '\n', //UserDef or \n http://stackoverflow.com/a/1552782/ | |
file: userOptions.file || document.getElementById("file").files[0], //UserDef or <input id="file"..> | |
numWorkers: userOptions.numWorkers || 2, //Worker threads | |
header: { | |
fileHasHeader: userOptions.fileHasHeader !== undefined ? userOptions.fileHasHeader : true, //Should read header to look for headerKeys | |
headerBytesSize: userOptions.headerBytesSize || 1024, //Minimize the header read (only used if readHeader true) | |
headerKeys: userOptions.headerKeys, //Which headers the worker which treats data should use | |
headerIndeces: userOptions.headerIndeces //Header indeces to use | |
} | |
} | |
if (opts.file === undefined) | |
throw Error(this.FILEERROR); | |
if (!opts.header.headerKeys || opts.header.headerKeys.length === 0) //Should always have keys | |
throw new Error(this.HEADEROPTIONSERROR) | |
return opts; | |
} | |
//Create file reader and event functions | |
createFileReader() { | |
let reader = new FileReader(); | |
reader.onload = (e) => { this.chunkLoaded(e); } | |
reader.onerror = (e) => { console.error(e); } | |
return reader; | |
} | |
//Reads next chunk from the file (header or body) | |
readNextChunk(chunkSize) { | |
let file = this.options.file; | |
let start = Math.min(this.startByte, file.size); | |
let end = Math.min(this.startByte + chunkSize, file.size - 1); | |
this.reader.readAsText(file.slice(start, end)); | |
} | |
//When a chunk is loaded from the CSV file | |
chunkLoaded(e) { | |
let opts = this.options; | |
if (this.startByte === 0 && this.options.header.fileHasHeader) { | |
this.startByte += opts.header.headerBytesSize; | |
this.processHeader(e.target.result); | |
} | |
else { | |
this.startByte += opts.chunkSize; | |
this.processBodyChunk(e.target.result); | |
} | |
this.updateProgress(); | |
if (this.startByte >= this.options.file.size) | |
this.isDone = true; | |
else | |
this.readNextChunk(this.options.chunkSize); | |
} | |
//Process Header | |
processHeader(res) { | |
let opts = this.options; | |
let rows = res.split(opts.rowDelimiter); | |
if (rows.length < 2) //Couldnt reach any row delimiter so it didnt read one full row | |
throw new Error(this.FIRSTROWERROR); | |
else { //find the keys we want and then start workers | |
let keysRead = rows[0].split(opts.delimiter); | |
let lastKey = keysRead[keysRead.length - 1]; | |
if (this.endsWithImp(lastKey.charAt(lastKey.length - 1))) | |
keysRead[keysRead.length - 1] = lastKey.slice(0, lastKey.length - 1); | |
let indices = []; | |
for (const k of opts.header.headerKeys) { | |
let pos = keysRead.indexOf(k); | |
if (pos >= 0) | |
indices.push(pos); | |
} | |
if (indices.length < opts.header.headerKeys.length) //Didnt found all given keys | |
throw new Error(this.NOKEYS); | |
opts.header.headerIndeces = indices; | |
this.startWorkers(opts); //Send info to read workers | |
let temp = ''; | |
for (let i = 1; i < rows.length; i++) | |
i === rows.length - 1 ? temp += rows[i] : temp += rows[i] + opts.rowDelimiter; | |
this.remainingRows = temp; | |
} | |
} | |
//Other chunks that aren't an header | |
processBodyChunk(rows) { | |
let numWorkers = this.options.numWorkers; | |
let rowDelimiter = this.options.rowDelimiter; | |
let lastChar = rows.charAt(rows.length - 1) | |
if (this.endsWithImp(lastChar)) { | |
rows = this.remainingRows + rows; | |
this.dataWorkers[this.indexWorker++ % numWorkers].postMessage({ rows }); | |
this.remainingRows = ''; //Reset remaining row(s) | |
} | |
else { | |
rows = this.remainingRows + rows; | |
this.dataWorkers[this.indexWorker++ % numWorkers].postMessage({ rows, ignoreLastRow: true }); //Ignore last row | |
let index = rows.length - 1; | |
let char = rows[index]; | |
let lastLine = ""; | |
while (char !== rowDelimiter) { | |
lastLine = char + lastLine; | |
char = rows[--index]; | |
} | |
this.remainingRows = lastLine; | |
} | |
} | |
//Verifies if the char is equal to rowDelimiter or Line Feed (\n) or Carriage Return (\r) | |
endsWithImp(lastChar) { | |
return lastChar === this.options.rowDelimiter || lastChar.charCodeAt() === 10 || lastChar.charCodeAt() === 13; | |
} | |
//Start data processing workers | |
startWorkers(options) { | |
let workerOptions = { | |
chunkSize: options.chunkSize, //UserDef or 10MB ? | |
delimiter: options.delimiter, //UserDef or comma | |
rowDelimiter: options.rowDelimiter, //UserDef or \n | |
headerKeys: options.header.headerKeys, //Which headers the worker which treats data should use | |
headerIndeces: options.header.headerIndeces | |
} | |
let numWorkers = options.numWorkers; | |
if (options.chunkSize >= options.file.size) //If the size of the file is smaller than the chunk only 1 worker is needed | |
numWorkers = 1; | |
this.dataWorkers = new Array(numWorkers); | |
for (let i = 0; i < numWorkers; i++) { // Create n workers | |
workerOptions.index = i; | |
this.dataWorkers[i] = new Worker('./worker.js'); | |
this.dataWorkers[i].postMessage(workerOptions); | |
this.dataWorkers[i].onmessage = (e) => { this.receiveWorkersData(e.data); }; | |
} | |
} | |
//Receive Workers treated data | |
receiveWorkersData(data) { | |
this.data.push(data); | |
if (++this.replies === this.numChunks && this.isDone) | |
this.terminateWorkers(); | |
} | |
//Shutdown data processing workers | |
terminateWorkers() { | |
for (let w of this.dataWorkers) | |
w.terminate(); | |
delete this.dataWorkers; | |
} | |
//Updates DOM element with id=progress with the percentage already read from the file | |
updateProgress() { | |
let progress = (this.startByte / this.options.file.size) * 100; | |
progress = Math.round(progress) > 100 ? 100 : Math.round(progress); | |
document.getElementById('progress').innerHTML = "File Reading Progress: " + progress + "%"; | |
document.getElementsByClassName('determinate')[0].style.width = progress + "%"; | |
} | |
} | |
//Parser of the most recent run (null when none was started or the last start failed)
var parser = null;

/**
 * Click handler of the "Ok" button: starts parsing the selected CSV file.
 *
 * A previous run that still owns workers is shut down first so repeated clicks
 * do not leave worker threads behind. If the parser cannot be created (e.g. no
 * file selected) the error is logged and, when an element with id="progress"
 * exists, its message is shown there.
 */
function startParser() {
    if (parser !== null && parser.dataWorkers && typeof parser.terminateWorkers === 'function')
        parser.terminateWorkers();
    parser = null;
    try {
        parser = new CSVParser({
            chunkSize: 20 * 1024 * 1024,
            headerKeys: ['Block', 'IUCR', 'Longitude', 'Year', 'Latitude'],
            numWorkers: 4
        });
    } catch (err) {
        console.error(err);
        const status = typeof document !== 'undefined' ? document.getElementById('progress') : null;
        if (status)
            status.textContent = err.message;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
let numMsgsReceived = 0; //Number of messages received
let options = null; //Options sent by the main thread in the first message
let headerIndeces = []; //Indeces of the columns to keep

/**
 * Splits a batch of text into rows.
 * For the default '\n' delimiter an optional preceding '\r' is consumed too,
 * so CRLF files work; any other delimiter is used literally.
 */
function splitRows(text, rowDelimiter) {
    if (rowDelimiter === undefined || rowDelimiter === null || rowDelimiter === '\n')
        return text.split(/\r?\n/);
    return text.split(rowDelimiter);
}

/**
 * Message handler of the worker.
 * The first message carries the options ({delimiter, rowDelimiter, headerIndeces, ...});
 * every later message carries {rows, ignoreLastRow} with a batch of text to parse.
 */
self.onmessage = function (e) {
    if (numMsgsReceived === 0) { //Save options
        options = e.data;
        headerIndeces = options.headerIndeces;
        numMsgsReceived++;
        return;
    }
    numMsgsReceived++;
    if (!e.data || typeof e.data.rows !== 'string') //Nothing to parse in this message
        return;
    const rows = splitRows(e.data.rows, options.rowDelimiter);
    //With ignoreLastRow the last row is incomplete and is left out
    const numRows = e.data.ignoreLastRow ? rows.length - 1 : rows.length;
    pushRows(rows, numRows);
};
/**
 * Extracts the configured columns from the first `numRows` rows and posts the
 * result back to the main thread as an array of arrays (one per row).
 *
 * Reads the worker-level `options.delimiter` and `headerIndeces`.
 * Cells that hold a number are converted to Number; empty or whitespace-only
 * cells and non-numeric text are kept as they are (a missing value is not
 * turned into 0). Empty rows, e.g. produced by a trailing row delimiter, are skipped.
 *
 * @param {string[]} rows Rows of the batch.
 * @param {number} numRows How many rows (from the start) to process.
 */
function pushRows(rows, numRows) {
    const mapa = [];
    for (let i = 0; i < numRows; i++) {
        const line = rows[i];
        if (line === undefined || line === '') //Nothing to extract from a blank row
            continue;
        const values = line.split(options.delimiter);
        const row = [];
        for (const idx of headerIndeces) {
            const cell = values[idx];
            const asNumber = +cell; //Fast convert str to number
            const isNumeric = typeof cell === 'string' && cell.trim() !== '' && !Number.isNaN(asNumber);
            row.push(isNumeric ? asNumber : cell);
        }
        mapa.push(row);
    }
    self.postMessage(mapa);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment