iursevla · May 22, 2017 20:59
diff --git a/CSVParser.html b/CSVParser.html
 <!DOCTYPE html>
 <html lang="en">

 <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta http-equiv="X-UA-Compatible" content="ie=edge">
    <title>CSV Parser</title>
    <style>
        /* Minimal styling so the progress bar is visible without an external stylesheet. */
        .progress-track { width: 100%; height: 4px; background: #ddd; }
        .determinate { width: 0; height: 100%; background: #26a69a; }
    </style>
    <script src="CSVParser.js"></script>
 </head>

 <body>
    <input type="file" id="file" name="files" accept=".csv">
    <!-- type="button" keeps the click from submitting if this markup is ever placed inside a form. -->
    <button id="submit" type="button" onclick="startParser();">Ok</button>
    <br>
    <label for="file">Choose CSV file to Parse</label>
    <!-- Text progress: the script in CSVParser.js overwrites the content of this element. -->
    <div id="progress">
    </div>
    <!--
        Bar progress: the script in CSVParser.js sets the width of the first element
        with class "determinate". It lives outside #progress because the content of
        #progress is replaced on every update.
    -->
    <div class="progress-track">
        <div class="determinate"></div>
    </div>
 </body>

 </html>
diff --git a/CSVParser.js b/CSVParser.js
 /**
  * Reads a CSV file in fixed-size chunks with a FileReader and hands the
  * complete rows of every chunk to a pool of Web Workers (./worker.js).
  * Worker replies are collected in `this.data`.
  *
  * NOTE(review): every chunk is decoded on its own, so a multi-byte character
  * that straddles a chunk boundary is presumably decoded incorrectly — confirm
  * before using this on non-ASCII files.
  */
 class CSVParser {
    /**
     * Loads the options and immediately starts reading the file.
     * @param {Object} [userOptions] See loadOptions for the accepted keys.
     * @throws {Error} When no file or no usable header information is available.
     */
    constructor(userOptions = {}) {
        this.dataWorkers = null; //Array of Workers
        this.indexWorker = 0; //Number of row batches posted so far; modulo the pool size it selects the next worker
        this.replies = 0; //How many batches the workers replied back with
        this.isDone = false; //Whether or not the Parser is done reading the file
        this.startByte = 0; //Starting point of the next read request
        this.numRows = 0; //Kept for compatibility; this class never updates it
        this.data = []; //Worker results, in the order the replies arrive
        this.remainingRows = ''; //Text already read but not yet posted to a worker
        this.options = this.loadOptions(userOptions);
        this.reader = this.createFileReader();
        //Informational only: termination is decided by comparing replies with posted batches.
        this.numChunks = Math.ceil(this.options.file.size / this.options.chunkSize);

        if (this.options.header.fileHasHeader) {
            //chunkLoaded treats the first chunk as the header whenever fileHasHeader is set,
            //so the first read must always be the (small) header read in that case.
            this.readNextChunk(this.options.header.headerBytesSize);
        } else {
            //No header to wait for: the indices come from the options, workers can start now.
            this.startWorkers(this.options);
            this.readNextChunk(this.options.chunkSize);
        }
    }

    /**
     * Builds the effective options from the user options and the defaults.
     * Also defines the error message constants on the instance.
     * @param {Object} [userOptions] chunkSize, delimiter, rowDelimiter, file, numWorkers,
     *     fileHasHeader, headerBytesSize, headerKeys, headerIndeces.
     * @returns {Object} The resolved options.
     * @throws {Error} FILEERROR when no file is available, HEADEROPTIONSERROR when neither
     *     header keys nor header indices are usable.
     */
    loadOptions(userOptions = {}) {
        //Error vars
        this.FILEERROR = "ES6CSVParser => CSV file to parse is undefined.";
        this.FIRSTROWERROR = "Didn't read enough for the first row. Change header size to a higher value.";
        this.HEADEROPTIONSERROR = "No header indeces provided.";
        this.NOKEYS = "Not all given keys are present in the header";

        //Only touch the DOM when the caller did not pass a file.
        const fileInput = userOptions.file ? null : document.getElementById("file");
        const fallbackFile = fileInput && fileInput.files ? fileInput.files[0] : undefined;

        const opts = { //Load Options
            chunkSize: userOptions.chunkSize || 1024 * 1024 * 10, //UserDef or 10MB
            delimiter: userOptions.delimiter || ',', //UserDef or comma
            rowDelimiter: userOptions.rowDelimiter || '\n', //UserDef or \n  http://stackoverflow.com/a/1552782/
            file: userOptions.file || fallbackFile, //UserDef or <input id="file"..>
            numWorkers: userOptions.numWorkers || 2, //Worker threads
            header: {
                fileHasHeader: userOptions.fileHasHeader !== undefined ? userOptions.fileHasHeader : true, //Should read header to look for headerKeys
                headerBytesSize: userOptions.headerBytesSize || 1024, //Size of the header read (only used if fileHasHeader is true)
                headerKeys: userOptions.headerKeys, //Which headers the worker which treats data should use
                headerIndeces: userOptions.headerIndeces //Header indeces to use
            }
        };

        if (!opts.file)
            throw new Error(this.FILEERROR);

        const hasKeys = !!opts.header.headerKeys && opts.header.headerKeys.length > 0;
        const hasIndeces = !!opts.header.headerIndeces && opts.header.headerIndeces.length > 0;
        //Keys can only be resolved through a header row; without one the indices are mandatory.
        if (!hasIndeces && (!hasKeys || !opts.header.fileHasHeader))
            throw new Error(this.HEADEROPTIONSERROR);
        return opts;
    }

    /**
     * Creates the FileReader and wires its load/error callbacks.
     * @returns {FileReader}
     */
    createFileReader() {
        const reader = new FileReader();
        reader.onload = (e) => { this.chunkLoaded(e); };
        reader.onerror = (e) => { console.error(e); };
        return reader;
    }

    /**
     * Requests the next slice of the file, starting at this.startByte.
     * @param {number} chunkSize Number of bytes to request.
     */
    readNextChunk(chunkSize) {
        const file = this.options.file;
        const start = Math.min(this.startByte, file.size);
        //The end of a slice is exclusive, so clamping to file.size still includes the last byte.
        const end = Math.min(this.startByte + chunkSize, file.size);
        this.reader.readAsText(file.slice(start, end));
    }

    /**
     * FileReader load callback: routes the text to the header or body handler,
     * updates the progress display and either requests the next chunk or finishes.
     * @param {Object} e Load event; the text is taken from e.target.result.
     */
    chunkLoaded(e) {
        const opts = this.options;
        if (this.startByte === 0 && opts.header.fileHasHeader) {
            this.startByte += opts.header.headerBytesSize;
            this.processHeader(e.target.result);
        }
        else {
            this.startByte += opts.chunkSize;
            this.processBodyChunk(e.target.result);
        }

        this.updateProgress();

        if (this.startByte >= opts.file.size)
            this.finishReading();
        else
            this.readNextChunk(opts.chunkSize);
    }

    /**
     * Handles the header chunk: resolves the column indices of the requested keys
     * (unless indices were supplied), starts the workers and keeps everything after
     * the first row for the next body chunk.
     * @param {string} res Text of the header chunk.
     * @throws {Error} FIRSTROWERROR when the chunk holds no complete first row and the
     *     file continues beyond it, NOKEYS when a requested key is not in the header.
     */
    processHeader(res) {
        const opts = this.options;
        const rows = res.split(opts.rowDelimiter);
        //A missing delimiter is only an error when more of the file is still unread.
        if (rows.length < 2 && this.startByte < opts.file.size)
            throw new Error(this.FIRSTROWERROR);

        if (!opts.header.headerIndeces || opts.header.headerIndeces.length === 0) {
            const keysRead = rows[0].split(opts.delimiter);
            const last = keysRead.length - 1;
            const lastKey = keysRead[last];
            //Drop a trailing line-break character (e.g. the \r of \r\n) from the last key.
            if (this.endsWithImp(lastKey.charAt(lastKey.length - 1)))
                keysRead[last] = lastKey.slice(0, lastKey.length - 1);

            const indices = [];
            for (const k of opts.header.headerKeys) {
                const pos = keysRead.indexOf(k);
                if (pos >= 0)
                    indices.push(pos);
            }

            if (indices.length < opts.header.headerKeys.length) //Didnt find all given keys
                throw new Error(this.NOKEYS);
            opts.header.headerIndeces = indices;
        }

        this.startWorkers(opts); //Send info to read workers
        this.remainingRows = rows.slice(1).join(opts.rowDelimiter);
    }

    /**
     * Handles a body chunk: everything up to the last row delimiter is posted to a
     * worker, the trailing partial row is kept for the next chunk.
     * @param {string} rows Text of the chunk.
     */
    processBodyChunk(rows) {
        const rowDelimiter = this.options.rowDelimiter;
        const text = this.remainingRows + rows;
        const cut = text.lastIndexOf(rowDelimiter);
        if (cut === -1) { //No complete row yet, keep accumulating
            this.remainingRows = text;
            return;
        }
        this.remainingRows = text.slice(cut + rowDelimiter.length);
        this.dispatchRows(text.slice(0, cut));
    }

    /**
     * Posts a batch of complete rows to the next worker (round robin).
     * Trailing line-break characters are removed first; an empty batch is not posted.
     * @param {string} rows Text made of complete rows.
     */
    dispatchRows(rows) {
        let end = rows.length;
        while (end > 0 && this.endsWithImp(rows.charAt(end - 1)))
            end--;
        if (end === 0)
            return;
        //Use the real pool size: startWorkers may have created fewer workers than requested.
        const worker = this.dataWorkers[this.indexWorker++ % this.dataWorkers.length];
        worker.postMessage({ rows: rows.slice(0, end) });
    }

    /**
     * Called once the whole file has been read: posts the rows that are still
     * buffered and shuts the workers down if every batch has already been answered.
     */
    finishReading() {
        this.dispatchRows(this.remainingRows);
        this.remainingRows = '';
        this.isDone = true;
        if (this.replies === this.indexWorker)
            this.terminateWorkers();
    }

    //Verifies if the char is equal to rowDelimiter or Line Feed (\n) or Carriage Return (\r)
    endsWithImp(lastChar) {
        const code = lastChar.charCodeAt(0);
        return lastChar === this.options.rowDelimiter || code === 10 || code === 13;
    }

    /**
     * Creates the worker pool and sends every worker its options as first message.
     * Only one worker is created when the whole file fits in a single chunk.
     * @param {Object} options Resolved parser options.
     */
    startWorkers(options) {
        const workerOptions = {
            chunkSize: options.chunkSize, //UserDef or 10MB
            delimiter: options.delimiter, //UserDef or comma
            rowDelimiter: options.rowDelimiter, //UserDef or \n
            headerKeys: options.header.headerKeys, //Which headers the worker which treats data should use
            headerIndeces: options.header.headerIndeces
        };

        let numWorkers = options.numWorkers;
        if (options.chunkSize >= options.file.size) //If the size of the file is smaller than the chunk only 1 worker is needed
            numWorkers = 1;

        this.dataWorkers = new Array(numWorkers);
        for (let i = 0; i < numWorkers; i++) { // Create n workers
            workerOptions.index = i;
            const worker = new Worker('./worker.js');
            worker.onmessage = (e) => { this.receiveWorkersData(e.data); };
            worker.postMessage(workerOptions);
            this.dataWorkers[i] = worker;
        }
    }

    /**
     * Stores one worker reply and terminates the pool once reading is done and
     * every posted batch has been answered.
     * @param {*} data Payload of the worker message.
     */
    receiveWorkersData(data) {
        this.data.push(data);
        this.replies++;
        if (this.isDone && this.replies === this.indexWorker)
            this.terminateWorkers();
    }

    //Shutdown data processing workers (safe to call more than once)
    terminateWorkers() {
        if (!this.dataWorkers)
            return;
        for (const w of this.dataWorkers)
            w.terminate();
        this.dataWorkers = null;
    }

    /**
     * Shows the percentage already read from the file in the element with
     * id=progress and, when present, as width of the first element with class
     * "determinate". Missing elements are skipped instead of throwing.
     */
    updateProgress() {
        const size = this.options.file.size;
        const progress = size > 0 ? Math.min(100, Math.round((this.startByte / size) * 100)) : 100;
        const label = document.getElementById('progress');
        if (label)
            label.textContent = "File Reading Progress: " + progress + "%";
        const bar = document.getElementsByClassName('determinate')[0];
        if (bar && bar.style)
            bar.style.width = progress + "%";
    }
 }

 //Most recently created parser instance; stays null until startParser() runs.
 var parser = null;

 /**
  * Creates a CSVParser with this page's fixed settings and stores it in `parser`.
  */
 function startParser() {
    const headerKeys = ['Block', 'IUCR', 'Longitude', 'Year', 'Latitude'];
    const chunkSize = 20 * 1024 * 1024; //20MB per read
    const numWorkers = 4;
    parser = new CSVParser({ chunkSize, headerKeys, numWorkers });
 }
diff --git a/worker.js b/worker.js
 //Worker state, filled in by the first message from the main thread.
 let headerIndeces = []; //Column positions to keep from every row
 let options = null; //Options object carried by the first message
 let numMsgsReceived = 0; //Stays 0 until the options message has been handled

 /**
  * Handles messages from the main thread.
  * The first message carries the options (delimiter, rowDelimiter, headerIndeces, ...).
  * Every later message carries `rows` (text to split into rows) and optionally
  * `ignoreLastRow` (the last row is incomplete and must be skipped).
  *
  * Registered through `self` rather than top-level `this`, which is undefined
  * when the script is loaded as a module.
  */
 self.onmessage = function (e) {
    if (numMsgsReceived === 0) { //Save options
        options = e.data;
        headerIndeces = options.headerIndeces;
        numMsgsReceived++;
        return;
    }

    //Honour a custom row delimiter; for the default "\n" also accept "\r\n" line endings.
    const rowDelimiter = options.rowDelimiter;
    const rows = (!rowDelimiter || rowDelimiter === '\n')
        ? e.data.rows.split(/\r?\n/)
        : e.data.rows.split(rowDelimiter);

    //Skip the last element when it is a partial row, or when it is the empty
    //string produced by text that ends on a row delimiter.
    let numRows = rows.length;
    if (e.data.ignoreLastRow || rows[numRows - 1] === '')
        numRows--;
    pushRows(rows, numRows);
 };

 /**
  * Extracts the wanted columns from the first `numRows` rows and posts the
  * result to the main thread as an array of rows (one array of values per row).
  * Values that are numeric strings are converted to numbers; empty or
  * whitespace-only fields are kept as strings instead of being coerced to 0.
  * Blank lines are skipped.
  * @param {string[]} rows Row strings, not yet split into fields.
  * @param {number} numRows How many rows, counted from the start, to process.
  */
 function pushRows(rows, numRows) {
    const mapa = [];
    const limit = Math.min(numRows, rows.length);
    for (let i = 0; i < limit; i++) {
        const line = rows[i];
        if (line === '') //Blank line, nothing to extract
            continue;
        const values = line.split(options.delimiter);
        const row = [];
        for (const idx of headerIndeces) {
            const raw = values[idx];
            const num = +raw; //Fast convert str to number
            //num === num is false only for NaN; the trim check keeps "" from becoming 0.
            if (raw !== undefined && raw.trim() !== '' && num === num)
                row.push(num);
            else
                row.push(raw);
        }
        mapa.push(row);
    }
    self.postMessage(mapa);
 }
	<!DOCTYPE html>
	<html lang="en">

	<head>
	    <meta charset="UTF-8">
	    <meta name="viewport" content="width=device-width, initial-scale=1.0">
	    <meta http-equiv="X-UA-Compatible" content="ie=edge">
	    <title>CSV Parser</title>
	    <style>
	        /* Minimal styling so the progress bar is visible without an external stylesheet. */
	        .progress-track { width: 100%; height: 4px; background: #ddd; }
	        .determinate { width: 0; height: 100%; background: #26a69a; }
	    </style>
	    <script src="CSVParser.js"></script>
	</head>

	<body>
	    <input type="file" id="file" name="files" accept=".csv">
	    <!-- type="button" keeps the click from submitting if this markup is ever placed inside a form. -->
	    <button id="submit" type="button" onclick="startParser();">Ok</button>
	    <br>
	    <label for="file">Choose CSV file to Parse</label>
	    <!-- Text progress: the script in CSVParser.js overwrites the content of this element. -->
	    <div id="progress">
	    </div>
	    <!--
	        Bar progress: the script in CSVParser.js sets the width of the first element
	        with class "determinate". It lives outside #progress because the content of
	        #progress is replaced on every update.
	    -->
	    <div class="progress-track">
	        <div class="determinate"></div>
	    </div>
	</body>

	</html>
	/**
	 * Reads a CSV file in fixed-size chunks with a FileReader and hands the
	 * complete rows of every chunk to a pool of Web Workers (./worker.js).
	 * Worker replies are collected in `this.data`.
	 *
	 * NOTE(review): every chunk is decoded on its own, so a multi-byte character
	 * that straddles a chunk boundary is presumably decoded incorrectly — confirm
	 * before using this on non-ASCII files.
	 */
	class CSVParser {
	    /**
	     * Loads the options and immediately starts reading the file.
	     * @param {Object} [userOptions] See loadOptions for the accepted keys.
	     * @throws {Error} When no file or no usable header information is available.
	     */
	    constructor(userOptions = {}) {
	        this.dataWorkers = null; //Array of Workers
	        this.indexWorker = 0; //Number of row batches posted so far; modulo the pool size it selects the next worker
	        this.replies = 0; //How many batches the workers replied back with
	        this.isDone = false; //Whether or not the Parser is done reading the file
	        this.startByte = 0; //Starting point of the next read request
	        this.numRows = 0; //Kept for compatibility; this class never updates it
	        this.data = []; //Worker results, in the order the replies arrive
	        this.remainingRows = ''; //Text already read but not yet posted to a worker
	        this.options = this.loadOptions(userOptions);
	        this.reader = this.createFileReader();
	        //Informational only: termination is decided by comparing replies with posted batches.
	        this.numChunks = Math.ceil(this.options.file.size / this.options.chunkSize);

	        if (this.options.header.fileHasHeader) {
	            //chunkLoaded treats the first chunk as the header whenever fileHasHeader is set,
	            //so the first read must always be the (small) header read in that case.
	            this.readNextChunk(this.options.header.headerBytesSize);
	        } else {
	            //No header to wait for: the indices come from the options, workers can start now.
	            this.startWorkers(this.options);
	            this.readNextChunk(this.options.chunkSize);
	        }
	    }

	    /**
	     * Builds the effective options from the user options and the defaults.
	     * Also defines the error message constants on the instance.
	     * @param {Object} [userOptions] chunkSize, delimiter, rowDelimiter, file, numWorkers,
	     *     fileHasHeader, headerBytesSize, headerKeys, headerIndeces.
	     * @returns {Object} The resolved options.
	     * @throws {Error} FILEERROR when no file is available, HEADEROPTIONSERROR when neither
	     *     header keys nor header indices are usable.
	     */
	    loadOptions(userOptions = {}) {
	        //Error vars
	        this.FILEERROR = "ES6CSVParser => CSV file to parse is undefined.";
	        this.FIRSTROWERROR = "Didn't read enough for the first row. Change header size to a higher value.";
	        this.HEADEROPTIONSERROR = "No header indeces provided.";
	        this.NOKEYS = "Not all given keys are present in the header";

	        //Only touch the DOM when the caller did not pass a file.
	        const fileInput = userOptions.file ? null : document.getElementById("file");
	        const fallbackFile = fileInput && fileInput.files ? fileInput.files[0] : undefined;

	        const opts = { //Load Options
	            chunkSize: userOptions.chunkSize || 1024 * 1024 * 10, //UserDef or 10MB
	            delimiter: userOptions.delimiter || ',', //UserDef or comma
	            rowDelimiter: userOptions.rowDelimiter || '\n', //UserDef or \n http://stackoverflow.com/a/1552782/
	            file: userOptions.file || fallbackFile, //UserDef or <input id="file"..>
	            numWorkers: userOptions.numWorkers || 2, //Worker threads
	            header: {
	                fileHasHeader: userOptions.fileHasHeader !== undefined ? userOptions.fileHasHeader : true, //Should read header to look for headerKeys
	                headerBytesSize: userOptions.headerBytesSize || 1024, //Size of the header read (only used if fileHasHeader is true)
	                headerKeys: userOptions.headerKeys, //Which headers the worker which treats data should use
	                headerIndeces: userOptions.headerIndeces //Header indeces to use
	            }
	        };

	        if (!opts.file)
	            throw new Error(this.FILEERROR);

	        const hasKeys = !!opts.header.headerKeys && opts.header.headerKeys.length > 0;
	        const hasIndeces = !!opts.header.headerIndeces && opts.header.headerIndeces.length > 0;
	        //Keys can only be resolved through a header row; without one the indices are mandatory.
	        if (!hasIndeces && (!hasKeys || !opts.header.fileHasHeader))
	            throw new Error(this.HEADEROPTIONSERROR);
	        return opts;
	    }

	    /**
	     * Creates the FileReader and wires its load/error callbacks.
	     * @returns {FileReader}
	     */
	    createFileReader() {
	        const reader = new FileReader();
	        reader.onload = (e) => { this.chunkLoaded(e); };
	        reader.onerror = (e) => { console.error(e); };
	        return reader;
	    }

	    /**
	     * Requests the next slice of the file, starting at this.startByte.
	     * @param {number} chunkSize Number of bytes to request.
	     */
	    readNextChunk(chunkSize) {
	        const file = this.options.file;
	        const start = Math.min(this.startByte, file.size);
	        //The end of a slice is exclusive, so clamping to file.size still includes the last byte.
	        const end = Math.min(this.startByte + chunkSize, file.size);
	        this.reader.readAsText(file.slice(start, end));
	    }

	    /**
	     * FileReader load callback: routes the text to the header or body handler,
	     * updates the progress display and either requests the next chunk or finishes.
	     * @param {Object} e Load event; the text is taken from e.target.result.
	     */
	    chunkLoaded(e) {
	        const opts = this.options;
	        if (this.startByte === 0 && opts.header.fileHasHeader) {
	            this.startByte += opts.header.headerBytesSize;
	            this.processHeader(e.target.result);
	        }
	        else {
	            this.startByte += opts.chunkSize;
	            this.processBodyChunk(e.target.result);
	        }

	        this.updateProgress();

	        if (this.startByte >= opts.file.size)
	            this.finishReading();
	        else
	            this.readNextChunk(opts.chunkSize);
	    }

	    /**
	     * Handles the header chunk: resolves the column indices of the requested keys
	     * (unless indices were supplied), starts the workers and keeps everything after
	     * the first row for the next body chunk.
	     * @param {string} res Text of the header chunk.
	     * @throws {Error} FIRSTROWERROR when the chunk holds no complete first row and the
	     *     file continues beyond it, NOKEYS when a requested key is not in the header.
	     */
	    processHeader(res) {
	        const opts = this.options;
	        const rows = res.split(opts.rowDelimiter);
	        //A missing delimiter is only an error when more of the file is still unread.
	        if (rows.length < 2 && this.startByte < opts.file.size)
	            throw new Error(this.FIRSTROWERROR);

	        if (!opts.header.headerIndeces || opts.header.headerIndeces.length === 0) {
	            const keysRead = rows[0].split(opts.delimiter);
	            const last = keysRead.length - 1;
	            const lastKey = keysRead[last];
	            //Drop a trailing line-break character (e.g. the \r of \r\n) from the last key.
	            if (this.endsWithImp(lastKey.charAt(lastKey.length - 1)))
	                keysRead[last] = lastKey.slice(0, lastKey.length - 1);

	            const indices = [];
	            for (const k of opts.header.headerKeys) {
	                const pos = keysRead.indexOf(k);
	                if (pos >= 0)
	                    indices.push(pos);
	            }

	            if (indices.length < opts.header.headerKeys.length) //Didnt find all given keys
	                throw new Error(this.NOKEYS);
	            opts.header.headerIndeces = indices;
	        }

	        this.startWorkers(opts); //Send info to read workers
	        this.remainingRows = rows.slice(1).join(opts.rowDelimiter);
	    }

	    /**
	     * Handles a body chunk: everything up to the last row delimiter is posted to a
	     * worker, the trailing partial row is kept for the next chunk.
	     * @param {string} rows Text of the chunk.
	     */
	    processBodyChunk(rows) {
	        const rowDelimiter = this.options.rowDelimiter;
	        const text = this.remainingRows + rows;
	        const cut = text.lastIndexOf(rowDelimiter);
	        if (cut === -1) { //No complete row yet, keep accumulating
	            this.remainingRows = text;
	            return;
	        }
	        this.remainingRows = text.slice(cut + rowDelimiter.length);
	        this.dispatchRows(text.slice(0, cut));
	    }

	    /**
	     * Posts a batch of complete rows to the next worker (round robin).
	     * Trailing line-break characters are removed first; an empty batch is not posted.
	     * @param {string} rows Text made of complete rows.
	     */
	    dispatchRows(rows) {
	        let end = rows.length;
	        while (end > 0 && this.endsWithImp(rows.charAt(end - 1)))
	            end--;
	        if (end === 0)
	            return;
	        //Use the real pool size: startWorkers may have created fewer workers than requested.
	        const worker = this.dataWorkers[this.indexWorker++ % this.dataWorkers.length];
	        worker.postMessage({ rows: rows.slice(0, end) });
	    }

	    /**
	     * Called once the whole file has been read: posts the rows that are still
	     * buffered and shuts the workers down if every batch has already been answered.
	     */
	    finishReading() {
	        this.dispatchRows(this.remainingRows);
	        this.remainingRows = '';
	        this.isDone = true;
	        if (this.replies === this.indexWorker)
	            this.terminateWorkers();
	    }

	    //Verifies if the char is equal to rowDelimiter or Line Feed (\n) or Carriage Return (\r)
	    endsWithImp(lastChar) {
	        const code = lastChar.charCodeAt(0);
	        return lastChar === this.options.rowDelimiter || code === 10 || code === 13;
	    }

	    /**
	     * Creates the worker pool and sends every worker its options as first message.
	     * Only one worker is created when the whole file fits in a single chunk.
	     * @param {Object} options Resolved parser options.
	     */
	    startWorkers(options) {
	        const workerOptions = {
	            chunkSize: options.chunkSize, //UserDef or 10MB
	            delimiter: options.delimiter, //UserDef or comma
	            rowDelimiter: options.rowDelimiter, //UserDef or \n
	            headerKeys: options.header.headerKeys, //Which headers the worker which treats data should use
	            headerIndeces: options.header.headerIndeces
	        };

	        let numWorkers = options.numWorkers;
	        if (options.chunkSize >= options.file.size) //If the size of the file is smaller than the chunk only 1 worker is needed
	            numWorkers = 1;

	        this.dataWorkers = new Array(numWorkers);
	        for (let i = 0; i < numWorkers; i++) { // Create n workers
	            workerOptions.index = i;
	            const worker = new Worker('./worker.js');
	            worker.onmessage = (e) => { this.receiveWorkersData(e.data); };
	            worker.postMessage(workerOptions);
	            this.dataWorkers[i] = worker;
	        }
	    }

	    /**
	     * Stores one worker reply and terminates the pool once reading is done and
	     * every posted batch has been answered.
	     * @param {*} data Payload of the worker message.
	     */
	    receiveWorkersData(data) {
	        this.data.push(data);
	        this.replies++;
	        if (this.isDone && this.replies === this.indexWorker)
	            this.terminateWorkers();
	    }

	    //Shutdown data processing workers (safe to call more than once)
	    terminateWorkers() {
	        if (!this.dataWorkers)
	            return;
	        for (const w of this.dataWorkers)
	            w.terminate();
	        this.dataWorkers = null;
	    }

	    /**
	     * Shows the percentage already read from the file in the element with
	     * id=progress and, when present, as width of the first element with class
	     * "determinate". Missing elements are skipped instead of throwing.
	     */
	    updateProgress() {
	        const size = this.options.file.size;
	        const progress = size > 0 ? Math.min(100, Math.round((this.startByte / size) * 100)) : 100;
	        const label = document.getElementById('progress');
	        if (label)
	            label.textContent = "File Reading Progress: " + progress + "%";
	        const bar = document.getElementsByClassName('determinate')[0];
	        if (bar && bar.style)
	            bar.style.width = progress + "%";
	    }
	}

	//Most recently created parser instance; stays null until startParser() runs.
	var parser = null;

	/**
	 * Creates a CSVParser with this page's fixed settings and stores it in `parser`.
	 */
	function startParser() {
	    const headerKeys = ['Block', 'IUCR', 'Longitude', 'Year', 'Latitude'];
	    const chunkSize = 20 * 1024 * 1024; //20MB per read
	    const numWorkers = 4;
	    parser = new CSVParser({ chunkSize, headerKeys, numWorkers });
	}
	//Worker state, filled in by the first message from the main thread.
	let headerIndeces = []; //Column positions to keep from every row
	let options = null; //Options object carried by the first message
	let numMsgsReceived = 0; //Stays 0 until the options message has been handled

	/**
	 * Handles messages from the main thread.
	 * The first message carries the options (delimiter, rowDelimiter, headerIndeces, ...).
	 * Every later message carries `rows` (text to split into rows) and optionally
	 * `ignoreLastRow` (the last row is incomplete and must be skipped).
	 *
	 * Registered through `self` rather than top-level `this`, which is undefined
	 * when the script is loaded as a module.
	 */
	self.onmessage = function (e) {
	    if (numMsgsReceived === 0) { //Save options
	        options = e.data;
	        headerIndeces = options.headerIndeces;
	        numMsgsReceived++;
	        return;
	    }

	    //Honour a custom row delimiter; for the default "\n" also accept "\r\n" line endings.
	    const rowDelimiter = options.rowDelimiter;
	    const rows = (!rowDelimiter || rowDelimiter === '\n')
	        ? e.data.rows.split(/\r?\n/)
	        : e.data.rows.split(rowDelimiter);

	    //Skip the last element when it is a partial row, or when it is the empty
	    //string produced by text that ends on a row delimiter.
	    let numRows = rows.length;
	    if (e.data.ignoreLastRow || rows[numRows - 1] === '')
	        numRows--;
	    pushRows(rows, numRows);
	};

	/**
	 * Extracts the wanted columns from the first `numRows` rows and posts the
	 * result to the main thread as an array of rows (one array of values per row).
	 * Values that are numeric strings are converted to numbers; empty or
	 * whitespace-only fields are kept as strings instead of being coerced to 0.
	 * Blank lines are skipped.
	 * @param {string[]} rows Row strings, not yet split into fields.
	 * @param {number} numRows How many rows, counted from the start, to process.
	 */
	function pushRows(rows, numRows) {
	    const mapa = [];
	    const limit = Math.min(numRows, rows.length);
	    for (let i = 0; i < limit; i++) {
	        const line = rows[i];
	        if (line === '') //Blank line, nothing to extract
	            continue;
	        const values = line.split(options.delimiter);
	        const row = [];
	        for (const idx of headerIndeces) {
	            const raw = values[idx];
	            const num = +raw; //Fast convert str to number
	            //num === num is false only for NaN; the trim check keeps "" from becoming 0.
	            if (raw !== undefined && raw.trim() !== '' && num === num)
	                row.push(num);
	            else
	                row.push(raw);
	        }
	        mapa.push(row);
	    }
	    self.postMessage(mapa);
	}