Skip to content

Instantly share code, notes, and snippets.

@TomHumphries
Created September 3, 2021 14:39
Show Gist options
  • Save TomHumphries/057b2334b98898f975dcdb98bbaea834 to your computer and use it in GitHub Desktop.
Save TomHumphries/057b2334b98898f975dcdb98bbaea834 to your computer and use it in GitHub Desktop.
Split a CSV file that is much larger than RAM into smaller CSV files
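/**
* TO BE USED AS A node SCRIPT:
* node script.split_csv.js csv_filepath [MB_per_file] [stop_after_count]
*
* csv_filepath is required and must end with .csv
* MB_per_file is optional and defaults to 25
* stop_after_count is optional; when omitted, the whole file is split
*
* Smaller files will be created in the directory of the filepath
* Each file contains the header from the main file
*/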
const path = require('path');
const fs = require('fs');
const readline = require('readline');
// Command-line entry point.
// Usage: node script.split_csv.js csv_filepath [MB_per_file] [stop_after_count]
// Arguments are validated up front so that a bad invocation fails immediately
// with a clear message instead of producing unexpected output files.
const [filepath, rawMBPerFile, rawStopAfterCount] = process.argv.slice(2);

// argv values are strings; convert them once here so the splitter gets numbers.
const MB_per_file = rawMBPerFile ? Number(rawMBPerFile) : 25;
if (!Number.isFinite(MB_per_file) || MB_per_file <= 0) {
  // Zero or negative sizes would flush a new file after every single line.
  throw new Error("Invalid MB_per_file parameter provided");
}

// 0 means "no limit", matching the default of splitUpCSV.
const stop_after_count = rawStopAfterCount ? Number(rawStopAfterCount) : 0;
if (!Number.isInteger(stop_after_count) || stop_after_count < 0) {
  throw new Error("Invalid stop_after_count parameter provided");
}

if (!filepath || filepath.length === 0) {
  throw new Error("Required arguments: filepath");
}
if (!fs.existsSync(filepath)) {
  throw new Error("File not found");
}
if (!filepath.endsWith(".csv")) {
  throw new Error("File must end with .csv");
}

splitUpCSV(filepath, MB_per_file, stop_after_count)
  .then(() => {
    console.log("complete");
  })
  .catch((err) => {
    // Report on stderr and signal failure to the calling shell.
    console.error(err);
    process.exitCode = 1;
  });
/**
 * Split a CSV file into a series of smaller CSV files.
 *
 * The input is read line by line. The first line is treated as the header and
 * is repeated at the top of every output file. Lines are buffered in memory
 * and written to the path returned by createFilepath as soon as the buffered
 * size exceeds the threshold, so an output file can exceed the threshold by
 * up to one line. Output files always use "\r\n" line endings.
 *
 * @param {string} filepath filepath of the CSV file to split
 * @param {number|string} [MBPerFile=25] size threshold of each smaller file in
 *   MiB (numeric strings are accepted)
 * @param {number|string} [stopAfterCount=0] stop after this many files have
 *   been written; values that are not greater than 0 mean "no limit"
 * @returns {Promise<void>} resolves once the last file has been written;
 *   rejects if MBPerFile is not a positive number, or if reading the input or
 *   writing an output file fails
 */
function splitUpCSV(filepath, MBPerFile = 25, stopAfterCount = 0) {
  return new Promise((resolve, reject) => {
    const newLineChar = "\r\n";
    const newLineBytes = Buffer.byteLength(newLineChar, 'utf8');

    // Anything thrown inside the executor becomes a rejection of the promise.
    const maxBytesPerFile = Number(MBPerFile) * 1024 * 1024;
    if (!Number.isFinite(maxBytesPerFile) || maxBytesPerFile <= 0) {
      throw new RangeError("MBPerFile must be a positive number");
    }
    const maxFileCount = Number(stopAfterCount);

    let fileCounter = 0;
    let writeFilepath = createFilepath(filepath, fileCounter);
    console.log('Starting new file:', writeFilepath);

    let headerLine; // undefined until the first line has been read
    let headerBytes = 0;
    let byteCounter = 0;
    let lines = [];
    let finished = false;

    const input = fs.createReadStream(filepath);
    const lineReader = readline.createInterface({ input });

    // Write the buffered lines to the current output file.
    // OK to be the Sync version since this is an "offline" script.
    const flush = () => {
      console.log("Writing", lines.length.toLocaleString(), "lines to", writeFilepath);
      fs.writeFileSync(writeFilepath, lines.join(newLineChar) + newLineChar);
    };

    // Settle the promise exactly once and release the reader and the stream.
    // `finished` is set first because close() emits 'close' synchronously.
    const finish = (err) => {
      if (finished) {
        return;
      }
      finished = true;
      lineReader.close();
      input.destroy();
      if (err) {
        reject(err);
      } else {
        resolve();
      }
    };

    // Stream errors are reported on the stream itself, so listen there.
    input.on('error', finish);

    lineReader.on('line', (line) => {
      // Lines that were already buffered may still be delivered after close().
      if (finished) {
        return;
      }
      try {
        // Measure encoded bytes: string length undercounts non-ASCII text.
        const lineBytes = Buffer.byteLength(line, 'utf8') + newLineBytes;
        if (headerLine === undefined) {
          headerLine = line;
          headerBytes = lineBytes;
        }
        lines.push(line);
        byteCounter += lineBytes;
        if (byteCounter <= maxBytesPerFile) {
          return;
        }

        // Threshold exceeded: write this file and start the next one.
        flush();
        fileCounter++;
        if (maxFileCount > 0 && fileCounter >= maxFileCount) {
          console.log('File limit reached. Stopping.');
          finish();
          return;
        }
        // Every file starts with the header, so its bytes count towards the size.
        lines = [headerLine];
        byteCounter = headerBytes;
        writeFilepath = createFilepath(filepath, fileCounter);
      } catch (err) {
        finish(err);
      }
    });

    lineReader.on('close', () => {
      // Closed by finish(): nothing left to write.
      if (finished) {
        return;
      }
      try {
        if (lines.length > 1) {
          // more than just the header
          flush();
        }
        finish();
      } catch (err) {
        finish(err);
      }
    });
  });
}
/**
 * Build the path of a numbered output file next to the source CSV.
 *
 * The result is `<directory>/<name>_<counter>.csv`, where `<name>` is the
 * source filename without its `.csv` extension and `<counter>` is fileCount
 * left-padded with zeros to at least six digits.
 *
 * @param {string} filepath path of the source CSV file
 * @param {number} fileCount zero-based index of the output file
 * @returns {string} path of the output file, in the same directory as filepath
 */
function createFilepath(filepath, fileCount) {
  const directory = path.dirname(filepath);
  const filename = path.basename(filepath, '.csv');
  // padStart only pads and never truncates, so counters with more than six
  // digits stay unique instead of wrapping back onto earlier filenames.
  const paddedCounter = String(fileCount).padStart(6, '0');
  return path.join(directory, `${filename}_${paddedCounter}.csv`);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment