Created
September 3, 2021 14:39
-
-
Save TomHumphries/057b2334b98898f975dcdb98bbaea834 to your computer and use it in GitHub Desktop.
Split a CSV file that is much larger than RAM (>> RAM) into smaller CSV files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * TO BE USED AS A node SCRIPT:
 *   node script.split_csv.js csv_filepath [MB_per_file] [stop_after_count]
 *
 * MB_per_file is optional and defaults to 25; stop_after_count is optional.
 *
 * Smaller files will be created in the directory of the filepath.
 * Each file contains the header from the main file.
 */
const path = require('path'); | |
const fs = require('fs'); | |
const readline = require('readline'); | |
// ---------------------------------------------------------------------------
// Command-line entry point.
//
// Positional arguments (after `node <script>`):
//   1. filepath          path of the CSV file to split (required, must end in .csv)
//   2. MB_per_file       size threshold of each output file in MB (optional, default 25)
//   3. stop_after_count  stop after writing this many files (optional, 0 = no limit)
// ---------------------------------------------------------------------------
const args = process.argv.slice(2);
const filepath = args.shift();
// CLI arguments arrive as strings; convert them once so the rest of the script
// works with real numbers. A missing or empty argument falls back to its default.
const MB_per_file = Number(args.shift() || 25);
const stop_after_count = Number(args.shift() || 0);

if (!Number.isFinite(MB_per_file) || MB_per_file <= 0) {
  // Also rejects 0, negative values and Infinity, not only non-numeric input.
  throw new Error("Invalid MB_per_file parameter provided");
}
if (!Number.isInteger(stop_after_count) || stop_after_count < 0) {
  throw new Error("Invalid stop_after_count parameter provided");
}
if (!filepath) {
  throw new Error("Required arguments: filepath");
}
if (!fs.existsSync(filepath)) {
  throw new Error("File not found");
}
if (!filepath.endsWith(".csv")) {
  throw new Error("File must end with .csv");
}

splitUpCSV(filepath, MB_per_file, stop_after_count)
  .then(() => {
    console.log("complete");
  })
  .catch((err) => {
    // Report on stderr and signal failure to the calling shell.
    console.error(err);
    process.exitCode = 1;
  });
/**
 * Splits a CSV file into several smaller CSV files.
 *
 * The input is read line by line, so the whole file never has to be held in
 * memory. The first line of the input is treated as the header and is
 * repeated at the top of every output file. Output files are named by
 * `createFilepath(filepath, n)` with n = 0, 1, 2, ... and are written with
 * "\r\n" line endings.
 *
 * A file is flushed as soon as its accumulated size (UTF-8 bytes, including
 * line endings and the header) exceeds the threshold, so each file can be
 * larger than `MBPerFile` by up to one line.
 *
 * @param {string} filepath filepath of the CSV file to split
 * @param {number} MBPerFile size threshold of each smaller file, in MB (must be > 0)
 * @param {number} stopAfterCount stop after this many files have been written; 0 means no limit
 * @returns {Promise<void>} resolves when splitting is finished (or the file
 *   limit was reached); rejects on invalid `MBPerFile`, read errors or write errors
 */
function splitUpCSV(filepath, MBPerFile = 25, stopAfterCount = 0) {
  return new Promise((resolve, reject) => {
    const newLineChar = "\r\n";
    const newLineBytes = Buffer.byteLength(newLineChar, 'utf8');
    const maxBytesPerFile = Number(MBPerFile) * 1024 * 1024;
    const maxFileCount = Number(stopAfterCount) || 0;

    if (!Number.isFinite(maxBytesPerFile) || maxBytesPerFile <= 0) {
      reject(new RangeError("MBPerFile must be a positive number"));
      return;
    }

    let fileCounter = 0;
    let writeFilepath = createFilepath(filepath, fileCounter);
    console.log('Starting new file:', writeFilepath);

    let headerLine;      // undefined until the first line has been read
    let headerBytes = 0; // size of the header line incl. line ending
    let byteCounter = 0; // size of the lines buffered for the current file
    let lines = [];
    let finished = false;

    const input = fs.createReadStream(filepath);
    const lineReader = readline.createInterface({
      input,
      // Always treat "\r\n" as a single line break, even across chunk boundaries.
      crlfDelay: Infinity,
    });

    // Settles the promise exactly once and releases the reader and the stream.
    const finish = (err) => {
      if (finished) return;
      finished = true;
      lineReader.close();
      input.destroy();
      if (err) {
        reject(err);
      } else {
        resolve();
      }
    };

    // Writes the buffered lines to the current output file.
    // OK to be the Sync version since this is an "offline" script.
    const flush = () => {
      console.log("Writing", lines.length.toLocaleString(), "lines to", writeFilepath);
      fs.writeFileSync(writeFilepath, lines.join(newLineChar) + newLineChar);
      fileCounter++;
    };

    // Listen on both: depending on the Node.js version, input errors may or
    // may not be re-emitted on the readline interface.
    input.on('error', finish);
    lineReader.on('error', finish);

    lineReader.on('line', (line) => {
      // close() does not stop 'line' events for data that is already
      // buffered, so ignore everything that arrives after we are done.
      if (finished) return;
      try {
        // Count real bytes, not UTF-16 code units, so multi-byte text is sized correctly.
        const lineBytes = Buffer.byteLength(line, 'utf8') + newLineBytes;

        // The first line is the header, even when it is an empty string.
        if (headerLine === undefined) {
          headerLine = line;
          headerBytes = lineBytes;
        }

        lines.push(line);
        byteCounter += lineBytes;
        if (byteCounter <= maxBytesPerFile) return;

        // Threshold for the current file exceeded: write it out.
        flush();

        if (maxFileCount > 0 && fileCounter >= maxFileCount) {
          console.log('File limit reached. Stopping.');
          lines = [];
          finish();
          return;
        }

        // Start the next file with the header already in place (and counted).
        lines = [headerLine];
        byteCounter = headerBytes;
        writeFilepath = createFilepath(filepath, fileCounter);
      } catch (err) {
        finish(err);
      }
    });

    lineReader.on('close', () => {
      if (finished) return;
      try {
        if (lines.length > 1) {
          // more than just the header
          flush();
        }
        finish();
      } catch (err) {
        finish(err);
      }
    });
  });
}
/**
 * Builds the path of the n-th output file for a given source CSV file.
 *
 * The output file is placed in the same directory as the source file and is
 * named `<source name without .csv>_<counter>.csv`, where the counter is
 * zero-padded to at least six digits (e.g. `data_000003.csv`). Counters with
 * more than six digits are kept in full so that file names never collide.
 *
 * @param {string} filepath path of the source CSV file
 * @param {number} fileCount zero-based index of the output file
 * @returns {string} path of the output file
 */
function createFilepath(filepath, fileCount) {
  const directory = path.dirname(filepath);
  const filename = path.basename(filepath, '.csv');
  // padStart only ever adds characters; the previous substr(-6) cut counters
  // >= 1,000,000 down to their last six digits.
  const paddedCounter = String(fileCount).padStart(6, '0');
  return path.join(directory, `${filename}_${paddedCounter}.csv`);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment