Created
November 30, 2022 11:50
-
-
Save lerouxb/225e06fe7d9424ddc7a6e185069ef51d to your computer and use it in GitHub Desktop.
Parse & Analyse .csv files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'use strict'; | |
const fs = require('fs'); | |
const Papa = require('papaparse'); | |
const { ObjectId } = require('bson'); | |
function parseObjectId(value) { | |
// trying to parse just any string seems to be quite slow, so just support hex strings for now | |
if (value.length !== 24) { | |
return null; | |
} | |
try { | |
return new ObjectId(value); | |
} | |
catch(err) { | |
return null; | |
} | |
} | |
// papaparse already detects dates | |
/* | |
const ISO_DATE_REGEX = /\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d\.\d+([+-][0-2]\d:[0-5]\d|Z)/; | |
function parseDate(value) { | |
// JavaScript will still happily do: | |
// > new Date('something 12') | |
// 2001-12-01T00:00:00.000Z | |
if (!value.match(ISO_DATE_REGEX)) { | |
return null; | |
} | |
const date = new Date(value); | |
if (isNaN(date.getTime())) { | |
return null; | |
} | |
return date; | |
} | |
*/ | |
function parseCSV(filename) { | |
return new Promise((resolve, reject) => { | |
const file = fs.createReadStream(filename, { | |
encoding: 'utf8', | |
}); | |
let total = 0; | |
const schema = {}; | |
Papa.parse(file, { | |
header: true, | |
dynamicTyping: true, | |
step: function(row) { | |
//console.log(row); | |
if (total % 10000 === 0) { | |
console.log(total); | |
} | |
for (const [key, value] of Object.entries(row.data)) { | |
let fieldType = typeof value; | |
if (value === null) { | |
// what do we want to do in this case? | |
fieldType = 'null'; | |
} | |
else if (fieldType === 'object') { | |
if (value.getTime) { | |
fieldType = 'Date'; | |
} | |
else { | |
// TODO: probably an error? | |
console.log(value, value.toString()); | |
} | |
} | |
else if (fieldType === 'number') { | |
// TODO: should we check which kind of number? We could easily end up with a mixed set. | |
} | |
else if (fieldType === 'string') { | |
let parsed; | |
if ((parsed = parseObjectId(value)) !== null) { | |
fieldType = 'ObjectID'; | |
} | |
//else if ((parsed = parseDate(value)) !== null) { | |
// console.log(parsed, value); | |
// fieldType = 'Date'; | |
//} | |
} | |
if (schema[key] === undefined) { | |
schema[key] = {}; | |
} | |
if (schema[key][fieldType] === undefined) { | |
schema[key][fieldType] = 0; | |
} | |
++schema[key][fieldType]; | |
} | |
++total; | |
}, | |
complete: function() { | |
resolve({ total, schema }); | |
}, | |
error: function (err) { | |
reject(err); | |
} | |
}); | |
}); | |
} | |
function guessFieldTypes(schema) { | |
const fields = {}; | |
for (const [key, summary] of Object.entries(schema)) { | |
if (summary.string !== undefined) { | |
fields[key] = 'string'; | |
continue; | |
} | |
// make sure we don't pick null | |
const types = Object.keys(summary).filter((type) => type !== 'null'); | |
// take the only type if there is just one | |
if (types.length === 1) { | |
fields[key] = types[0]; | |
continue; | |
} | |
// go with string if there are multiple options or everything was null | |
fields[key] = 'string'; // TODO: or mixed? | |
} | |
return fields; | |
} | |
const filename = process.argv[2]; | |
console.log({filename}); | |
const start = Date.now(); | |
parseCSV(filename) | |
.then(({ total, schema }) => { | |
const elapsed = Date.now() - start; | |
console.log(); | |
console.log('field type report'); | |
console.log(schema); | |
console.log(); | |
console.log('detected field types'); | |
console.log(guessFieldTypes(schema)); | |
console.log(); | |
console.log(`${elapsed}ms`, `${total} rows`); | |
}) | |
.catch((err) => { | |
console.error(err); | |
process.exit(1); | |
}); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment