Created
November 14, 2017 10:23
-
-
Save mikehenrty/802dadf973e42684df12e054e9769417 to your computer and use it in GitHub Desktop.
This file parses a Common Voice data directory and outputs stats about the data.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require('fs'); | |
const path = require('path'); | |
const crypto = require('crypto'); | |
const mp3Duration = require('mp3-duration'); | |
// Folder that displayVoteMetrics moves selected clips (mp3 + txt) into.
const OUTPUT_FOLDER = path.resolve('output', 'batch');
// Root of the data set; displayMetrics expects one sub-folder per user here.
const DOWNLOAD_FOLDER = path.resolve('sample');
// HMAC key used by hash() when deriving sentence keys.
const DEFAULT_SALT = '8hd3e8sddFSdfj';
// Maximum number of files handled in parallel by the processAll* helpers.
const CONCURRENCY = 50;
// Captured at load time so the total run time can be reported at the end.
const startTime = Date.now();
/**
 * Compute a hex-encoded HMAC-SHA256 digest of a string.
 *
 * @param {string} str - Text to digest.
 * @param {string} [salt=DEFAULT_SALT] - HMAC key. Defaults to the file-wide
 *   salt, so existing single-argument callers get the same digest as before.
 * @returns {string} The digest as a hex string.
 */
function hash(str, salt = DEFAULT_SALT) {
  return crypto
    .createHmac('sha256', salt)
    .update(str)
    .digest('hex');
}
/**
 * Adapt a Node-style callback API into a function that returns a promise.
 *
 * The wrapper appends an `(err, result)` callback after the caller's
 * arguments and invokes `func` with `ctx` as its `this` value. A truthy `err`
 * rejects the promise; otherwise it resolves with `result`.
 *
 * @param {*} ctx - Value used as `this` when calling `func`.
 * @param {Function} func - Function whose last parameter is a callback.
 * @returns {Function} Wrapper returning a promise for the callback's result.
 */
function promisify(ctx, func) {
  return function(...args) {
    return new Promise((resolve, reject) => {
      const onDone = (err, result) => {
        if (err) {
          reject(err);
        } else {
          resolve(result);
        }
      };
      func.call(ctx, ...args, onDone);
    });
  };
}
/**
 * Promise-returning wrappers around the callback APIs used by this script:
 * three `fs` functions (bound to `fs`) and the mp3-duration module function.
 */
const [readdirPromise, readFilePromise, renamePromise] =
  [fs.readdir, fs.readFile, fs.rename].map((fn) => promisify(fs, fn));
const durationPromise = promisify(this, mp3Duration);
// Running totals for the whole scan; printed by displayMetrics.
const tracker = {
  users: 0,
  mp3s: 0,
  txts: 0,
  votes: 0,
  demos: 0,
  sentences: 0,
  seconds: 0,
  weirdos: 0,
};

// Per-user counters, keyed by user folder name:
//   { sentences: <count>, upvotes: <count>, downvotes: <count> }
const users = {};

// Key-value store for sentences, keyed by the hash of the sentence text:
//   { text: <sentence body>, count: <times seen>, path: <first file seen> }
const sentences = {};

// Per-clip records, keyed by "<user folder>_<file base name>":
//   { text, src: <mp3 path>, txt: <txt path>, user, upvotes, downvotes }
const clips = {};
/**
 * Read one sentence file and record it in the `sentences` store.
 *
 * The store is keyed by the hash of the file's text. The first file seen for
 * a given text supplies the stored `text` and `path`; every occurrence
 * (including the first) increments `count`.
 *
 * @param {string} path - Path of the text file to read as UTF-8.
 * @returns {Promise<void>}
 */
async function processText(path) {
  const text = await readFilePromise(path, 'utf8');
  const digest = hash(text);

  let entry = sentences[digest];
  if (!entry) {
    entry = { text, count: 0, path };
    sentences[digest] = entry;
  }
  entry.count += 1;
}
/**
 * Run processText over every file, at most CONCURRENCY files at a time.
 *
 * Files are handled in consecutive batches; a batch must finish completely
 * before the next one starts. Falsy entries in `files` are skipped.
 *
 * @param {string[]} files - Paths of the sentence text files.
 * @returns {Promise<void>} Rejects as soon as any processText call rejects.
 */
async function processAllText(files) {
  // The counter is block-scoped: the previous `let = i = 0` wrote to implicit
  // globals, which is a syntax error in strict mode and shared state otherwise.
  for (let start = 0; start < files.length; start += CONCURRENCY) {
    const batch = files
      .slice(start, start + CONCURRENCY)
      .filter(Boolean)
      .map((filePath) => processText(filePath));
    await Promise.all(batch);
  }
}
/**
 * Measure one mp3 file and add its duration to `tracker.seconds`.
 *
 * @param {string} path - Path of the mp3 file.
 * @returns {Promise<void>}
 */
async function processClip(path) {
  const clipSeconds = await durationPromise(path);
  // Read the running total only after the await so that clips measured
  // concurrently do not overwrite each other's contribution.
  tracker.seconds = tracker.seconds + clipSeconds;
}
/**
 * Run processClip over every mp3 file, at most CONCURRENCY files at a time.
 *
 * Files are handled in consecutive batches; a batch must finish completely
 * before the next one starts. Falsy entries in `files` are skipped.
 *
 * @param {string[]} files - Paths of the mp3 files.
 * @returns {Promise<void>} Rejects as soon as any processClip call rejects.
 */
async function processAllMP3s(files) {
  // The counter is block-scoped: the previous `let = i = 0` wrote to implicit
  // globals, which is a syntax error in strict mode and shared state otherwise.
  for (let start = 0; start < files.length; start += CONCURRENCY) {
    const batch = files
      .slice(start, start + CONCURRENCY)
      .filter(Boolean)
      .map((filePath) => processClip(filePath));
    await Promise.all(batch);
  }
}
/**
 * Read one vote file and apply it to both the user's and the clip's counters.
 *
 * File content `true` counts as an upvote and `false` as a downvote.
 * Surrounding whitespace (for example a trailing newline) is ignored so such
 * votes are counted instead of being reported as unrecognized. Anything else
 * is logged with its raw content and leaves the counters untouched.
 *
 * @param {string} path - Path of the vote file to read as UTF-8.
 * @param {{upvotes: number, downvotes: number}} user - Counters of the user.
 * @param {{upvotes: number, downvotes: number}} clip - Counters of the clip.
 * @returns {Promise<void>}
 */
async function processVote(path, user, clip) {
  const vote = await readFilePromise(path, 'utf8');
  switch (vote.trim()) {
    case 'true':
      ++user.upvotes;
      ++clip.upvotes;
      break;
    case 'false':
      ++user.downvotes;
      ++clip.downvotes;
      break;
    default:
      console.error('unrecognized vote data', vote);
      break;
  }
}
/**
 * Run processVote over every collected vote, at most CONCURRENCY at a time.
 *
 * Votes are handled in consecutive batches; a batch must finish completely
 * before the next one starts. Falsy entries are skipped. An entry without a
 * `clip` is logged and skipped; the remaining votes are still processed.
 *
 * @param {Array<{path: string, user: object, clip: object}>} files - Vote
 *   descriptors: the vote file path plus the user and clip records to update.
 * @returns {Promise<void>} Rejects as soon as any processVote call rejects.
 */
async function processAllVotes(files) {
  // The counter is block-scoped: the previous `let = i = 0` wrote to implicit
  // globals, which is a syntax error in strict mode and shared state otherwise.
  for (let start = 0; start < files.length; start += CONCURRENCY) {
    const batch = [];
    for (const data of files.slice(start, start + CONCURRENCY)) {
      if (!data) {
        continue;
      }
      if (!data.clip) {
        // Skip only this entry. Returning here would abandon the promises
        // already started for this batch and drop every later vote.
        console.log('wtf', data);
        continue;
      }
      batch.push(processVote(data.path, data.user, data.clip));
    }
    await Promise.all(batch);
  }
}
/**
 * Placeholder with no implementation: it ignores both arguments and resolves
 * to `undefined`.
 *
 * @param {*} file - Unused.
 * @param {*} folder - Unused.
 * @returns {Promise<void>}
 */
async function move(file, folder) {}
/**
 * Bucket every clip by its vote counts, move contested-but-favoured clips into
 * OUTPUT_FOLDER, and log the bucket totals.
 *
 * Buckets (first match wins):
 *   - none:       no votes at all
 *   - unverified: fewer than 3 votes
 *   - bothUp:     more than 1 upvote AND more than 1 downvote, with upvotes
 *                 making up more than a third of all votes; the clip's mp3 and
 *                 txt files (when set) are renamed to OUTPUT_FOLDER/<key>.<ext>
 *   - bothDown:   more than 1 upvote AND more than 1 downvote otherwise
 *   - verified:   more than 1 upvote
 *   - bad:        more than 1 downvote
 * A clip that fits none of these is logged as an error and not counted.
 *
 * @param {Object<string, {upvotes: number, downvotes: number, src: string,
 *   txt: string}>} clips - Clip records keyed by clip key.
 * @returns {Promise<void>} Rejects if a rename fails.
 */
async function displayVoteMetrics(clips) {
  const clipTracker = {
    none: 0,
    unverified: 0,
    verified: 0,
    bad: 0,
    bothUp: 0,
    bothDown: 0,
  };

  for (const [key, clip] of Object.entries(clips)) {
    const votes = clip.upvotes + clip.downvotes;
    if (votes === 0) {
      ++clipTracker.none;
    } else if (votes < 3) {
      ++clipTracker.unverified;
    } else if (clip.upvotes > 1 && clip.downvotes > 1) {
      if (clip.upvotes / votes > 1 / 3) {
        ++clipTracker.bothUp;
        // Empty src/txt values are falsy and pass through Promise.all as-is,
        // so only the files that actually exist for this clip are moved.
        await Promise.all([
          clip.src && renamePromise(clip.src, path.join(OUTPUT_FOLDER, `${key}.mp3`)),
          clip.txt && renamePromise(clip.txt, path.join(OUTPUT_FOLDER, `${key}.txt`)),
        ]);
      } else {
        ++clipTracker.bothDown;
      }
    } else if (clip.upvotes > 1) {
      ++clipTracker.verified;
    } else if (clip.downvotes > 1) {
      ++clipTracker.bad;
    } else {
      // Reached only when the counts are not ordinary numbers (e.g. NaN).
      console.error('wtf?', clip);
    }
  }

  console.log('tracker', clipTracker);
}
/**
 * Scan DOWNLOAD_FOLDER, tally what is found, process the votes, and report.
 *
 * Each top-level entry without a dot in its name is treated as a user folder.
 * Inside it, files are classified by the text after the FIRST dot:
 *   - txt:  recorded as the clip's sentence file; counts toward the user's
 *           sentences
 *   - mp3:  recorded as the clip's audio file
 *   - vote: queued for processAllVotes; the clip key is the part of the name
 *           before '-by-'
 *   - json: counted as a demographic file
 * Anything else (including entries with no dot) is counted in
 * `tracker.weirdos`.
 *
 * Sentence and mp3 processing (processAllText / processAllMP3s) is not run
 * here, so `tracker.sentences` and `tracker.seconds` are not updated by the
 * scan itself.
 *
 * Errors are caught and logged as 'top level error'; the returned promise
 * does not reject for them.
 *
 * @returns {Promise<void>}
 */
async function displayMetrics() {
  const textFiles = [];
  const mp3Files = [];
  const voteFiles = [];

  try {
    const folders = await readdirPromise(DOWNLOAD_FOLDER);
    for (const folder of folders) {
      // Only use folders: any name containing a dot is assumed to be a file.
      if (folder.includes('.')) {
        ++tracker.weirdos;
        continue;
      }

      if (!users[folder]) {
        users[folder] = {
          sentences: 0,
          upvotes: 0,
          downvotes: 0,
        };
      }
      const user = users[folder];
      ++tracker.users;

      const folderPath = path.join(DOWNLOAD_FOLDER, folder);
      const subfiles = await readdirPromise(folderPath);
      for (const file of subfiles) {
        const filePath = path.join(folderPath, file);

        const dotIndex = file.indexOf('.');
        if (dotIndex === -1) {
          console.log('found unexpected subfolder', folder, file);
          ++tracker.weirdos;
          continue;
        }

        // Everything after the first dot, so "a.b.mp3" yields "b.mp3".
        const ext = file.slice(dotIndex + 1);
        let clipKey = `${folder}_${file.slice(0, dotIndex)}`;

        if (ext === 'vote' || ext === 'txt' || ext === 'mp3') {
          if (ext === 'vote') {
            // Drop the "-by-..." suffix so the vote maps onto its clip's key.
            clipKey = clipKey.split('-by-')[0];
          }
          if (!clips[clipKey]) {
            clips[clipKey] = {
              text: '',
              src: '',
              txt: '',
              user: folder,
              upvotes: 0,
              downvotes: 0,
            };
          }
        }

        // Undefined for json/unknown files; those branches never touch it.
        const clip = clips[clipKey];
        switch (ext) {
          case 'txt':
            textFiles.push(filePath);
            clip.txt = filePath;
            ++tracker.txts;
            ++user.sentences;
            break;

          case 'mp3':
            mp3Files.push(filePath);
            clip.src = filePath;
            ++tracker.mp3s;
            break;

          case 'vote':
            voteFiles.push({
              path: filePath,
              user: user,
              clip: clip,
            });
            ++tracker.votes;
            break;

          case 'json':
            ++tracker.demos;
            break;

          default:
            console.error('unrecognized file', file, ext);
            ++tracker.weirdos;
            break;
        }
      }
    }

    console.log('found files', textFiles.length);
    console.log('found mp3s', mp3Files.length);
    console.log('found votes', voteFiles.length);
    await processAllVotes(voteFiles);

    tracker.sentences += Object.keys(sentences).length;
    console.log(tracker);

    const minutes = (Date.now() - startTime) / 60000;
    console.log(`\ncomplete in ${minutes.toFixed(2)} minutes\n`);

    // Awaited so that a failed rename is caught below instead of surfacing
    // as an unhandled rejection after this function has already returned.
    await displayVoteMetrics(clips);
  } catch (err) {
    console.error('top level error', err);
  }
}
displayMetrics().catch(err => { | |
console.error('unhandled exception', err); | |
}); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment