-
-
Save timoxley/0cb5053dec107499c8aabad8dfd651ea to your computer and use it in GitHub Desktop.
import { join } from 'path'
// fs-promise is deprecated; Node's built-in promise API is a drop-in here.
import { readdir, stat } from 'fs/promises'

/**
 * Recursively collect every entry (files AND directories) under `dir`.
 * All entries of a directory are statted — and subdirectories recursed
 * into — concurrently via Promise.all.
 *
 * @param {string} dir - directory to walk
 * @param {string[]} [allFiles] - accumulator; mutated in place and returned
 * @returns {Promise<string[]>} full paths of every entry found
 */
async function rreaddir (dir, allFiles = []) {
  const files = (await readdir(dir)).map(f => join(dir, f))
  allFiles.push(...files)
  // For directories the `&&` yields the recursive promise, which the async
  // wrapper awaits implicitly — so Promise.all waits on the whole subtree.
  await Promise.all(files.map(async f => (
    (await stat(f)).isDirectory() && rreaddir(f, allFiles)
  )))
  return allFiles
}
// straight synchronous version for perf comparison
import { join } from 'path'
// Namespace import: default-importing the CJS 'fs' module requires
// esModuleInterop; `* as` works everywhere.
import * as fs from 'fs'

/**
 * Synchronous counterpart of rreaddir: collects every entry (files AND
 * directories) under `dir`, depth-first.
 *
 * @param {string} dir - directory to walk
 * @param {string[]} [allFiles] - accumulator; mutated in place and returned
 * @returns {string[]} full paths of every entry found
 */
function rreaddirSync (dir, allFiles = []) {
  const files = fs.readdirSync(dir).map(f => join(dir, f))
  allFiles.push(...files)
  files.forEach(f => {
    fs.statSync(f).isDirectory() && rreaddirSync(f, allFiles)
  })
  return allFiles
}
import assert from 'assert'

// Quick benchmark: time the concurrent walk against the synchronous one over
// this directory, then verify both produce the same entry list.
console.time('parallel')
rreaddir(__dirname).then(pFiles => {
  console.timeEnd('parallel')
  console.time('sync')
  const sFiles = rreaddirSync(__dirname)
  console.timeEnd('sync')
  // deepStrictEqual: deepEqual is deprecated and compares loosely (==).
  // sort() mutates in place, but both arrays exist only for this comparison.
  assert.deepStrictEqual(pFiles.sort(alpha), sFiles.sort(alpha))
}).catch(err => {
  // Surface walk/assertion failures instead of an unhandled rejection.
  console.error(err)
  process.exitCode = 1
})

// Locale-aware comparator so both result lists share one canonical order.
function alpha (a, b) {
  return a.localeCompare(b)
}
parallel: 313.899ms | |
sync: 80.259ms |
import { basename, join } from 'path';
import { readdir, stat, Stats } from 'fs-extra';
const recursiveReadDir = async (
dir: string,
checkShouldIgnore?: (uri: string, stats: Stats) => boolean,
concurrency = 100
): Promise<string[]> => {
const collected: string[] = [];
const queue = [dir];
const visit = async (file: string) => {
const stats = await stat(file);
if (checkShouldIgnore?.(file, stats)) return;
if (stats.isDirectory()) {
queue.push(...(await readdir(file)).map(f => join(file, f)));
return;
}
collected.push(file);
};
while (queue.length) {
await Promise.all(
queue.splice(0, concurrency).map(visit)
);
}
return collected;
};
☝️ Uses TypeScript, introduces optional `checkShouldIgnore` and `concurrency` parameters, and leaves directories out of the resulting file list.
Because I ran into memory problems recursively reading a `.git` directory with @glebec's version, I came up with the queue-based version above. I will run some benchmarks on it soon.
Blast from the past! For context, this gist was part of a question posted on a Slack channel about why some async code was slower than the corresponding sync code. I didn’t intend my reply to be used in anyone’s prod. 😅 That being said, good idea to introduce a concurrency limit!
❤️
Seemingly not much difference in performance, which is good:
queue: 935.704ms
sync: 1754.959ms
parallel: 945.062ms
Interesting to see that the parallel version is now much faster than the original sync version. Tested using Node v12.13.0 with slightly adjusted versions of parallel and sync (because my version also leaves out directories in the resulting file list).
☝️ Uses native promises and promisification — no slow third-party libraries.
But as you can see, the async version is still slower.
This is because concurrency is not parallelism. The async version makes many concurrent file read requests, but the drive is a bottleneck – the OS cannot do a whole lot of actual parallel FS work.
So what's the benefit of the async version? Well, for those ~1.4 seconds, that code is non-blocking. In other words, other parts of your JS app can do work in-between all the recursive file read calls.
With the sync version, your app freezes up completely for ~1.2 seconds. The overall time for the file reads might be slightly less, but that block can have very significant effects on other parts of your application or user experience.