Created
May 15, 2020 16:39
-
-
Save yarikoptic/7284b634d8ab12277a3d316a117cc0db to your computer and use it in GitHub Desktop.
A dirty helper to time traversal of the file tree and collection of os.stat results for files. See https://github.com/con/pyfscacher/issues/1 for more info
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os | |
from os import stat | |
from os.path import join, islink | |
from time import time | |
from pathlib import Path | |
from functools import wraps | |
from joblib import Parallel, delayed | |
def safestat(p):
    """Return the ``os.stat`` result for *p* without following symlinks.

    If the path does not exist (``FileNotFoundError``), ``None`` is
    returned instead of raising.  Any other error propagates.
    """
    try:
        info = stat(p, follow_symlinks=False)
    except FileNotFoundError:
        info = None
    return info
def timeit(f):
    """Decorate *f* so that calling it also reports how long it ran.

    The decorated callable forwards all positional and keyword arguments
    to *f* and returns a ``(result, seconds)`` tuple, where ``seconds`` is
    the wall-clock difference of ``time()`` taken before and after the call.
    """
    @wraps(f)
    def wrapper(*args, **kwargs):
        started = time()
        result = f(*args, **kwargs)
        return result, time() - started

    return wrapper
def statfiles(dp, files):
    """Stat every name in *files*, interpreted relative to directory *dp*.

    Returns a dict mapping each file name to whatever ``safestat`` returns
    for ``join(dp, name)``.
    """
    stats = {}
    for name in files:
        stats[name] = safestat(join(dp, name))
    return stats
def statdir(d='.'):
    """Walk the tree rooted at *d* and stat the files of every directory.

    Returns a list with one ``(dirpath, stats)`` tuple per directory that
    ``os.walk(d)`` yields, where ``stats`` is the dict produced by
    ``statfiles(dirpath, files)``.
    """
    # The original file defined this function twice with identical bodies;
    # a single definition is kept.
    return [(dp, statfiles(dp, files)) for dp, _, files in os.walk(d)]
# with file descriptor for a dir
def statfilesfd(dp, files):
    """Stat every name in *files* relative to an open descriptor of *dp*.

    The directory *dp* is opened once and each name is passed to
    ``os.stat`` with ``dir_fd`` set to that descriptor and
    ``follow_symlinks=False``.

    Returns a dict mapping each file name to its stat result.  Unlike
    ``safestat``, errors from ``stat`` (e.g. ``FileNotFoundError``) are not
    caught and propagate to the caller.
    """
    fd = os.open(dp, os.O_RDONLY)
    try:
        return {
            name: stat(name, follow_symlinks=False, dir_fd=fd)
            for name in files
        }
    finally:
        # Close the directory descriptor even when a stat call raises,
        # otherwise it would leak.
        os.close(fd)
def statdirfd(d='.'):
    """Walk the tree rooted at *d*, stat-ing files via ``statfilesfd``.

    Returns a list with one ``(dirpath, stats)`` tuple per directory that
    ``os.walk(d)`` yields.
    """
    results = []
    for dirpath, _subdirs, filenames in os.walk(d):
        results.append((dirpath, statfilesfd(dirpath, filenames)))
    return results
def statdirp(d='.'):
    """Stat the files directly in *d*, then run ``statdir`` per subdirectory.

    Only the first level of ``os.walk(d)`` is consumed here; every
    non-symlinked immediate subdirectory is then handed to ``statdir``.

    Returns a list whose first element is the one-item dict
    ``{d: statfiles(d, files)}`` for the top directory, followed by the
    ``(dirpath, stats)`` tuples that ``statdir`` returns for each
    subdirectory, in the order ``os.walk`` listed them.
    """
    _, subdirs, files = next(os.walk(d))
    # os.walk reports bare names; join them with d so that the symlink
    # check and the recursive walk look at the right paths even when d is
    # not the current working directory.
    subpaths = [join(d, name) for name in subdirs]
    # exclude symlinked dirs
    subpaths = [path for path in subpaths if not islink(path)]
    results = [{d: statfiles(d, files)}]
    for path in subpaths:
        # extend() avoids the repeated list copying of sum(lists, start)
        results.extend(statdir(path))
    return results
def statdirp_joblib(d='.'):
    """Like ``statdirp``, but run ``statdir`` on subdirectories via joblib.

    Only the first level of ``os.walk(d)`` is consumed here; every
    non-symlinked immediate subdirectory is handed to ``statdir`` through
    ``Parallel(n_jobs=2)``.

    Returns a list whose first element is the one-item dict
    ``{d: statfiles(d, files)}`` for the top directory, followed by the
    ``(dirpath, stats)`` tuples returned for each subdirectory.
    """
    _, subdirs, files = next(os.walk(d))
    # os.walk reports bare names; join them with d so that the symlink
    # check and the per-directory walks look at the right paths even when d
    # is not the current working directory.
    subpaths = [join(d, name) for name in subdirs]
    # exclude symlinked dirs
    subpaths = [path for path in subpaths if not islink(path)]
    # now we can parallelize across dirs
    # NOTE(review): the original carried a "Takes notable time!" remark here
    # and a commented-out prefer="threads" option -- presumably about joblib
    # overhead; confirm before tuning.
    per_dir = Parallel(n_jobs=2)(delayed(statdir)(path) for path in subpaths)
    results = [{d: statfiles(d, files)}]
    for chunk in per_dir:
        results.extend(chunk)
    return results
def statunsafeplain(d='.'):
    """Stat every file under *d* with plain ``os.stat`` (no error handling).

    Returns a list with one inner list per directory yielded by
    ``os.walk(d)``; each inner list holds the stat results (symlinks not
    followed) of that directory's files.  Any error raised by ``stat``
    propagates.
    """
    # Walk the requested directory; the original always walked '.' and
    # silently ignored the d argument.
    return [
        [stat(join(dp, name), follow_symlinks=False) for name in files]
        for dp, _, files in os.walk(d)
    ]
def statsafeplain(d='.'):
    """Stat every file under *d* using ``safestat``.

    Returns a list with one inner list per directory yielded by
    ``os.walk(d)``; each inner list holds what ``safestat`` returned for
    that directory's files.
    """
    # Walk the requested directory; the original always walked '.' and
    # silently ignored the d argument.
    return [
        [safestat(join(dp, name)) for name in files]
        for dp, _, files in os.walk(d)
    ]
import sys | |
# Benchmark driver.  NOTE(review): indentation was lost in the extracted
# source; the nesting below (in particular which statements belong to the
# disabled ``else`` branch) is reconstructed and should be confirmed.
if True:
    # cold/warm: time the same function twice; the first timing is reported
    # as "cold" and the second as "warm".
    if sys.argv[1:]:
        func = sys.argv[1]
        print(f"Using {func}")
        # resolve the function by name among the module-level definitions
        func = locals()[sys.argv[1]]
    else:
        func = statdirp
    f = timeit(func)
    dfiles1, dt1 = f()
    dfiles, dt2 = f()
    # both runs must have produced identical results
    assert dfiles1 == dfiles
else:
    # verifying that statdir and statdirp return the same
    dfiles1, dt1 = timeit(statdir)()
    dfiles, dt2 = timeit(statdirp)()
    from pprint import pprint
    if len(dfiles) < 10:
        pprint(dfiles1)
        pprint(dfiles)
    assert len(dfiles1) == len(dfiles)
    # but they aren't exactly the same -- next level walk results have no ./ prefix ATM

# Sum of len() over the result entries, reported next to the entry count.
total = sum(len(e) for e in dfiles)
print("Total %d:%d took %.5f cold %.5f warm" % (len(dfiles), total, dt1, dt2))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment