Skip to content

Instantly share code, notes, and snippets.

@irq0
Created July 27, 2022 11:54
Show Gist options
  • Save irq0/6d2ba59c07de17e236e99f17208720e7 to your computer and use it in GitHub Desktop.
Save irq0/6d2ba59c07de17e236e99f17208720e7 to your computer and use it in GitHub Desktop.
Benchmark random id -> directory tree mappings
#!/usr/bin/env python3
# Benchmark random id -> directory tree mappings
import uuid
import pathlib
import re
import random
import click
import multiprocessing
import time
from contextlib import contextmanager
class Flat:
    """Directory layout that keeps every id in one single directory level.

    Each id is represented by a directory created directly inside
    ``base_dir``; no intermediate levels are derived from the id.
    """

    def __init__(self, base_dir):
        """Remember ``base_dir`` and create it, including missing parents.

        Raises:
            FileExistsError: if ``base_dir`` already exists.
        """
        self.base_dir = base_dir
        base_dir.mkdir(parents=True)

    def _entry(self, id):
        """Return the path of the directory that represents ``id``."""
        return self.base_dir / id

    def __str__(self):
        return "Flat(" + str(self.base_dir) + ")"

    def __call__(self, id):
        """Create the directory for ``id``; returns ``None``.

        Raises:
            FileExistsError: if the directory for ``id`` already exists.
            FileNotFoundError: if ``base_dir`` no longer exists.
        """
        target = self._entry(id)
        return target.mkdir(parents=False, exist_ok=False)

    def stat(self, id):
        """Return the ``os.stat_result`` of the directory for ``id``."""
        return self._entry(id).stat()
class RegexSplit:
    """Directory layout that derives nested levels by splitting the id.

    The id is split with ``regex.split``; every non-empty piece becomes one
    directory level below ``base_dir``.
    """

    def __init__(self, base_dir, regex):
        """Store the settings and create ``base_dir`` with missing parents.

        ``regex`` must be a compiled pattern (its ``split`` method and
        ``pattern`` attribute are used).

        Raises:
            FileExistsError: if ``base_dir`` already exists.
        """
        self.base_dir = base_dir
        self.regex = regex
        base_dir.mkdir(parents=True)

    def __str__(self):
        return "RegexSplit(" + str(self.base_dir) + ", " + str(self.regex.pattern) + ")"

    def split_levels(self, id):
        """Return the nested path below ``base_dir`` that represents ``id``."""
        pieces = self.regex.split(id)
        # Splitting on capturing groups leaves empty strings between and
        # around the matches; only the non-empty pieces are path levels.
        levels = [piece for piece in pieces if piece != ""]
        relative = pathlib.Path(*levels)
        return self.base_dir / relative

    def __call__(self, id):
        """Create the nested directory for ``id``; returns ``None``.

        Missing intermediate levels are created as needed.

        Raises:
            FileExistsError: if the final directory already exists.
        """
        target = self.split_levels(id)
        return target.mkdir(parents=True)

    def stat(self, id):
        """Return the ``os.stat_result`` of the nested directory for ``id``."""
        target = self.split_levels(id)
        return target.stat()
@contextmanager
def log_exec_time(name):
    """Context manager that prints how long the enclosed block took.

    The elapsed wall-clock time is printed in seconds (three decimals) and
    in nanoseconds, labelled with ``name``. Nothing is printed when the
    enclosed block raises; the exception simply propagates.
    """
    started_ns = time.perf_counter_ns()
    yield
    elapsed_ns = time.perf_counter_ns() - started_ns
    elapsed_s = elapsed_ns / 1000 / 1000 / 1000
    print(f"[EXEC TIME] {name}:\t{elapsed_s:.3f}s\t\t{elapsed_ns}ns")
def bench(pool, ids, lookups, style):
    """Time directory creation and stat lookups for one layout ``style``.

    First ``style`` is mapped over ``ids`` on ``pool`` (the "mkdir" phase),
    then ``style.stat`` is mapped over ``lookups`` (the "lookups" phase).
    Each phase's duration is printed by ``log_exec_time``; the mapped
    results are discarded.
    """
    phases = (
        ("mkdir", style, ids),
        ("lookups", style.stat, lookups),
    )
    for label, task, items in phases:
        with log_exec_time(label):
            pool.map(task, items)
@click.command()
@click.argument(
    "base_dir",
    nargs=1,
    type=click.Path(
        exists=True,
        file_okay=False,
        writable=True,
        resolve_path=True,
        path_type=pathlib.Path,
    ),
)
@click.option("--nworker", type=int, default=10)
@click.option(
    "--nids",
    type=int,
    default=100000,
    help="Number of random ids to generate dir hierarchies from",
)
@click.option(
    "--nlookups",
    type=int,
    default=1000,
    help="Number of random ids to lookup from generated dir hierarchies",
)
def run(base_dir, nworker, nids, nlookups):
    """Benchmark several id -> directory tree layouts below BASE_DIR.

    BASE_DIR must be an existing, writable directory. Every layout creates
    its own subdirectory inside it and fails if that subdirectory already
    exists, so use a fresh BASE_DIR for each run.
    """
    # Manage the pool with a context manager so the worker processes are
    # always torn down, including when a layout fails part-way through.
    # All pool.map calls block until complete, so no work is cut short.
    with multiprocessing.Pool(processes=nworker) as pool:
        ids = [uuid.uuid4().hex for _ in range(nids)]
        # Lookups are sampled with replacement from the generated ids.
        lookup_ids = random.choices(ids, k=nlookups)
        print(f"PARAM: ids:{len(ids)} lookups:{len(lookup_ids)} pool:{pool!r}")
        styles = [
            Flat(base_dir / "flat"),
            RegexSplit(
                base_dir / "two_one_byte_levels", re.compile(r"(.{2})(.{2})(.*)")
            ),
            RegexSplit(
                base_dir / "two_two_byte_levels", re.compile(r"(.{4})(.{4})(.*)")
            ),
            RegexSplit(
                base_dir / "uuid_sep", re.compile(r"(.{8})(.{4})(.{4})(.{4})(.{12})")
            ),
            RegexSplit(base_dir / "bytes_all_the_way_down", re.compile(r"(.{2})")),
        ]
        for style in styles:
            print("RUN:", style)
            bench(pool, ids, lookup_ids, style)
# Start the click command only when executed as a script, not on import.
if __name__ == "__main__":
    run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment