@showell · Created May 3, 2012 15:41
big_files.py
# Python 2 benchmark: random reads of many small files vs. seeks into one big file.
import os
import random
import time


class SmartDir:
    """Spread files across buckets x buckets subdirectories so that no
    single directory holds too many entries."""

    def __init__(self, path):
        self.path = path
        self.buckets = 50

    def clear(self):
        # Wipe and recreate the 50 x 50 bucket directory tree.
        path = self.path
        os.system("rm -Rf %s" % path)
        time.sleep(0.1)
        os.system("mkdir -p %s" % path)
        time.sleep(0.1)
        for i in range(self.buckets):
            for j in range(self.buckets):
                os.system("mkdir -p %s/%s/%s" % (path, i, j))
        time.sleep(0.1)

    def smart_fn(self, fn):
        # Map an integer file name to a two-level bucket path.
        path = self.path
        hash = fn
        subdir1 = hash % self.buckets
        subdir2 = (hash / self.buckets) % self.buckets
        return "%s/%s/%s/%s" % (path, subdir1, subdir2, fn)


def write_lots_of_little_files(rebuild, num_files, file_size):
    smart_dir = SmartDir("tmp_little")
    if rebuild:
        smart_dir.clear()
        for i in range(num_files):
            # Each file starts with its own index so reads can be verified.
            data = "%d\n%s" % (i, str(i) * file_size)
            data = data[:file_size]
            fn = smart_dir.smart_fn(i)
            f = open(fn, 'w')
            f.write(data)
            f.close()

    def read_little(i, file_size):
        fn = smart_dir.smart_fn(i)
        f = open(fn)
        data = f.read(file_size)
        if int(data.split('\n')[0]) != i:
            raise Exception('broken')
        f.close()

    return read_little


def write_big_file(rebuild, num_files, file_size):
    big_file_fn = "tmp_my_big_file"
    if rebuild:
        f = open(big_file_fn, "w")
        for i in range(num_files):
            data = "%d\n%s" % (i, str(i) * file_size)
            data = data[:file_size]
            f.write(data)
        f.close()

    def read_big(i, file_size):
        # One open + seek per read into the single big file.
        fn = big_file_fn
        f = open(fn)
        f.seek(i * file_size)
        data = f.read(file_size)
        if int(data.split('\n')[0]) != i:
            raise Exception('broken')
        f.close()

    return read_big


def benchmark(title, reader, num_trials, num_files, file_size):
    print "----- benchmark for %s" % title
    t_start = time.time()
    for i in range(num_trials):
        n = random.randrange(num_files)
        reader(n, file_size)
    trial_time = time.time() - t_start
    print "num_trials", num_trials
    print "trial time", trial_time
    print


def main():
    num_files = 20000
    file_size = 2000
    rebuild = True
    read_big = write_big_file(rebuild, num_files, file_size)
    read_little = write_lots_of_little_files(rebuild, num_files, file_size)
    num_trials = 10000
    benchmark("little files", read_little, num_trials, num_files, file_size)
    benchmark("big files with seeks", read_big, num_trials, num_files, file_size)


main()