Skip to content

Instantly share code, notes, and snippets.

@BrianHicks
Created November 13, 2014 18:53
Show Gist options
  • Save BrianHicks/99df1add045a08a665dd to your computer and use it in GitHub Desktop.
Save BrianHicks/99df1add045a08a665dd to your computer and use it in GitHub Desktop.
# -*- encoding: utf-8 -*-
import hashlib
import itertools
import os
class BlobStore(dict):
    """In-memory, content-addressed blob store.

    Keys are the identity strings produced by :meth:`identify`; values are
    the blobs exactly as they were handed to :meth:`store`.
    """

    def identify(self, blob):
        """Return the identity of *blob*: ``'sha1-'`` plus its SHA-1 hex digest."""
        digest = hashlib.sha1(blob).hexdigest()
        return 'sha1-%s' % digest

    def store(self, blob):
        """Save *blob* under its identity and return that identity.

        Storing identical content again lands on the same key, so duplicate
        blobs occupy a single entry.
        """
        key = self.identify(blob)
        self[key] = blob
        return key

    def __repr__(self):
        blob_count = len(self)
        return '<BlobStore: %d blobs>' % blob_count
class FileStore(dict):
    """Map filenames to the ordered chunk identities that make up each file.

    File content is split into fixed-size chunks which are handed to a
    backend blob store; this mapping only keeps, per filename, the tuple of
    identities the backend returned. Deleting a filename from the mapping
    does not touch the backend; call :meth:`prune` to drop chunks that no
    file references any more.
    """

    def __init__(self, store, size=8*1024):
        """Create an empty file store on top of *store*.

        Args:
            store: backend blob store. It must provide ``store(chunk)``
                returning the chunk's identity, plus ``keys()``, item
                lookup and item deletion by identity.
            size: maximum chunk length; must be positive.

        Raises:
            ValueError: if *size* is not positive.
        """
        # A zero size would only fail later inside range(); a negative one
        # would silently store zero chunks and lose the content.
        if size <= 0:
            raise ValueError('chunk size must be positive, got %r' % (size,))
        dict.__init__(self)
        self.backend = store
        self.size = size

    def store(self, filename, content):
        """Split *content* into chunks of ``self.size`` and store them.

        Returns:
            *filename*, as returned by :meth:`store_blobs`.
        """
        pieces = (
            content[start:start + self.size]
            for start in range(0, len(content), self.size)
        )
        return self.store_blobs(filename, pieces)

    def store_blobs(self, filename, chunks):
        """Store every chunk in the backend and record their identities.

        Any previous entry for *filename* is replaced.

        Returns:
            *filename*.
        """
        self[filename] = tuple(self.backend.store(chunk) for chunk in chunks)
        return filename

    def prune(self):
        """Delete backend chunks that no stored file references.

        Returns:
            The number of chunks removed from the backend.
        """
        have = set(self.backend.keys())
        need = set(itertools.chain.from_iterable(self.values()))
        unneeded = have - need
        for identity in unneeded:
            del self.backend[identity]
        return len(unneeded)

    def retrieve(self, filename):
        """Reassemble and return the content stored under *filename*.

        The chunks are joined with an empty value of their own type, so both
        byte-string and text chunks work. A file with no chunks yields an
        empty byte string.

        Raises:
            KeyError: if *filename* is unknown, or one of its chunks is
                missing from the backend.
        """
        parts = [self.backend[identity] for identity in self[filename]]
        if not parts:
            return b''
        # parts[0][:0] is an empty bytes/str matching the chunk type; joining
        # with a hard-coded '' fails for bytes chunks on Python 3.
        return parts[0][:0].join(parts)

    def __repr__(self):
        return '<FileStore: %s files>' % len(self)
if __name__ == '__main__':
    # Demo: store a small file and a directory of books, delete the small
    # file, then prune the chunks that are no longer referenced.
    # Single-argument print(...) is used so the statements parse on both
    # Python 2 and Python 3.
    b = BlobStore()
    f = FileStore(b)

    def status(filestore):
        """Return a two-line summary of stored bytes/chunks and file references."""
        stored = '%.2fmb stored in %d chunks\n' % (
            sum(map(len, filestore.backend.values())) / 1024.0 / 1024.0,
            len(filestore.backend)
        )
        referenced = '%d total chunks referenced in %d files' % (
            sum(map(len, filestore.values())), len(filestore)
        )
        return stored + referenced

    def header(text):
        """Return *text* centred in an 80-column rule of '=' characters."""
        return (' %s ' % text).center(80, '=')

    print(header('base'))
    print(status(f))

    print(header('storing a nonsense file'))
    # Byte literal: the blob store hashes its input, which needs bytes on
    # Python 3 (on Python 2 this is the same str object as before).
    print(f.store('x.txt', b'some nonsense'))
    print(status(f))

    print(header('retrieving a nonsense file'))
    print(f.retrieve('x.txt').decode('ascii'))

    print(header('storing some project gutenberg texts'))
    books_dir = os.path.expanduser('~/data/books')
    # os.listdir order is arbitrary; sort so the listing is reproducible.
    for book in sorted(os.listdir(books_dir)):
        # Binary mode: content is stored as raw blobs, so avoid any newline
        # translation or decoding.
        with open(os.path.join(books_dir, book), 'rb') as bookhandle:
            print(f.store(book, bookhandle.read()))
    print(status(f))

    print(header('deleting nonsense file'))
    del f['x.txt']
    print(status(f))

    print(header('pruning'))
    print('%d chunks pruned' % f.prune())
    print(status(f))
===================================== base =====================================
0.00mb stored in 0 chunks
0 total chunks referenced in 0 files
=========================== storing a nonsense file ============================
x.txt
0.00mb stored in 1 chunks
1 total chunks referenced in 1 files
========================== retrieving a nonsense file ==========================
some nonsense
===================== storing some project gutenberg texts =====================
frankenstein.txt
huckleberryfinn.txt
metamorphosis.txt
prideandprejudice.txt
sherlockholmes.txt
2.40mb stored in 310 chunks
310 total chunks referenced in 6 files
============================ deleting nonsense file ============================
2.40mb stored in 310 chunks
309 total chunks referenced in 5 files
=================================== pruning ====================================
1 chunks pruned
2.40mb stored in 309 chunks
309 total chunks referenced in 5 files
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment