Skip to content

Instantly share code, notes, and snippets.

@toast38coza
Last active November 4, 2019 08:26
Show Gist options
  • Save toast38coza/094017d00c68dc563f69ff5874b531b6 to your computer and use it in GitHub Desktop.
Save toast38coza/094017d00c68dc563f69ff5874b531b6 to your computer and use it in GitHub Desktop.
Take a large file and cache it in memcached

Installation

pip install -r requirements.txt

Run

python chunker.py

Test

python test_chunker.py
from pymemcache.client.base import Client
import hashlib
def get_client(host='localhost', port=11211):
    """Create a pymemcache client for a memcached server.

    Args:
        host: Hostname of the memcached server. Defaults to ``'localhost'``.
        port: TCP port of the memcached server. Defaults to ``11211``.

    Returns:
        A ``Client`` constructed with the ``(host, port)`` address tuple.
    """
    return Client((host, port))
def get_md5(file_path):
    """Return the raw MD5 digest of the file at *file_path*.

    The file is hashed block by block so that a large file never has to be
    held in memory in full, and the handle is closed even if a read fails.

    Args:
        file_path: Path of the file to hash; opened in binary mode.

    Returns:
        The MD5 digest as ``bytes`` (the result of ``digest()``, not the
        hex string).
    """
    md5 = hashlib.md5()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel keeps calling read() until it returns b''
        # (end of file).
        for block in iter(lambda: f.read(65536), b''):
            md5.update(block)
    return md5.digest()
def get_chunked_file(file_path, chunk_size):
    """Split the file at *file_path* into consecutive byte chunks.

    Args:
        file_path: Path of the file to read; opened in binary mode.
        chunk_size: Number of bytes requested per read; expected to be a
            positive integer.

    Returns:
        A list of ``bytes`` objects which, joined in order, reproduce the
        file contents. The empty read that signals end of file is not
        included, so an empty file yields ``[]``.
    """
    with open(file_path, 'rb') as f:
        # Stop at the b'' that read() returns at EOF instead of storing it
        # as a useless trailing chunk.
        return list(iter(lambda: f.read(chunk_size), b''))
def store_in_memcached(chunks, key, checksum):
    """Store a checksum and a sequence of chunks in memcached under *key*.

    The checksum is written to ``"<key>:hash"`` and the chunk at position
    ``i`` is written to ``"<key>:<i>"``.

    Args:
        chunks: Iterable of chunk values, stored in iteration order.
        key: Prefix shared by every memcached key written here.
        checksum: Value stored under ``"<key>:hash"``.
    """
    client = get_client()
    try:
        client.set("{}:hash".format(key), checksum)
        for index, chunk in enumerate(chunks):
            client.set("{}:{}".format(key, index), chunk)
    finally:
        # Close the client so the connection is not left open after the
        # call, including when one of the writes raises.
        client.close()
def get_from_memcached(lenth_of_chunks, key):
    """Read back the chunks and checksum stored under *key* in memcached.

    Args:
        lenth_of_chunks: Number of chunks to fetch; the keys ``"<key>:0"``
            through ``"<key>:<lenth_of_chunks - 1>"`` are read.
        key: Prefix that was used when the chunks were stored.

    Returns:
        A ``(result, checksum)`` tuple: the chunk values joined in index
        order, and the value stored under ``"<key>:hash"`` (``None`` when
        that key is absent).

    Raises:
        KeyError: If any requested chunk is missing from the cache, e.g.
            because it was never stored or has been evicted.
    """
    chunk_keys = ["{}:{}".format(key, index)
                  for index in range(lenth_of_chunks)]
    hash_key = "{}:hash".format(key)

    client = get_client()
    try:
        # Fetch everything in a single request. get_many returns a dict, so
        # chunk order is restored below by walking chunk_keys in sequence.
        found = client.get_many(chunk_keys + [hash_key])
    finally:
        client.close()

    missing = [chunk_key for chunk_key in chunk_keys if chunk_key not in found]
    if missing:
        # Fail with the offending keys rather than letting b''.join choke on
        # a missing value.
        raise KeyError(
            'Missing chunks in memcached: {}'.format(', '.join(missing)))

    result = b''.join(found[chunk_key] for chunk_key in chunk_keys)
    return (result, found.get(hash_key))
# before, after
def validate(result, original_md5):
    """Check that the MD5 digest of *result* equals *original_md5*.

    Args:
        result: Bytes whose MD5 digest is computed.
        original_md5: Expected raw MD5 digest, as produced by ``digest()``.

    Raises:
        AssertionError: If the two digests differ. It is raised explicitly
            rather than through an ``assert`` statement so the check still
            runs when Python is started with ``-O``.
    """
    result_md5 = hashlib.md5(result).digest()
    if result_md5 != original_md5:
        raise AssertionError(
            'Got: {}. Expected: {}'.format(result_md5, original_md5))
def main(file_name='bigoldfile.dat', chunk_size=999):
    """Cache a file in memcached in chunks, read it back and verify it.

    The file's MD5 digest is stored alongside the chunks; after the round
    trip the reassembled bytes are validated against the digest that was
    read back from the cache.

    Args:
        file_name: Path of the file to cache; also used as the memcached
            key prefix. Defaults to ``'bigoldfile.dat'``.
        chunk_size: Number of bytes per chunk. Defaults to ``999``.
    """
    original_checksum = get_md5(file_name)
    chunks = get_chunked_file(file_name, chunk_size)
    store_in_memcached(chunks, file_name, original_checksum)
    result, checksum = get_from_memcached(len(chunks), file_name)
    validate(result, checksum)


if __name__ == '__main__':
    main()
import unittest
from chunker import get_chunked_file, store_in_memcached, get_client, get_from_memcached, get_md5, validate
host = 'localhost'
port = 11211
class ChunkerTestCase(unittest.TestCase):
    """Tests for the chunker helpers.

    The memcached tests talk to whatever server ``get_client()`` connects
    to, and the file tests read ``test_file.dat`` and ``smalloldfile.dat``
    from the working directory.
    """

    # Checksum stored and compared by the tests below; test_validate checks
    # it against the MD5 digest of b'abcdefghi'.
    CHECKSUM = b'\x8a\xa9\x9b\x1fC\x9f\xf7\x12\x93\xe9SW\xba\xc6\xfd\x94'
    CHUNKS = [b'abc', b'def', b'ghi']

    def setUp(self):
        self.c = get_client()

    def tearDown(self):
        # Do not leave one open connection behind per test.
        self.c.close()

    def test_get_md5(self):
        m = get_md5('test_file.dat')
        self.assertEqual(
            m, b'\x0b\x06\xc7\x08i\xa3\x0737\x9ar\xe2\xa8\xc07X',
            'md5 doesnt match. Got: {}'.format(m))

    def test_get_chunked_file(self):
        result = get_chunked_file('smalloldfile.dat', 10)
        with open('smalloldfile.dat', 'rb') as f:
            expected = f.read()
        # The chunks must reproduce the file and respect the size limit.
        self.assertEqual(b''.join(result), expected)
        for chunk in result:
            self.assertLessEqual(len(chunk), 10)

    def test_store_in_chunks(self):
        store_in_memcached(self.CHUNKS, "test", self.CHECKSUM)
        self.assertEqual(
            self.c.get("test:0"), b'abc',
            'Expect test:0 to be abc. got: {}'.format(self.c.get("test:0")))
        self.assertEqual(self.c.get("test:1"), b'def')
        self.assertEqual(self.c.get("test:2"), b'ghi')
        self.assertEqual(self.c.get("test:hash"), self.CHECKSUM)

    def test_get_from_memcached(self):
        # Store the data here instead of relying on another test having run
        # first; unittest orders test methods alphabetically.
        store_in_memcached(self.CHUNKS, "test", self.CHECKSUM)
        result, checksum = get_from_memcached(3, "test")
        self.assertEqual(
            result, b'abcdefghi',
            'Expect result to be abcdefghi. Got: {}'.format(result))
        self.assertEqual(checksum, self.CHECKSUM)

    def test_validate(self):
        validate(b'abcdefghi', self.CHECKSUM)


if __name__ == '__main__':
    unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment