Created
May 8, 2019 05:08
-
-
Save kazimuth/7bbf5d8593f23aac2979a4d1be5e8d43 to your computer and use it in GitHub Desktop.
filesystem persistence for python science stuff
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Any, Union | |
import hashlib | |
import pandas | |
import numpy as np | |
import json | |
import os | |
from pathlib import Path | |
def _load_json(path: Path):
    """Read and return the JSON document stored at ``path``.

    Args:
        path: Location of an existing JSON file.

    Returns:
        The decoded Python object (dict, list, str, number, bool or None).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # The context manager closes the handle deterministically instead of
    # leaving it to the garbage collector.
    with path.open('r', encoding='utf-8') as handle:
        return json.load(handle)
def _store_json(path: Path, obj: Any):
    """Write ``obj`` to ``path`` as JSON if it is a JSON-friendly value.

    Args:
        path: Destination file. Any existing content is replaced.
        obj: Candidate value. Only ``dict``, ``list``, ``str``, ``int`` and
            ``float`` instances (including subclasses such as ``bool``) are
            accepted.

    Returns:
        ``True`` if the value was written, ``False`` if ``obj`` is not a
        type this serializer handles (nothing is written in that case).

    Raises:
        TypeError: If ``obj`` contains a value ``json`` cannot encode.
    """
    if not isinstance(obj, (dict, list, str, int, float)):
        return False
    # Encode before touching the file so a failure cannot leave a
    # half-written document behind.
    payload = json.dumps(obj)
    # Mode 'w' (not 'a'): appending to an existing file would concatenate
    # two documents and make the file unreadable.
    with path.open('w', encoding='utf-8') as handle:
        handle.write(payload)
    return True
def _load_npy(path: Path):
    """Load the NumPy file at ``path`` with ``np.load`` and return the result."""
    loaded = np.load(path)
    return loaded
def _store_npy(path: Path, arr: Any):
    """Save ``arr`` to ``path`` with ``np.save`` when it is a NumPy array.

    Returns ``True`` after saving an ``np.ndarray``; returns ``None`` and
    writes nothing for any other kind of object.
    """
    if not isinstance(arr, np.ndarray):
        return None
    np.save(path, arr)
    return True
def _load_parquet(path: Path):
    """Read the parquet file at ``path`` into a pandas object.

    The path is converted to ``str`` before being handed to
    ``pandas.read_parquet``.
    """
    location = str(path)
    return pandas.read_parquet(location)
def _store_parquet(path: Path, df: Any):
    """Write ``df`` to ``path`` as parquet when it is a pandas DataFrame.

    Returns ``True`` after writing a ``pandas.DataFrame``; returns ``None``
    and writes nothing for any other kind of object.
    """
    if not isinstance(df, pandas.DataFrame):
        return None
    df.to_parquet(path)
    return True
def _load_datastore(path: Path):
    """Wrap ``path`` in a ``DataStore`` when it is a directory.

    Returns ``None`` when ``path`` is not an existing directory.
    """
    return DataStore(path) if path.is_dir() else None
# Registry of supported on-disk formats, as (suffix, load, store) tuples.
# DataStore walks this list in order for both reads and writes, so earlier
# entries take priority.
#   load(path)       -> the loaded object (DataStore skips a None result)
#   store(path, obj) -> truthy if it wrote ``obj``, falsy if it declined
SERIALIZERS = [
    # Empty suffix: paired with the nested-DataStore loader. Its store
    # function always returns False, so it is never used to write a value.
    ('', _load_datastore, lambda *args: False),
    ('parquet', _load_parquet, _store_parquet),
    ('npy', _load_npy, _store_npy),
    ('json', _load_json, _store_json)
]
class DataStore:
    """Directory-backed key/value store.

    Each key maps to a file ``<root>/<key>.<suffix>`` whose suffix and
    format are chosen by the first entry of ``SERIALIZERS`` that accepts the
    value. Keys may contain ``/`` to address files in sub-directories. A key
    that names a sub-directory (and has no stored file) resolves to a nested
    ``DataStore`` over that directory.
    """

    def __init__(self, path: Union[Path, str]):
        """Open the store rooted at ``path``, creating the directory if needed.

        Args:
            path: Root directory of the store, as a ``Path`` or string.
        """
        self.path = Path(path)
        if not self.path.exists():
            # exist_ok covers another process creating the directory
            # between the check above and this call.
            self.path.mkdir(parents=True, exist_ok=True)

    def __repr__(self):
        return f'{type(self).__name__}({str(self.path)!r})'

    def _validate(self, key: str):
        """Reject keys that could address a location outside the store.

        Raises:
            Exception: If ``key`` contains ``..`` or ``/./``, or is an
                absolute path (joining an absolute path onto the root would
                discard the root entirely).
        """
        if '..' in key or '/./' in key or Path(key).is_absolute():
            raise Exception(f'invalid key: "{key}"')

    def _path_for(self, key: str, suffix: str) -> Path:
        """Return the on-disk location of ``key`` for a serializer suffix."""
        # An empty suffix addresses the bare key (a nested store directory),
        # not a file named "<key>." with a dangling dot.
        if suffix:
            return self.path / f'{key}.{suffix}'
        return self.path / key

    def __getitem__(self, key: str):
        """Load the value stored under ``key``.

        Returns:
            The loaded object; a nested ``DataStore`` if ``key`` names a
            sub-directory and no stored file matches; ``None`` if nothing
            is stored under ``key``.

        Raises:
            Exception: If ``key`` fails validation.
        """
        self._validate(key)
        # Stable sort moving empty-suffix (directory) entries last: a stored
        # file always wins over a directory of the same name, so lookups
        # that found a value before keep returning that value.
        ordered = sorted(SERIALIZERS, key=lambda entry: entry[0] == '')
        for suffix, load, _ in ordered:
            path = self._path_for(key, suffix)
            present = path.is_file() if suffix else path.is_dir()
            if not present:
                continue
            result = load(path)
            if result is not None:
                return result
        return None

    def __setitem__(self, key: str, obj: Any):
        """Store ``obj`` under ``key``, replacing any previous value.

        Raises:
            Exception: If ``key`` fails validation, or if no serializer in
                ``SERIALIZERS`` accepts ``obj``.
        """
        self._validate(key)
        # Remove every earlier representation first: the new value may be
        # written under a different suffix than the old one, and a stale
        # file would otherwise shadow it or be merged with it.
        for suffix, _, _ in SERIALIZERS:
            stale = self._path_for(key, suffix)
            if suffix and stale.is_file():
                stale.unlink()
        # Keys may contain '/', so the containing directory may not exist.
        (self.path / key).parent.mkdir(parents=True, exist_ok=True)
        for suffix, _, store in SERIALIZERS:
            if store(self._path_for(key, suffix), obj):
                return
        raise Exception(f'failed to serialize {obj} to {self.path}/{key}')

    def make(self, key: str):
        """Return a ``DataStore`` rooted at the sub-directory ``key``.

        The sub-directory is created if it does not exist.
        """
        return DataStore(self.path / key)
# Module-level default stores. Constructing a DataStore creates its directory
# when missing, so importing this module creates ``data`` and ``data/fsmemo``
# relative to the current working directory.
STORE = DataStore('data')
# Sub-store holding the memo/result entries written by ``fsmemo``.
CACHE = STORE.make('fsmemo')
def _const_token(value):
    """Return a text form of a code-object constant that is stable across runs."""
    if hasattr(value, 'co_code'):
        # Nested function/lambda/comprehension: the default repr embeds a
        # memory address, so describe its contents instead.
        return _code_token(value)
    if isinstance(value, tuple):
        return '(' + ','.join(_const_token(item) for item in value) + ')'
    if isinstance(value, frozenset):
        # Set iteration order can vary with string hash randomization.
        return '{' + ','.join(sorted(_const_token(item) for item in value)) + '}'
    return f'{type(value).__name__}:{value!r}'


def _code_token(code):
    """Describe a code object by its bytecode, referenced names and constants."""
    parts = [code.co_code.hex(), repr(code.co_names)]
    parts.extend(_const_token(const) for const in code.co_consts)
    return '|'.join(parts)


def ident(f):
    """Return an identifier for ``f`` that changes when its code changes.

    The identifier has the form ``<module>.<name>.<hash>``, where ``<hash>``
    is the first 8 hex digits of a SHA3-224 digest over the function's
    bytecode, the names it references and its constants (recursing into
    nested code objects). Hashing the bytecode alone is not enough: editing
    a literal or swapping one referenced attribute/global for another leaves
    ``co_code`` unchanged, which would keep an outdated identifier alive.

    Args:
        f: A Python function (anything exposing ``__code__``,
            ``__module__`` and ``__name__``).

    Returns:
        The identifier string.
    """
    token = _code_token(f.__code__)
    code_hash = hashlib.sha3_224(token.encode('utf-8')).hexdigest()[:8]
    return f'{f.__module__}.{f.__name__}.{code_hash}'
def fsmemo(fun):
    """Memoize ``fun`` on disk through the module-level ``CACHE`` store.

    Each distinct call signature gets a slot ``<ident(fun)>/<hash>`` holding
    two entries: ``memo`` (the JSON-encoded arguments) and ``result`` (the
    return value). A call whose encoded arguments equal the stored memo
    returns the stored result without invoking ``fun``.

    Args:
        fun: Function to wrap. Its arguments must be JSON-serializable and
            its return value must be accepted by ``CACHE``.

    Returns:
        A wrapper with the same call signature and metadata as ``fun``.

    Raises:
        TypeError: From the wrapper, if an argument cannot be JSON-encoded.
        Exception: From the wrapper, if ``CACHE`` cannot store the result.
    """
    import functools  # local import keeps this block self-contained

    fid = ident(fun)

    @functools.wraps(fun)
    def wrapper(*args, **kwargs):
        # Positional and keyword arguments are encoded under separate keys;
        # flattening them into one list would make f(["a", 1]) and f(a=1)
        # indistinguishable.
        memo = json.dumps({
            'args': list(args),
            'kwargs': sorted(kwargs.items()),
        })
        key = hashlib.sha3_224(memo.encode('utf-8')).hexdigest()[:16]
        slot = f'{fid}/{key}'
        # Comparing the full memo guards against truncated-hash collisions.
        if CACHE[f'{slot}/memo'] == memo:
            return CACHE[f'{slot}/result']
        result = fun(*args, **kwargs)
        # Result first, memo second: the memo marks the slot as valid, so it
        # must only appear once the result has been stored successfully.
        CACHE[f'{slot}/result'] = result
        CACHE[f'{slot}/memo'] = memo
        return result

    return wrapper
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment