Last active
December 22, 2022 16:15
-
-
Save danielpodrazka/d65e77c07963a26f5672630b36e36caf to your computer and use it in GitHub Desktop.
Store the file below in your project and import it into the script you are currently working on. I wrote an article about this gist: https://medium.com/@daniep/speed-up-developing-python-etls-with-invisible-cache-eb2eaadf6918
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import pandas as pd | |
from diskcache import FanoutCache | |
from diskcache.core import ENOVAL, args_to_key, ft, full_name | |
class cconnect(object):
    """No-op context manager whose ``begin()`` and ``with`` both yield itself.

    Entering the context (or calling ``begin()``) returns the instance;
    leaving it does nothing and never suppresses an exception.
    """

    def begin(self):
        """Return this instance, exactly like entering the context."""
        # Defined as a regular method instead of assigning the bound
        # ``self.__enter__`` in ``__init__``: storing a bound method on the
        # instance makes the object reference itself.
        return self.__enter__()

    def __enter__(self):
        """Enter the context and hand back the instance itself."""
        return self

    def __exit__(self, type, value, traceback):
        """Leave the context; returns None so exceptions propagate."""
        return None
def memoize(
    self, name=None, typed=False, expire=None, tag=None, ignore=(), refresh=False
):
    """Return a decorator that caches a callable's results in this cache.

    Written to be installed as a cache method, so ``self`` is the cache
    instance providing ``get``, ``set`` and ``delete``.

    :param name: key prefix; defaults to the full name of the wrapped callable
    :param typed: forwarded to ``args_to_key`` when building the cache key
    :param expire: expiry passed to ``set``; a value ``<= 0`` disables storing
    :param tag: tag passed to ``set`` for stored results
    :param ignore: positional indexes / keyword names left out of the key
    :param refresh: when true, drop any stored result and always recompute
    :raises TypeError: if ``name`` is callable (decorator used without ``()``)
    :return: decorator producing the caching wrapper
    """
    if callable(name):
        raise TypeError("name cannot be callable")

    def decorator(func):
        """Decorator created by memoize() for callable `func`."""
        base = (full_name(func),) if name is None else (name,)

        @ft.wraps(func)
        def wrapper(*args, **kwargs):
            """Wrapper for callable to cache arguments and return values."""
            key = wrapper.__cache_key__(*args, **kwargs)

            if refresh:
                # Evict unconditionally. Checking ``get(key) is not None``
                # first would never refresh a cached ``None`` result, and a
                # separate lookup followed by ``del`` can raise KeyError if
                # the entry vanishes in between.
                self.delete(key, retry=True)
                result = ENOVAL
            else:
                result = self.get(key, default=ENOVAL, retry=True)

            if result is ENOVAL:
                result = func(*args, **kwargs)
                if expire is None or expire > 0:
                    try:
                        self.set(key, result, expire, tag=tag, retry=True)
                    except TypeError:
                        # Best effort: a result that cannot be stored is
                        # still returned to the caller, just not cached.
                        print(f"diskcache: Couldn't cache {key}")

            return result

        def __cache_key__(*args, **kwargs):
            """Make key for cache given function arguments."""
            return args_to_key(base, args, kwargs, typed, ignore)

        wrapper.__cache_key__ = __cache_key__
        return wrapper

    return decorator
# Install the refresh-aware ``memoize`` defined above on every FanoutCache.
FanoutCache.memoize = memoize

DEFAULT_CACHE_SIZE = (2 ** 30) * 50  # 50 GiB

# Shared on-disk cache. Location and size limit can be overridden through the
# LOCAL_CACHE_PATH and LOCAL_CACHE_SIZE environment variables.
cache = FanoutCache(
    os.environ.get(
        "LOCAL_CACHE_PATH", os.path.join(os.path.expanduser("~"), "localcache")
    ),
    # Environment values are strings; the size limit must be an integer
    # number of bytes, so coerce it explicitly.
    size_limit=int(os.environ.get("LOCAL_CACHE_SIZE", DEFAULT_CACHE_SIZE)),
)
# pandas readers that get replaced by cached versions.
cache_functions = [
    "read_feather",
    "read_excel",
    "read_csv",
    "read_table",
    "read_sql",
    "read_sql_query",
    "read_sql_table",
    "read_parquet",
    "read_json",
]

# For each reader ``pd.<name>`` install two wrappers around the original:
#   pd.<name>    - returns the cached result when one exists
#   pd.re<name>  - discards the cached result, re-reads and stores it again
# Both share one cache key because they wrap the same original function.
for func in cache_functions:
    org_func = getattr(pd, func)
    if func in ("read_sql", "read_sql_query", "read_sql_table"):
        # The connection (positional index 1, or the ``con`` keyword) is left
        # out of the cache key so only the query/table and options matter.
        ignored_args = (1, "con")
    else:
        ignored_args = ()
    # Plain attribute access instead of eval/exec on formatted source text.
    setattr(pd, func, cache.memoize(ignore=ignored_args)(org_func))
    setattr(
        pd,
        f"re{func}",
        cache.memoize(refresh=True, ignore=ignored_args)(org_func),
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment