Last active
March 28, 2024 20:14
-
-
Save dsevero/3f3db7acb45d6cd8e945e8a32eaca168 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from hashlib import sha256 | |
from pandas.util import hash_pandas_object | |
from functools import lru_cache | |
class HashableDataFrame(pd.DataFrame):
    """A ``pd.DataFrame`` that is hashable, so it can be used as a cache key.

    Plain DataFrames cannot be hashed, which prevents passing them to
    functions decorated with ``functools.lru_cache``. This subclass defines
    ``__hash__`` from the frame's contents and ``__eq__`` as whole-object
    equality, which is the pair of operations a dict/cache lookup needs.

    Note that ``==`` on instances returns a single ``bool`` (via
    ``DataFrame.equals``) instead of an element-wise comparison.
    """

    def __init__(self, obj=None, *args, **kwargs):
        """Build the frame from *obj*.

        Any further positional/keyword arguments (``index``, ``columns``,
        ``dtype``, ...) are forwarded unchanged to ``pd.DataFrame``.
        """
        super().__init__(obj, *args, **kwargs)

    def __hash__(self):
        """Return a content-based hash that is stable across Python runs."""
        row_hashes = hash_pandas_object(self, index=True).values
        # Convert the digest to an int directly. Hashing the hex *string*
        # with the builtin hash() would be salted per process and therefore
        # differ from one interpreter run to the next.
        return int(sha256(row_hashes).hexdigest(), 16)

    def __eq__(self, other):
        """Return ``True`` when ``self.equals(other)`` holds."""
        return self.equals(other)
class HashableSeries(pd.Series):
    """A ``pd.Series`` that is hashable, so it can be used as a cache key.

    Plain Series cannot be hashed, which prevents passing them to functions
    decorated with ``functools.lru_cache``. This subclass defines
    ``__hash__`` from the series' contents and ``__eq__`` as whole-object
    equality, which is the pair of operations a dict/cache lookup needs.

    Note that ``==`` on instances returns a single ``bool`` (via
    ``Series.equals``) instead of an element-wise comparison.
    """

    def __init__(self, obj=None, *args, **kwargs):
        """Build the series from *obj*.

        Any further positional/keyword arguments (``index``, ``dtype``,
        ``name``, ...) are forwarded unchanged to ``pd.Series``.
        """
        super().__init__(obj, *args, **kwargs)

    def __hash__(self):
        """Return a content-based hash that is stable across Python runs."""
        element_hashes = hash_pandas_object(self, index=True).values
        # Convert the digest to an int directly. Hashing the hex *string*
        # with the builtin hash() would be salted per process and therefore
        # differ from one interpreter run to the next.
        return int(sha256(element_hashes).hexdigest(), 16)

    def __eq__(self, other):
        """Return ``True`` when ``self.equals(other)`` holds."""
        return self.equals(other)
def make_hashable_pandas_object(obj):
    """Wrap a pandas DataFrame or Series in its hashable counterpart.

    Args:
        obj: A ``pd.DataFrame`` or ``pd.Series`` (subclasses included).

    Returns:
        A ``HashableDataFrame`` or ``HashableSeries`` built from *obj*.
        An object that is already of the hashable type is returned as-is.

    Raises:
        TypeError: If *obj* is neither a DataFrame nor a Series.
    """
    # isinstance (rather than an exact type match) accepts subclasses, so
    # calling this on an already-hashable object no longer raises.
    if isinstance(obj, pd.DataFrame):
        if isinstance(obj, HashableDataFrame):
            return obj
        return HashableDataFrame(obj)
    if isinstance(obj, pd.Series):
        if isinstance(obj, HashableSeries):
            return obj
        return HashableSeries(obj)
    # TypeError derives from Exception, so `except Exception` still catches it.
    raise TypeError(f'obj must be of type DataFrame or Series, '
                    f'but is {type(obj)}')
@lru_cache(maxsize=128)
def cached_function_with_pandas(df, s):
    """Example of a memoized function that takes pandas objects.

    ``lru_cache`` uses the arguments as the cache key, so both *df* and *s*
    must be hashable. The body is a placeholder for an expensive computation.
    """
    # heavy computation ...
    outcome = ...
    return outcome
if __name__ == '__main__':
    # Usage demo: replace the two placeholders with real pandas objects.
    df = ...  # some pandas dataframe
    s = ...  # some pandas series

    s_hashable = make_hashable_pandas_object(s)
    df_hashable = make_hashable_pandas_object(df)

    # Plain pandas objects are unhashable, so the cache rejects them with a
    # TypeError. Catch it here so the demo still reaches the working call.
    try:
        cached_function_with_pandas(df, s)  # raises exception
    except TypeError as err:
        print(f'plain pandas objects cannot be cached: {err}')

    cached_function_with_pandas(df_hashable, s_hashable)  # works!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Note that `hash()` is not deterministic across Python runs [src]. If you change the `hash_value = hash(hash_value.hexdigest())` line in `__hash__` to the following, it should be deterministic across Python runs: