Hash stuff
# SOURCE: https://home.cc.umanitoba.ca/~psgendb/doc/spades/joblib3/hashing.py
"""
Fast cryptographic hash of Python objects, with a special case for fast
hashing of numpy arrays.
"""
# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
# Copyright (c) 2009 Gael Varoquaux
# License: BSD Style, 3 clauses.
import warnings
import pickle
import hashlib
import sys
import types
import struct
import io
try:
    # Compatibility shim from the joblib package; assume the Python 3
    # definition (bytes, str) as a fallback when this file is used standalone.
    from ._compat import _bytes_or_unicode
except ImportError:
    _bytes_or_unicode = (bytes, str)
PY3 = sys.version_info[0] == 3
if PY3:
    Pickler = pickle._Pickler
else:
    Pickler = pickle.Pickler
class _ConsistentSet(object):
    """ Class used to ensure the hash of Sets is preserved
        whatever the order of its items.
    """
    def __init__(self, set_sequence):
        self._sequence = sorted(set_sequence)
class _MyHash(object):
    """ Class used to hash objects that won't normally pickle """
    def __init__(self, *args):
        self.args = args
class Hasher(Pickler):
    """ A subclass of pickler, to do cryptographic hashing, rather than
        pickling.
    """
    def __init__(self, hash_name='md5'):
        self.stream = io.BytesIO()
        # By default we want a pickle protocol that only changes with
        # the major python version and not the minor one
        protocol = (pickle.DEFAULT_PROTOCOL if PY3
                    else pickle.HIGHEST_PROTOCOL)
        Pickler.__init__(self, self.stream, protocol=protocol)
        # Initialise the hash obj
        self._hash = hashlib.new(hash_name)
    def hash(self, obj, return_digest=True):
        try:
            self.dump(obj)
        except pickle.PicklingError as e:
            warnings.warn('PicklingError while hashing %r: %r' % (obj, e))
        dumps = self.stream.getvalue()
        self._hash.update(dumps)
        if return_digest:
            return self._hash.hexdigest()
    def save(self, obj):
        if isinstance(obj, (types.MethodType, type({}.pop))):
            # the Pickler cannot pickle instance methods; here we decompose
            # them into components that make them uniquely identifiable
            if hasattr(obj, '__func__'):
                func_name = obj.__func__.__name__
            else:
                func_name = obj.__name__
            inst = obj.__self__
            if type(inst) == type(pickle):
                obj = _MyHash(func_name, inst.__name__)
            elif inst is None:
                # type(None) or type(module) do not pickle
                obj = _MyHash(func_name, inst)
            else:
                cls = obj.__self__.__class__
                obj = _MyHash(func_name, inst, cls)
        Pickler.save(self, obj)
    def memoize(self, obj):
        # We want hashing to be sensitive to value instead of reference.
        # For example we want ['aa', 'aa'] and ['aa', 'aaZ'[:2]]
        # to hash to the same value and that's why we disable memoization
        # for strings
        if isinstance(obj, _bytes_or_unicode):
            return
        Pickler.memoize(self, obj)
    # The dispatch table of the pickler is not accessible in Python
    # 3; as these lines are only bugware for IPython, we skip them.
    def save_global(self, obj, name=None, pack=struct.pack):
        # We have to override this method in order to deal with objects
        # defined interactively in IPython that are not injected in
        # __main__
        kwargs = dict(name=name, pack=pack)
        if sys.version_info >= (3, 4):
            del kwargs['pack']
        try:
            Pickler.save_global(self, obj, **kwargs)
        except pickle.PicklingError:
            Pickler.save_global(self, obj, **kwargs)
            module = getattr(obj, "__module__", None)
            if module == '__main__':
                my_name = name
                if my_name is None:
                    my_name = obj.__name__
                mod = sys.modules[module]
                if not hasattr(mod, my_name):
                    # IPython doesn't inject the variables defined
                    # interactively in __main__
                    setattr(mod, my_name, obj)
    dispatch = Pickler.dispatch.copy()
    # builtin
    dispatch[type(len)] = save_global
    # type
    dispatch[type(object)] = save_global
    # classobj
    dispatch[type(Pickler)] = save_global
    # function
    dispatch[type(pickle.dump)] = save_global
    def _batch_setitems(self, items):
        # forces order of keys in dict to ensure consistent hash
        Pickler._batch_setitems(self, iter(sorted(items)))
    def save_set(self, set_items):
        # forces order of items in Set to ensure consistent hash
        Pickler.save(self, _ConsistentSet(set_items))
    dispatch[type(set())] = save_set
class NumpyHasher(Hasher):
    """ Special case the hasher for when numpy is loaded.
    """
    def __init__(self, hash_name='md5', coerce_mmap=False):
        """
            Parameters
            ----------
            hash_name: string
                The hash algorithm to be used
            coerce_mmap: boolean
                Make no difference between np.memmap and np.ndarray
                objects.
        """
        self.coerce_mmap = coerce_mmap
        Hasher.__init__(self, hash_name=hash_name)
        # delayed import of numpy, to avoid tight coupling
        import numpy as np
        self.np = np
        if hasattr(np, 'getbuffer'):
            self._getbuffer = np.getbuffer
        else:
            self._getbuffer = memoryview
    def save(self, obj):
        """ Subclass the save method, to hash ndarray subclasses, rather
            than pickling them. Of course, this is a total abuse of
            the Pickler class.
        """
        if isinstance(obj, self.np.ndarray) and not obj.dtype.hasobject:
            # Compute a hash of the object:
            try:
                # memoryview is not supported for some dtypes,
                # e.g. datetime64, see
                # https://github.com/numpy/numpy/issues/4983. The
                # workaround is to view the array as bytes before
                # taking the memoryview
                obj_bytes_view = obj.view(self.np.uint8)
                self._hash.update(self._getbuffer(obj_bytes_view))
            # ValueError is raised by .view when the array is not contiguous
            # BufferError is raised by Python 3 in the hash update if
            # the array is Fortran rather than C contiguous
            except (ValueError, BufferError):
                # Cater for non-single-segment arrays: this creates a
                # copy, and thus alleviates this issue.
                # XXX: There might be a more efficient way of doing this
                obj_bytes_view = obj.flatten().view(self.np.uint8)
                self._hash.update(self._getbuffer(obj_bytes_view))
            # We store the class, to be able to distinguish between
            # objects with the same binary content, but different
            # classes.
            if self.coerce_mmap and isinstance(obj, self.np.memmap):
                # We don't make the difference between memmap and
                # normal ndarrays, to be able to reload previously
                # computed results with memmap.
                klass = self.np.ndarray
            else:
                klass = obj.__class__
            # We also return the dtype and the shape, to distinguish
            # different views on the same data with different dtypes.
            # The object will be pickled by the pickler and hashed at the end.
            obj = (klass, ('HASHED', obj.dtype, obj.shape, obj.strides))
        elif isinstance(obj, self.np.dtype):
            # Atomic dtype objects are interned by their default constructor:
            # np.dtype('f8') is np.dtype('f8')
            # This interning is not maintained by a
            # pickle.loads + pickle.dumps cycle, because __reduce__
            # uses copy=True in the dtype constructor. This
            # non-deterministic behavior causes the internal memoizer
            # of the hasher to generate different hash values
            # depending on the history of the dtype object.
            # To prevent the hash from being sensitive to this, we use
            # .descr which is a full (and never interned) description of
            # the array dtype according to the numpy doc.
            klass = obj.__class__
            obj = (klass, ('HASHED', obj.descr))
        Hasher.save(self, obj)
def hash(obj, hash_name='md5', coerce_mmap=False):
    """ Quick calculation of a hash to uniquely identify Python objects
        containing numpy arrays.
        Parameters
        ----------
        hash_name: 'md5' or 'sha1'
            Hashing algorithm used. sha1 is supposedly safer, but md5 is
            faster.
        coerce_mmap: boolean
            Make no difference between np.memmap and np.ndarray
    """
    if 'numpy' in sys.modules:
        hasher = NumpyHasher(hash_name=hash_name, coerce_mmap=coerce_mmap)
    else:
        hasher = Hasher(hash_name=hash_name)
    return hasher.hash(obj)
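# A minimal usage sketch, not part of the original joblib module: it assumes
# this file is run directly as a script and that numpy is installed for the
# array example. It shows that the module-level hash() is value-based, so
# equal objects produce equal digests regardless of identity or dict key
# insertion order.
if __name__ == '__main__':
    # Two distinct but equal dicts hash to the same digest, because
    # Hasher._batch_setitems sorts the keys before pickling.
    a = {'x': [1, 2, 3], 'y': {'nested': (4, 5)}}
    b = {'y': {'nested': (4, 5)}, 'x': [1, 2, 3]}
    assert hash(a) == hash(b)
    print('dict digest:', hash(a))
    try:
        import numpy as np
    except ImportError:
        np = None
    if np is not None:
        # Once numpy is imported, hash() picks NumpyHasher, which feeds the
        # raw array buffer to the digest instead of pickling the array.
        arr = np.arange(10, dtype=np.float64)
        print('array digest:', hash(arr))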