Skip to content

Instantly share code, notes, and snippets.

@numpde
Created June 17, 2021 20:19
Show Gist options
  • Save numpde/2da5560b1853f017b64b546d81c9dabf to your computer and use it in GitHub Desktop.
Save numpde/2da5560b1853f017b64b546d81c9dabf to your computer and use it in GitHub Desktop.
Hash stuff
# SOURCE: https://home.cc.umanitoba.ca/~psgendb/doc/spades/joblib3/hashing.py
"""
Fast cryptographic hash of Python objects, with a special case for fast
hashing of numpy arrays.
"""
# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
# Copyright (c) 2009 Gael Varoquaux
# License: BSD Style, 3 clauses.
import warnings
import pickle
import hashlib
import sys
import types
import struct
from ._compat import _bytes_or_unicode
import io
PY3 = sys.version[0] == '3'
if PY3:
Pickler = pickle._Pickler
else:
Pickler = pickle.Pickler
class _ConsistentSet(object):
""" Class used to ensure the hash of Sets is preserved
whatever the order of its items.
"""
def __init__(self, set_sequence):
self._sequence = sorted(set_sequence)
class _MyHash(object):
""" Class used to hash objects that won't normally pickle """
def __init__(self, *args):
self.args = args
class Hasher(Pickler):
""" A subclass of pickler, to do cryptographic hashing, rather than
pickling.
"""
def __init__(self, hash_name='md5'):
self.stream = io.BytesIO()
# By default we want a pickle protocol that only changes with
# the major python version and not the minor one
protocol = (pickle.DEFAULT_PROTOCOL if PY3
else pickle.HIGHEST_PROTOCOL)
Pickler.__init__(self, self.stream, protocol=protocol)
# Initialise the hash obj
self._hash = hashlib.new(hash_name)
def hash(self, obj, return_digest=True):
try:
self.dump(obj)
except pickle.PicklingError as e:
warnings.warn('PicklingError while hashing %r: %r' % (obj, e))
dumps = self.stream.getvalue()
self._hash.update(dumps)
if return_digest:
return self._hash.hexdigest()
def save(self, obj):
if isinstance(obj, (types.MethodType, type({}.pop))):
# the Pickler cannot pickle instance methods; here we decompose
# them into components that make them uniquely identifiable
if hasattr(obj, '__func__'):
func_name = obj.__func__.__name__
else:
func_name = obj.__name__
inst = obj.__self__
if type(inst) == type(pickle):
obj = _MyHash(func_name, inst.__name__)
elif inst is None:
# type(None) or type(module) do not pickle
obj = _MyHash(func_name, inst)
else:
cls = obj.__self__.__class__
obj = _MyHash(func_name, inst, cls)
Pickler.save(self, obj)
def memoize(self, obj):
# We want hashing to be sensitive to value instead of reference.
# For example we want ['aa', 'aa'] and ['aa', 'aaZ'[:2]]
# to hash to the same value and that's why we disable memoization
# for strings
if isinstance(obj, _bytes_or_unicode):
return
Pickler.memoize(self, obj)
# The dispatch table of the pickler is not accessible in Python
# 3, as these lines are only bugware for IPython, we skip them.
def save_global(self, obj, name=None, pack=struct.pack):
# We have to override this method in order to deal with objects
# defined interactively in IPython that are not injected in
# __main__
kwargs = dict(name=name, pack=pack)
if sys.version_info >= (3, 4):
del kwargs['pack']
try:
Pickler.save_global(self, obj, **kwargs)
except pickle.PicklingError:
Pickler.save_global(self, obj, **kwargs)
module = getattr(obj, "__module__", None)
if module == '__main__':
my_name = name
if my_name is None:
my_name = obj.__name__
mod = sys.modules[module]
if not hasattr(mod, my_name):
# IPython doesn't inject the variables define
# interactively in __main__
setattr(mod, my_name, obj)
dispatch = Pickler.dispatch.copy()
# builtin
dispatch[type(len)] = save_global
# type
dispatch[type(object)] = save_global
# classobj
dispatch[type(Pickler)] = save_global
# function
dispatch[type(pickle.dump)] = save_global
def _batch_setitems(self, items):
# forces order of keys in dict to ensure consistent hash
Pickler._batch_setitems(self, iter(sorted(items)))
def save_set(self, set_items):
# forces order of items in Set to ensure consistent hash
Pickler.save(self, _ConsistentSet(set_items))
dispatch[type(set())] = save_set
class NumpyHasher(Hasher):
""" Special case the hasher for when numpy is loaded.
"""
def __init__(self, hash_name='md5', coerce_mmap=False):
"""
Parameters
----------
hash_name: string
The hash algorithm to be used
coerce_mmap: boolean
Make no difference between np.memmap and np.ndarray
objects.
"""
self.coerce_mmap = coerce_mmap
Hasher.__init__(self, hash_name=hash_name)
# delayed import of numpy, to avoid tight coupling
import numpy as np
self.np = np
if hasattr(np, 'getbuffer'):
self._getbuffer = np.getbuffer
else:
self._getbuffer = memoryview
def save(self, obj):
""" Subclass the save method, to hash ndarray subclass, rather
than pickling them. Off course, this is a total abuse of
the Pickler class.
"""
if isinstance(obj, self.np.ndarray) and not obj.dtype.hasobject:
# Compute a hash of the object:
try:
# memoryview is not supported for some dtypes,
# e.g. datetime64, see
# https://github.com/numpy/numpy/issues/4983. The
# workaround is to view the array as bytes before
# taking the memoryview
obj_bytes_view = obj.view(self.np.uint8)
self._hash.update(self._getbuffer(obj_bytes_view))
# ValueError is raised by .view when the array is not contiguous
# BufferError is raised by Python 3 in the hash update if
# the array is Fortran rather than C contiguous
except (ValueError, BufferError):
# Cater for non-single-segment arrays: this creates a
# copy, and thus aleviates this issue.
# XXX: There might be a more efficient way of doing this
obj_bytes_view = obj.flatten().view(self.np.uint8)
self._hash.update(self._getbuffer(obj_bytes_view))
# We store the class, to be able to distinguish between
# Objects with the same binary content, but different
# classes.
if self.coerce_mmap and isinstance(obj, self.np.memmap):
# We don't make the difference between memmap and
# normal ndarrays, to be able to reload previously
# computed results with memmap.
klass = self.np.ndarray
else:
klass = obj.__class__
# We also return the dtype and the shape, to distinguish
# different views on the same data with different dtypes.
# The object will be pickled by the pickler hashed at the end.
obj = (klass, ('HASHED', obj.dtype, obj.shape, obj.strides))
elif isinstance(obj, self.np.dtype):
# Atomic dtype objects are interned by their default constructor:
# np.dtype('f8') is np.dtype('f8')
# This interning is not maintained by a
# pickle.loads + pickle.dumps cycle, because __reduce__
# uses copy=True in the dtype constructor. This
# non-deterministic behavior causes the internal memoizer
# of the hasher to generate different hash values
# depending on the history of the dtype object.
# To prevent the hash from being sensitive to this, we use
# .descr which is a full (and never interned) description of
# the array dtype according to the numpy doc.
klass = obj.__class__
obj = (klass, ('HASHED', obj.descr))
Hasher.save(self, obj)
def hash(obj, hash_name='md5', coerce_mmap=False):
""" Quick calculation of a hash to identify uniquely Python objects
containing numpy arrays.
Parameters
-----------
hash_name: 'md5' or 'sha1'
Hashing algorithm used. sha1 is supposedly safer, but md5 is
faster.
coerce_mmap: boolean
Make no difference between np.memmap and np.ndarray
"""
if 'numpy' in sys.modules:
hasher = NumpyHasher(hash_name=hash_name, coerce_mmap=coerce_mmap)
else:
hasher = Hasher(hash_name=hash_name)
return hasher.hash(obj)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment