Created
October 8, 2014 20:49
-
-
Save MattFaus/b5e1658f7d7ccc0d9e12 to your computer and use it in GitHub Desktop.
An improvement over the CompressedFeatures class introduced at http://derandomized.com/post/51709771229/compressed-features-for-machine-learning#disqus_thread by not requiring the key->component mapping to be stored.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class DeterministicCompressedFeatures(CompressedFeatures):
    """CompressedFeatures whose random components are derived from their keys.

    Each component vector is produced by a random generator seeded with a
    hash of the component key. Because the seed is a pure function of the
    key, the same key always yields the same vector, so the key->component
    mapping never has to be stored or shipped around; it can simply be
    recomputed whenever it is needed.
    """

    def __init__(self, num_features=RANDOM_FEATURE_LENGTH):
        """Initializes the base class and an empty component cache.

        Arguments:
            num_features - Length of each generated component vector;
                forwarded unchanged to the base class constructor.
        """
        # The class named in super() must be this class; referencing any
        # other (undefined) name raises NameError on instantiation.
        super(DeterministicCompressedFeatures, self).__init__(num_features)

        # We can't use a defaultdict, because we must pass a parameter to
        # _generate_component()
        self.random_components = {}

    def _generate_component(self, component_key):
        """Returns the unit-length random column vector for component_key.

        Arguments:
            component_key - Any object whose repr() is stable between runs;
                the repr is what gets hashed to derive the seed.

        Returns:
            A numpy array of shape (self.num_features, 1), scaled to unit
            Euclidean length. Equal keys always produce equal vectors.
        """
        # We must use hashlib, because hash() is unreliable
        # http://stackoverflow.com/questions/793761/built-in-python-hash-function
        key_repr = repr(component_key)
        if not isinstance(key_repr, bytes):
            # Python 3: repr() returns text, but md5() only accepts bytes.
            key_repr = key_repr.encode('utf-8')
        big_hash = int(hashlib.md5(key_repr).hexdigest(), 16)

        # Shrink the 39-digit hash to something the seed parameter accepts
        lil_hash = int(big_hash % ((1 << 31) - 1))

        # Use a private generator rather than np.random.seed(): it draws the
        # same values for a given seed, but leaves the process-wide numpy
        # random state untouched for everyone else.
        rng = np.random.RandomState(lil_hash)

        # Deterministically compute the feature vector, based on the key
        rv = rng.randn(self.num_features, 1)
        rv /= np.sqrt(np.dot(rv.T, rv))  # normalize to unit length
        return rv

    def increment_component(self, component_key, scale=1.0):
        """Increments feature_vector by the specified component.

        Arguments:
            component_key - The component to increment by. If this key has
                never been seen before, the component values are generated.
            scale - The multiplicative factor to apply against the
                component values.

        Returns:
            True if feature_vector was incremented. False if the key is not
            yet known and self.dynamic_mode is falsy, in which case nothing
            is modified.
        """
        # NOTE(review): contains_component, dynamic_mode and feature_vector
        # are expected to come from CompressedFeatures - confirm there.
        if not self.contains_component(component_key):
            if not self.dynamic_mode:
                return False
            self.random_components[component_key] = (
                self._generate_component(component_key))

        self.feature_vector += scale * self.random_components[component_key]
        return True
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment