Created
April 21, 2017 08:10
-
-
Save thundergolfer/745d276e57e0f555c6f0d4becf124f7c to your computer and use it in GitHub Desktop.
Cosine Distance class. Use to avoid 'plain old data' code-smell in ML. *Currently in progress*
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numbers

import tensorflow as tf
""" NOTES: | |
Currently this class has a "calculate" static method that is used to calculate cosine distances given two | |
Tensorflow tensors. Calculating distances for *multiple* vector pairs is conceptually incompatible with this | |
class. | |
This class is an object that wraps the creation/calculation of *one* Cosine Distance measure. It was intended to | |
be used in a situation like this. | |
self.model_threshold = 1.0 # <<-- plain-old-data code smell | |
You would do this: | |
self.model_threshold = CosineDistance('MAX') | |
Currently including the Tensorflow tensor "calculate()" method breaks the `repr()` functionality, because it expects | |
a float to be the value of `self.measure`. | |
It's possible that having a useful "calculate()" method isn't possible given how this class is supposed to | |
represent one Cosine Distance measure. | |
Just for kicks you could a 1-to-1 "calculate()" method that takes in two vectors. This would allow for the | |
x = CosineDistance(vector_one, vector_two) | |
fanciness | |
""" | |
try:
    _STRING_TYPES = (basestring,)  # Python 2: covers both str and unicode
except NameError:
    _STRING_TYPES = (str,)  # Python 3: basestring no longer exists


class CosineDistance(object):
    """A single cosine distance measure that compares like a number.

    Construct it in one of three ways:

    * ``CosineDistance('MAX')`` / ``CosineDistance('MIN')`` -- the named
      bounds 1.0 and 0.0.
    * ``CosineDistance(0.3)`` -- an explicit value in ``[0, 1]``.
    * ``CosineDistance(tensor_1, tensor_2)`` -- the result of
      :meth:`calculate` on two tensors.

    The resulting value is stored in ``self.measure``.
    """

    # Named measures accepted as the single string argument.
    _NAMED_MEASURES = {'MAX': 1.0, 'MIN': 0.0}

    _COMPARE_ERROR = ("CosineDistance can only be compared to numbers or "
                      "other CosineDistance objects")

    def __init__(self, tensor_1, tensor_2=None):
        """Build the measure from a name, a number, or a pair of tensors.

        Parameters:
            tensor_1: ``'MAX'``, ``'MIN'``, a value convertible to float in
                ``[0, 1]``, or (when ``tensor_2`` is given) the first tensor.
            tensor_2: optional second tensor; when given, the measure is
                ``calculate(tensor_1, tensor_2)``.

        Raises:
            ValueError: for an unknown name, or a value outside ``[0, 1]``
                (NaN included).
        """
        if tensor_2 is not None:
            self.measure = self.calculate(tensor_1, tensor_2)
            return

        if isinstance(tensor_1, _STRING_TYPES):
            # Reject unknown names up front instead of leaving
            # ``self.measure`` unset and failing later with AttributeError.
            if tensor_1 not in self._NAMED_MEASURES:
                raise ValueError(
                    "Unknown cosine distance name %r; expected 'MAX' or 'MIN'"
                    % (tensor_1,))
            self.measure = self._NAMED_MEASURES[tensor_1]
            return

        distance = float(tensor_1)
        # Written as a positive range test so that NaN (for which every
        # comparison is False) is rejected as well.
        # NOTE(review): calculate() returns 1 - cosine similarity, which can
        # exceed 1 for vectors with negative components; the [0, 1] bound is
        # kept here for backward compatibility -- confirm it is intended.
        if not 0.0 <= distance <= 1.0:
            raise ValueError("A cosine distance measure must be between 0 and 1 (inclusive)")
        self.measure = distance

    @staticmethod
    def calculate(tensor_1, tensor_2):
        """
        Calculates the cosine distances between each row of
        two 2-D tensors.

        Parameters:
            tensor_1: The first (correct) tensor
            tensor_2: The second tensor

        Returns:
            ``1 - normed_1 . normed_2^T``: entry ``[i, j]`` is the cosine
            distance between row ``i`` of tensor_1 and row ``j`` of tensor_2.
        """
        # The axis is passed positionally: its keyword was originally ``dim``
        # and later renamed ``axis``, so the positional form avoids depending
        # on either spelling.
        normed_tensor_1 = tf.nn.l2_normalize(tensor_1, 1)
        normed_tensor_2 = tf.nn.l2_normalize(tensor_2, 1)
        return 1 - tf.matmul(normed_tensor_1, normed_tensor_2, transpose_b=True)

    def _comparable(self, other):
        """Return the value to compare ``self.measure`` against.

        Raises:
            TypeError: if ``other`` is neither a CosineDistance nor a real
                number (ints and floats both qualify).
        """
        if isinstance(other, CosineDistance):
            return other.measure
        if isinstance(other, numbers.Real):
            return other
        raise TypeError(self._COMPARE_ERROR)

    def __eq__(self, other):
        return self.measure == self._comparable(other)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __lt__(self, other):
        return self.measure < self._comparable(other)

    def __gt__(self, other):
        return self.measure > self._comparable(other)

    def __le__(self, other):
        return self.measure <= self._comparable(other)

    def __ge__(self, other):
        return self.measure >= self._comparable(other)

    def __repr__(self):
        if isinstance(self.measure, numbers.Real):
            return "CosineDistance " + str(round(self.measure, 2))
        # A measure produced by calculate() is not a plain number and cannot
        # be rounded; fall back to its own string form instead of raising.
        return "CosineDistance " + str(self.measure)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Cosine similarity is actually within the range
[-1, 1]
which puts cosine distance (1 - similarity) in [0, 2], so this code's [0, 1] check has a bug