In the name of God
This gist contains implementation of Embedding Similarity Measurement in C++
and Python
.
#include <cstddef>
#include <cmath>
#include <stdexcept>
float CalculateManhattanDistance(const std::vector<float>& embedding1, const std::vector<float>& embedding2)
{
if(embedding1.size() != embedding2.size())
{
throw std::invalid_argument("Embedding sizes should be equal.");
}
float sum = 0;
std::size_t embeddingSize = embedding1.size();
for(std::size_t i = 0; i < embeddingSize; i++)
{
float distance = std::abs(embedding2[i] - embedding1[i]);
sum += distance;
}
float manhattanDistance = sum;
return manhattanDistance;
}
#include <cstddef>
#include <cmath>
#include <stdexcept>
float CalculateEuclideanDistance(const std::vector<float>& embedding1, const std::vector<float>& embedding2)
{
if(embedding1.size() != embedding2.size())
{
throw std::invalid_argument("Embedding sizes should be equal.");
}
float sum = 0;
std::size_t embeddingSize = embedding1.size();
for(std::size_t i = 0; i < embeddingSize; i++)
{
float distance = embedding2[i] - embedding1[i];
sum += distance * distance;
}
float euclideanDistance = std::sqrt(sum);
return euclideanDistance;
}
#include <cstddef>
#include <cmath>
#include <stdexcept>
float CalculateMinkowskiDistance(const std::vector<float>& embedding1, const std::vector<float>& embedding2, int p)
{
if(embedding1.size() != embedding2.size())
{
throw std::invalid_argument("Embedding sizes should be equal.");
}
float sum = 0;
std::size_t embeddingSize = embedding1.size();
for(std::size_t i = 0; i < embeddingSize; i++)
{
float distance = std::abs(embedding2[i] - embedding1[i]);
sum += std::pow(distance, p);
}
float minkowskiDistance = std::pow(sum, 1.0f / p);
return minkowskiDistance;
}
float CalculateL1Norm(const std::vector<float>& embedding1, const std::vector<float>& embedding2)
{
return CalculateManhattanDistance(embedding1, embedding2);
}
float CalculateL2Norm(const std::vector<float>& embedding1, const std::vector<float>& embedding2)
{
return CalculateEuclideanDistance(embedding1, embedding2);
}
float CalculateLPNorm(const std::vector<float>& embedding1, const std::vector<float>& embedding2, int p)
{
return CalculateMinkowskiDistance(embedding1, embedding2, p);
}
#include <cstddef>
#include <cmath>
#include <stdexcept>
float CalculateCosineSimilarity(const std::vector<float>& embedding1, const std::vector<float>& embedding2)
{
if(embedding1.size() != embedding2.size())
{
throw std::invalid_argument("Embedding sizes should be equal.");
}
float aa = 0;
float bb = 0;
float ab = 0;
std::size_t embeddingSize = embedding1.size();
for(std::size_t i = 0; i < embeddingSize; i++)
{
aa += std::pow(embedding1[i], 2);
bb += std::pow(embedding2[i], 2);
ab += embedding1[i] * embedding2[i];
}
float cosineSimilarity = ab / std::sqrt(aa * bb);
return cosineSimilarity;
}
# define PI 3.14159265358979323846
float CalculateAngularDistance(const std::vector<float>& embedding1, const std::vector<float>& embedding2)
{
float cosineSimilarity = CalculateCosineSimilarity(embedding1, embedding2);
float angularDistance = std::acos(cosineSimilarity) / PI;
return angularDistance;
}
float CalculateAngularSimilarity(const std::vector<float>& embedding1, const std::vector<float>& embedding2)
{
float angularDistance = CalculateAngularDistance(embedding1, embedding2);
float angularSimilarity = 1 - angularDistance;
return angularSimilarity;
}
import math
from scipy import spatial
def calculate_cosine_distance(a, b):
cosine_distance = float(spatial.distance.cosine(a, b))
return cosine_distance
def calculate_cosine_similarity(a, b):
cosine_similarity = 1 - calculate_cosine_distance(a, b)
return cosine_similarity
def calculate_angular_distance(a, b):
cosine_similarity = calculate_cosine_similarity(a, b)
angular_distance = math.acos(cosine_similarity) / math.pi
return angular_distance
def calculate_angular_similarity(a, b):
angular_similarity = 1 - calculate_angular_distance(a, b)
return angular_similarity
import math
import numpy as np
def calculate_cosine_similarity(a, b):
dot_product = np.dot(a, b)
magnitude_a = np.linalg.norm(a)
magnitude_b = np.linalg.norm(b)
cosine_similarity = dot_product / (magnitude_a * magnitude_b)
return cosine_similarity
def calculate_cosine_distance(a, b):
cosine_similarity = calculate_cosine_similarity(a, b)
cosine_distance = 1 - cosine_similarity
return cosine_distance
def calculate_angular_similarity(a, b):
cosine_similarity = calculate_cosine_similarity(a, b)
angular_similarity = 1 - (math.acos(cosine_similarity) / math.pi)
return angular_similarity
def calculate_angular_distance(a, b):
angular_similarity = calculate_angular_similarity(a, b)
angular_distance = 1 - angular_similarity
return angular_distance
import time
import numpy as np # np.__version__ == '1.23.5'
import tensorflow as tf # tf.__version__ == '2.11.0'
EMBEDDINGS_LENGTH = 512
NUMBER_OF_EMBEDDINGS = 1000 * 1000
def calculate_cosine_similarities(x, embeddings):
cosine_similarities = -1 * tf.keras.losses.cosine_similarity(x, embeddings)
return cosine_similarities.numpy()
def find_closest_embeddings(x, embeddings, top_k=1):
cosine_similarities = calculate_cosine_similarities(x, embeddings)
values, indices = tf.math.top_k(cosine_similarities, k=top_k)
return values.numpy(), indices.numpy()
def main():
# x shape: (512)
# Embeddings shape: (1000000, 512)
x = np.random.rand(EMBEDDINGS_LENGTH).astype(np.float32)
embeddings = np.random.rand(NUMBER_OF_EMBEDDINGS, EMBEDDINGS_LENGTH).astype(np.float32)
print('Embeddings shape: ', embeddings.shape)
n = 100
sum_duration = 0
for i in range(n):
start = time.time()
best_values, best_indices = find_closest_embeddings(x, embeddings, top_k=1)
end = time.time()
duration = end - start
sum_duration += duration
print('Duration (seconds): {}, Best value: {}, Best index: {}'.format(duration, best_values[0], best_indices[0]))
# Average duration (seconds): 1.707 for Intel(R) Core(TM) i7-10700 CPU @ 2.90GHz
# Average duration (seconds): 0.961 for NVIDIA 1080 ti
print('Average duration (seconds): ', sum_duration / n)
if __name__ == '__main__':
main()