Last active
June 25, 2023 04:05
-
-
Save vhxs/6c01f4093e5492cc606d840dfa052038 to your computer and use it in GitHub Desktop.
Jaccard metric spaces cannot be isometrically embedded in Euclidean space
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from random import sample | |
# The Jaccard distance is truly a metric: https://arxiv.org/pdf/1612.02696.pdf | |
def jaccard_distance(set1: set, set2: set): | |
union_size = len(set1.union(set2)) | |
intersection_size = len(set1.intersection(set2)) | |
return 1 - (intersection_size / union_size) | |
# Give me some random sets | |
def random_sets_generator(universe_size=10, set_size=5, num_sets=10): | |
for _ in range(num_sets): | |
yield set(sample(range(universe_size), set_size)) | |
def random_vectors_generator(dimension=10, num_vectors=10): | |
for _ in range(num_vectors): | |
yield np.random.rand(dimension) | |
# Yield the distance matrix of randomly generated sets, with the Jaccard metric, as a numpy array | |
def random_jaccard_metric_space_generator(universe_size=10, set_size=5, num_sets=10): | |
while True: | |
# generate random sets | |
random_sets = [random_set for random_set in random_sets_generator(universe_size, set_size, num_sets)] | |
# compute distance matrix | |
matrix_as_list = [] | |
for i in range(num_sets): | |
matrix_as_list.append([]) | |
for j in range(num_sets): | |
distance = jaccard_distance(random_sets[i], random_sets[j]) | |
matrix_as_list[i].append(distance) | |
yield np.array(matrix_as_list), random_sets | |
def random_euclidean_metric_space_generator(dimension=10, num_vectors=10): | |
while True: | |
# generate random sets | |
random_vectors = [random_set for random_set in random_vectors_generator(dimension, num_vectors)] | |
# compute distance matrix | |
matrix_as_list = [] | |
for i in range(num_vectors): | |
matrix_as_list.append([]) | |
for j in range(num_vectors): | |
distance = np.linalg.norm(random_vectors[i] - random_vectors[j])**2 | |
matrix_as_list[i].append(distance) | |
yield np.array(matrix_as_list), random_vectors | |
# A finite metric space can be embedded in Euclidean space if the following matrix is positive semidefinite | |
# https://en.wikipedia.org/wiki/Euclidean_distance_matrix#Characterizations | |
def is_embeddable(distance_matrix): | |
num_points = distance_matrix.shape[0] | |
matrix_as_list = [] | |
for i in range(1, num_points): | |
matrix_as_list.append([]) | |
for j in range(1, num_points): | |
elt = (distance_matrix[0][i]**2 + distance_matrix[0][j]**2 - distance_matrix[i][j]**2) / 2 | |
matrix_as_list[i-1].append(elt) | |
is_this_positive_semidefinite = np.array(matrix_as_list) | |
return np.all(np.linalg.eigvals(is_this_positive_semidefinite) >= 0) | |
if __name__ == "__main__": | |
for random_distance_matrix, random_sets in random_jaccard_metric_space_generator(): | |
if not is_embeddable(random_distance_matrix): | |
print("Counterexample found!") | |
print(random_sets) | |
exit() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Gist updated. Thanks for the correction!