Last active
June 25, 2023 04:05
-
-
Save vhxs/6c01f4093e5492cc606d840dfa052038 to your computer and use it in GitHub Desktop.
Jaccard metric spaces cannot be isometrically embedded in Euclidean space
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from random import sample | |
# The Jaccard distance is truly a metric: https://arxiv.org/pdf/1612.02696.pdf | |
def jaccard_distance(set1: set, set2: set): | |
union_size = len(set1.union(set2)) | |
intersection_size = len(set1.intersection(set2)) | |
return 1 - (intersection_size / union_size) | |
# Give me some random sets | |
def random_sets_generator(universe_size=10, set_size=5, num_sets=10): | |
for _ in range(num_sets): | |
yield set(sample(range(universe_size), set_size)) | |
def random_vectors_generator(dimension=10, num_vectors=10): | |
for _ in range(num_vectors): | |
yield np.random.rand(dimension) | |
# Yield the distance matrix of randomly generated sets, with the Jaccard metric, as a numpy array | |
def random_jaccard_metric_space_generator(universe_size=10, set_size=5, num_sets=10): | |
while True: | |
# generate random sets | |
random_sets = [random_set for random_set in random_sets_generator(universe_size, set_size, num_sets)] | |
# compute distance matrix | |
matrix_as_list = [] | |
for i in range(num_sets): | |
matrix_as_list.append([]) | |
for j in range(num_sets): | |
distance = jaccard_distance(random_sets[i], random_sets[j]) | |
matrix_as_list[i].append(distance) | |
yield np.array(matrix_as_list), random_sets | |
def random_euclidean_metric_space_generator(dimension=10, num_vectors=10): | |
while True: | |
# generate random sets | |
random_vectors = [random_set for random_set in random_vectors_generator(dimension, num_vectors)] | |
# compute distance matrix | |
matrix_as_list = [] | |
for i in range(num_vectors): | |
matrix_as_list.append([]) | |
for j in range(num_vectors): | |
distance = np.linalg.norm(random_vectors[i] - random_vectors[j])**2 | |
matrix_as_list[i].append(distance) | |
yield np.array(matrix_as_list), random_vectors | |
# A finite metric space can be embedded in Euclidean space if the following matrix is positive semidefinite | |
# https://en.wikipedia.org/wiki/Euclidean_distance_matrix#Characterizations | |
def is_embeddable(distance_matrix): | |
num_points = distance_matrix.shape[0] | |
matrix_as_list = [] | |
for i in range(1, num_points): | |
matrix_as_list.append([]) | |
for j in range(1, num_points): | |
elt = (distance_matrix[0][i]**2 + distance_matrix[0][j]**2 - distance_matrix[i][j]**2) / 2 | |
matrix_as_list[i-1].append(elt) | |
is_this_positive_semidefinite = np.array(matrix_as_list) | |
return np.all(np.linalg.eigvals(is_this_positive_semidefinite) >= 0) | |
if __name__ == "__main__": | |
for random_distance_matrix, random_sets in random_jaccard_metric_space_generator(): | |
if not is_embeddable(random_distance_matrix): | |
print("Counterexample found!") | |
print(random_sets) | |
exit() |
At the top of the Wikipedia article you see that "Euclidean distance matrix" actually means the matrix of squared distances. Granted, it's a strange piece of terminology.
Gist updated. Thanks for the correction!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@bjoernkjoshanssen unclear to me. I just translated the criterion on this Wikipedia page into Python code, which I think I did correctly; it doesn't square the elements of the distance matrix: https://en.wikipedia.org/wiki/Euclidean_distance_matrix#Characterizations
I didn't dig into the derivation of the Schoenberg criterion to understand why it's true (couldn't find a non-pay-walled proof either). Happy to be corrected on any of the above.