"""
Implementation of 'Maximum Likelihood Estimation of Intrinsic Dimension'
by Elizaveta Levina and Peter J. Bickel.

How to use
----------
The goal is to estimate the intrinsic dimensionality of data. The estimate is
scale dependent (depending on how much you zoom into the data distribution you
can find different dimensionalities), so the authors propose to average it over
a range of scales; the interval of scales [k1, k2] is the only parameter of the
algorithm. This code also provides a way to repeat the estimation with
bootstrapping in order to estimate the uncertainty.

Example with a swiss roll:

    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.datasets import make_swiss_roll

    X, _ = make_swiss_roll(1000)
    k1 = 10  # start of the scale interval (included)
    k2 = 20  # end of the scale interval (included)
    intdim_k_repeated = repeated(intrinsic_dim_scale_interval,
                                 X,
                                 mode='bootstrap',
                                 nb_iter=500,  # number of bootstrap iterations
                                 verbose=1,
                                 k1=k1, k2=k2)
    intdim_k_repeated = np.array(intdim_k_repeated)
    # The shape of intdim_k_repeated is (nb_iter, size_of_interval), where
    # nb_iter is the number of bootstrap iterations (here 500) and
    # size_of_interval is (k2 - k1 + 1).

    # Plot the histogram of the intrinsic dimensionality estimates repeated
    # over nb_iter experiments.
    plt.hist(intdim_k_repeated.mean(axis=1))
"""
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

def intrinsic_dim_sample_wise(X, k=5):
    """Levina-Bickel MLE of the intrinsic dimension, one estimate per sample."""
    # Distances to the k nearest neighbors of each point; the first column is
    # the zero distance of each point to itself, hence k + 1 neighbors.
    neighb = NearestNeighbors(n_neighbors=k + 1).fit(X)
    dist, ind = neighb.kneighbors(X)
    dist = dist[:, 1:]  # drop the distance of each point to itself
    dist = dist[:, 0:k]
    assert dist.shape == (X.shape[0], k)
    assert np.all(dist > 0)
    # MLE: invert the normalized sum of log(T_k / T_j) over j = 1..k-1, where
    # T_j is the distance to the j-th nearest neighbor. Normalizing the k - 1
    # terms by (k - 2) follows the MacKay & Ghahramani bias correction of the
    # Levina-Bickel estimator.
    d = np.log(dist[:, k - 1:k] / dist[:, 0:k - 1])
    d = d.sum(axis=1) / (k - 2)
    d = 1. / d
    intdim_sample = d
    return intdim_sample
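
# A quick sanity check (a sketch, assuming data drawn from a 10-dimensional
# isotropic Gaussian; exact numbers vary from run to run):
#
#     X = np.random.randn(1000, 10)
#     intrinsic_dim_sample_wise(X, k=20).mean()   # expected to be close to 10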

def intrinsic_dim_scale_interval(X, k1=10, k2=20):
    X = pd.DataFrame(X).drop_duplicates().values  # remove duplicates in case you use bootstrapping
    intdim_k = []
    for k in range(k1, k2 + 1):
        m = intrinsic_dim_sample_wise(X, k).mean()
        intdim_k.append(m)
    return intdim_k

def repeated(func, X, nb_iter=100, random_state=None, verbose=0, mode='bootstrap', **func_kw):
    if random_state is None:
        rng = np.random
    else:
        rng = np.random.RandomState(random_state)
    nb_examples = X.shape[0]
    results = []
    iters = range(nb_iter)
    if verbose > 0:
        iters = tqdm(iters)
    for i in iters:
        if mode == 'bootstrap':
            # resample the rows with replacement
            Xr = X[rng.randint(0, nb_examples, size=nb_examples)]
        elif mode == 'shuffle':
            # random permutation of the rows
            ind = np.arange(nb_examples)
            rng.shuffle(ind)
            Xr = X[ind]
        elif mode == 'same':
            Xr = X
        else:
            raise ValueError('unknown mode : {}'.format(mode))
        results.append(func(Xr, **func_kw))
    return results
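
# Minimal end-to-end sketch of the docstring example (assumes matplotlib is
# installed; the dataset, k1, k2 and nb_iter values mirror the docstring above):
if __name__ == '__main__':
    import matplotlib.pyplot as plt
    from sklearn.datasets import make_swiss_roll

    X, _ = make_swiss_roll(1000)
    k1, k2 = 10, 20  # scale interval [k1, k2], both ends included
    intdim_k_repeated = repeated(intrinsic_dim_scale_interval, X,
                                 mode='bootstrap', nb_iter=500, verbose=1,
                                 k1=k1, k2=k2)
    intdim_k_repeated = np.array(intdim_k_repeated)  # shape (nb_iter, k2 - k1 + 1)
    # Histogram of the scale-averaged estimate across bootstrap iterations.
    plt.hist(intdim_k_repeated.mean(axis=1))
    plt.xlabel('intrinsic dimension (MLE, averaged over k in [k1, k2])')
    plt.show()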