manifold_eval.py
""" | |
This is an implementation of the distance proposed by "An Information Geometry of Statistical Manifold Learning, Ke Sun, Stéphane Marchand-Maillet | |
" to measure quality of embedding techniques (e.g PCA, Isomap, LLE, TSNE). It is needs data points matrix of | |
the original data as well as the same data points with the embedding coordinates, and some additional parameters to fix. | |
How to use ? | |
============ | |
Basic swiss roll example: | |
from sklearn.datasets import make_swiss_roll | |
from sklearn.decomposition import PCA | |
from sklearn.manifold import Isomap | |
NN = 100 # nb of neighbors | |
k = 10 # logk defines the entropy of tau_Y and tau_Z distributions | |
ks = 5 | |
Y, labels = make_swiss_roll(1000) | |
Z = PCA(n_components=2).fit_transform(Y) | |
dist_to_PCA, _ = compute_dist(Y, Z, NN=NN, k=k, ks=ks, | |
compute_density=False, | |
nb_integrations=1, # repeat if you want to estimate variance of the integral estimation | |
integration_method='monte_carlo') | |
T = Isomap(n_components=2).fit_transform(Y) | |
dist_to_isomap, _ = compute_dist(Y, T, NN=NN, k=k, ks=ks, | |
compute_density=False, | |
nb_integrations=1, # repeat if you want to estimate variance of the integral estimation | |
integration_method='monte_carlo') | |
print("dist to PCA : {} +/- {}".format(np.mean(dist_to_PCA), np.std(dist_to_PCA))) | |
print("dist to Isomap : {} +/- {}".format(np.mean(dist_to_isomap), np.std(dist_to_isomap))) | |
""" | |
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
from scipy.optimize import newton
from scipy.integrate import dblquad
from scipy.special import logsumexp  # moved here from scipy.misc, which dropped it in recent SciPy
from itertools import product
from tqdm import tqdm
def softmax(w):
    # row-wise softmax, shifted by the row max for numerical stability
    maxes = np.amax(w, axis=1, keepdims=True)
    e = np.exp(w - maxes)
    dist = e / np.sum(e, axis=1, keepdims=True)
    return dist
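
# Illustrative sanity check (added for illustration, not part of the original
# gist): each row should sum to 1, and -inf entries (used below to mask
# padding slots in dist_S) should get probability exactly 0.
def _check_softmax():
    w = np.array([[0., 1., -np.inf],
                  [2., 2., 2.]])
    p = softmax(w)
    assert np.allclose(p.sum(axis=1), 1.0)
    assert p[0, 2] == 0.0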
def entropy_p_univariate(s):
    r"""
    Compute (carefully, to avoid numerical error)
    the entropy of p defined by: p_i = exp(-s_i)/(\sum_j exp(-s_j))
    """
    log_norm = logsumexp(-s)
    log_p = - s - log_norm
    ent = (np.exp(log_p) * s).sum() + log_norm
    return ent
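
# Illustrative sanity check (not part of the original gist): on well-scaled
# inputs the stabilized entropy should match the direct formula
# H(p) = -sum_i p_i log(p_i) with p_i = exp(-s_i) / sum_j exp(-s_j).
def _check_entropy_p_univariate():
    s = np.array([0.5, 1.0, 2.0])
    p = np.exp(-s) / np.exp(-s).sum()
    assert np.isclose(entropy_p_univariate(s), -(p * np.log(p)).sum())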
def construct_Gcov(S_Y, S_Z, S, NN=5):
    """
    Used to construct 'g_ab' in the paper
    """
    # neighbor indices start at 1 because a point should not count itself as a neighbor
    ind_Y = S_Y.argsort(axis=1)[:, 1:NN + 1]
    ind_Z = S_Z.argsort(axis=1)[:, 1:NN + 1]
    # for each point, gather distances over the union of its Y- and Z-neighborhoods;
    # unused slots stay at 0 in dist_Y/dist_Z and at +inf in dist_S, so that
    # softmax(-dist_S) assigns them zero probability
    dist_Y = np.zeros((S_Y.shape[0], NN * 2))
    for i in range(S_Y.shape[0]):
        ind = list(set(ind_Y[i]) | set(ind_Z[i]))
        dist_Y[i, 0:len(ind)] = S_Y[i][ind]
    dist_Z = np.zeros((S_Y.shape[0], NN * 2))
    for i in range(S_Z.shape[0]):
        ind = list(set(ind_Y[i]) | set(ind_Z[i]))
        dist_Z[i, 0:len(ind)] = S_Z[i][ind]
    dist_S = np.empty((S_Y.shape[0], NN * 2))
    dist_S[:, :] = np.inf
    for i in range(S.shape[0]):
        ind = list(set(ind_Y[i]) | set(ind_Z[i]))
        dist_S[i, 0:len(ind)] = S[i][ind]
    P_S = softmax(-dist_S)
    # row-wise covariance of the Y- and Z-distances under P_S:
    # E[d_Y d_Z] - E[d_Y] E[d_Z]
    Gcov = ((P_S * (dist_Y * dist_Z)).sum(axis=1) -
            (P_S * dist_Y).sum(axis=1) * (P_S * dist_Z).sum(axis=1))
    return Gcov
def construct_Gvar(S_Y, S, NN=5):
    """
    Used to construct 'g_aa' and 'g_bb' in the paper
    """
    return construct_Gcov(S_Y, S_Y, S, NN=NN)
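
# Illustrative sanity check (not part of the original gist): g_aa is a
# variance under the distribution P_S, so every entry should be nonnegative
# up to numerical noise.
def _check_gvar_nonnegative():
    rng = np.random.RandomState(0)
    X = rng.randn(20, 3)
    S = pairwise_distances(X)
    assert (construct_Gvar(S, S, NN=5) >= -1e-12).all()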
def construct_tau(S, k, NN=5, tau0=1, verbose=1):
    r"""
    Find tau_i for each row i: tau_i scales the distances in row i so that
    the resulting distribution has entropy log(k). The paper does this with
    'binary search', better known as the 'bisection method', whose goal is
    to find the root of a function. What we want here, for each row i, is
    the tau_i such that:
        -\sum_j p_i(j) \log p_i(j) = \log k
    where p_i(j) is the probability of j given i, defined by equation 1 in
    the paper but with the distances scaled by tau_i.
    Newton's method is used here instead of bisection; it works too.
    """
    tau = []
    for i in range(S.shape[0]):
        s = S[i]
        s = s[np.argsort(s)]
        s = s[1:NN + 1]  # skip the first entry: a point is not its own neighbor
        def f(tau_sqrt):
            # parametrize by sqrt(tau) so the scale stays nonnegative
            return entropy_p_univariate(s * tau_sqrt**2) - np.log(k)
        tau_sqrt_i = newton(f, tau0)
        tau_i = tau_sqrt_i ** 2
        if verbose > 0:
            print("row {}, entropy:{}, logk={}, tau:{}".format(i, entropy_p_univariate(s * tau_i), np.log(k), tau_i))
        tau.append(tau_i)
    tau = np.array(tau)
    return tau
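
# A minimal sketch (not part of the original gist) of the bisection-style
# alternative mentioned in the docstring, using scipy.optimize.brentq. The
# bracket [lo, hi] is an assumption: it must straddle the root, which holds
# when 1 < k < NN (entropy decreases from about log(NN) towards 0 as the
# scale grows).
def construct_tau_bisect(S, k, NN=5, lo=1e-6, hi=1e6):
    from scipy.optimize import brentq
    tau = []
    for i in range(S.shape[0]):
        s = np.sort(S[i])[1:NN + 1]
        def f(t, s=s):
            return entropy_p_univariate(s * t) - np.log(k)
        tau.append(brentq(f, lo, hi))
    return np.array(tau)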
def vol(tau_Y, tau_Z, gvar_Y, gvar_Z, gcov_YZ):
    """
    Compute the vol term defined in equation 9 of the paper.
    """
    cov_term = (tau_Y * tau_Z * gcov_YZ).sum()
    var_Y_term = (tau_Y**2 * gvar_Y).sum()
    var_Z_term = (tau_Z**2 * gvar_Z).sum()
    return np.sqrt(1 - (cov_term**2) / (var_Y_term * var_Z_term))
def sig(tau_Y, tau_Z, gvar_Y, gvar_Z):
    r"""
    Compute the \sigma term, defined after proposition 13 in the paper.
    """
    var_Y_term = (tau_Y**2 * gvar_Y).sum()
    var_Z_term = (tau_Z**2 * gvar_Z).sum()
    return np.sqrt(var_Y_term) * np.sqrt(var_Z_term)
def compute_density(tau_Y, tau_Z, gvar_Y, gvar_Z, gcov_YZ):
    """
    Compute the density term directly (instead of using vol * sig).
    """
    cov_term = (tau_Y * tau_Z * gcov_YZ).sum()
    var_Y_term = (tau_Y**2 * gvar_Y).sum()
    var_Z_term = (tau_Z**2 * gvar_Z).sum()
    return np.sqrt(var_Y_term * var_Z_term - cov_term**2)
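
# Illustrative sanity check (not part of the original gist): by construction,
# compute_density(...) should equal vol(...) * sig(...) up to rounding, since
# sqrt(V*W - C^2) = sqrt(1 - C^2/(V*W)) * sqrt(V) * sqrt(W).
def _check_density_is_vol_times_sig():
    rng = np.random.RandomState(0)
    tau_Y, tau_Z = rng.rand(10) + 0.5, rng.rand(10) + 0.5
    gvar_Y, gvar_Z = rng.rand(10) + 0.5, rng.rand(10) + 0.5
    gcov_YZ = rng.rand(10) * 0.1  # small covariance keeps vol real
    direct = compute_density(tau_Y, tau_Z, gvar_Y, gvar_Z, gcov_YZ)
    factored = vol(tau_Y, tau_Z, gvar_Y, gvar_Z, gcov_YZ) * sig(tau_Y, tau_Z, gvar_Y, gvar_Z)
    assert np.isclose(direct, factored)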
def build_density_func(S_Y, S_Z, tau_Y, tau_Z, NN=5):
    def compute(a, b):
        """
        Compute the density at (a, b). The measure proposed in the paper is a
        double integration, over a and b, of the density computed here.
        """
        S = a * S_Y + b * S_Z
        # TODO: not correct, this should be based on the NN neighbours of the input (Y) OR output (Z)
        gcov_YZ = construct_Gcov(S_Y, S_Z, S, NN=NN)  # g_ab in the paper
        gvar_Y = construct_Gvar(S_Y, S, NN=NN)  # g_aa in the paper
        gvar_Z = construct_Gvar(S_Z, S, NN=NN)  # g_bb in the paper
        return compute_density(tau_Y, tau_Z, gvar_Y, gvar_Z, gcov_YZ)
    return compute
def integrate_2d(func, a_range, b_range, nb_iter=100, verbose=1):
    """
    Generic 2d integration of func(a, b) by Monte Carlo, sampling a and b uniformly.
    a_range : tuple defining the interval to sample a from
    b_range : tuple defining the interval to sample b from
    """
    val_mean = 0.
    iters = range(nb_iter)
    if verbose > 0:
        iters = tqdm(iters)
    for i in iters:
        a = np.random.uniform(low=a_range[0], high=a_range[1])
        b = np.random.uniform(low=b_range[0], high=b_range[1])
        val = func(a, b)
        # running mean of the sampled values
        val_mean = (val + i * val_mean) / (i + 1)
    # mean value times the area of the integration domain
    return val_mean * (a_range[1] - a_range[0]) * (b_range[1] - b_range[0])
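
# Illustrative sanity check (not part of the original gist): on a known
# integral, e.g. int_0^1 int_0^1 a*b da db = 0.25, the Monte Carlo estimate
# should land close to the true value given enough samples.
def _check_integrate_2d():
    np.random.seed(0)
    est = integrate_2d(lambda a, b: a * b, (0, 1), (0, 1), nb_iter=20000, verbose=0)
    assert abs(est - 0.25) < 0.01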
def compute_dist(Y, Z, NN=100, k=10, ks=5,
                 nb_iter=None,
                 integration_method='monte_carlo',
                 compute_density=False,
                 density_grid_size=10,
                 nb_integrations=1,
                 metric='euclidean',
                 verbose=1):
    S_Y = pairwise_distances(Y, metric=metric)
    S_Z = pairwise_distances(Z, metric=metric)
    tau_Y = construct_tau(S_Y, k=k, tau0=1, NN=NN, verbose=0)
    tau_Z = construct_tau(S_Z, k=k, tau0=1, NN=NN, verbose=0)
    get_density = build_density_func(S_Y, S_Z, tau_Y, tau_Z, NN=NN)
    if compute_density:
        # evaluate the density on a grid; the reciprocals map the grid onto
        # the integration domain (ks, k)
        a_range = b_range = (1. / k, 1. / ks)
        a = np.linspace(a_range[0], a_range[1], density_grid_size)
        b = np.linspace(b_range[0], b_range[1], density_grid_size)
        density = np.empty((density_grid_size, density_grid_size))
        for i, j in product(np.arange(len(a)), np.arange(len(b))):
            density[i, j] = get_density(1. / a[i], 1. / b[j])
    dists = []
    a_range = b_range = (ks, k)
    for i in range(nb_integrations):
        if integration_method == 'monte_carlo':
            if nb_iter is None:
                nb_iter = 100
            dist = integrate_2d(get_density, a_range, b_range, nb_iter=nb_iter, verbose=verbose)
        elif integration_method == 'quad':
            # integrate over the same (ks, k) domain as the monte carlo branch
            lo, hi = a_range
            dist, err = dblquad(get_density, lo, hi, lambda a: lo, lambda a: hi)
            if verbose > 0:
                print('quad integration abs error : {}'.format(err))
        else:
            raise ValueError('Invalid integration method : {}'.format(integration_method))
        dists.append(dist)
    if compute_density:
        return dists, density
    else:
        return dists, None
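
# A minimal usage sketch (not part of the original gist) showing the
# compute_density=True path, which the module docstring does not cover: it
# returns the density evaluated on a density_grid_size x density_grid_size
# grid alongside the distance estimates.
if __name__ == '__main__':
    from sklearn.datasets import make_swiss_roll
    from sklearn.decomposition import PCA
    Y, _ = make_swiss_roll(500)
    Z = PCA(n_components=2).fit_transform(Y)
    dists, density = compute_dist(Y, Z, NN=50, k=10, ks=5,
                                  compute_density=True,
                                  nb_integrations=1,
                                  integration_method='monte_carlo')
    print("dist to PCA : {}".format(np.mean(dists)))
    print("density grid shape : {}".format(density.shape))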