import numpy as np


def pca(x, n_components):
    """Principal component analysis of the data.

    Notation:
        m - number of samples
        n - number of dimensions
        k - number of components

    TL;DR:
        (a) compute the covariance matrix x.T@x (x is assumed mean-centered);
        (b) find its k largest eigenvectors, i.e. the eigenvectors
            corresponding to the k largest eigenvalues:
                v = np.linalg.eigh(x.T@x)[1][:, ::-1][:, :k]
        (c) to compress: x_enc = x@v; to decompress: x_rec = [email protected]

    Derivation. Find a pair of matrices V \in R^{n*k}, W \in R^{k*n}
    minimising the reconstruction error:

        V*, W* = argmin_{V, W} \sum_{i=1}^{m} ||x_i - V*W*x_i||^2      (1)

    Lemma. If (V, W) is a solution of (1), then
        (a) V is orthonormal, i.e. V^T*V = I;
        (b) W = V^T.
    Proof: lemma 23.1 in "Understanding Machine Learning" by
    Shalev-Shwartz and Ben-David.

    So we need to find a matrix V* such that

        V* = argmin_{V: V^T*V=I} \sum_{i=1}^{m} ||x_i - V*V^T*x_i||^2  (2)

    Algo #1: minimise (2) directly with gradient descent (the objective
    is not convex, but gradient methods work well for it in practice).
    Algo #2: exact solution, derived below using three properties:

        property 1: ||x - y||^2 = ||x||^2 - 2*y^T*x + y^T*y            (3)
        property 2: (A*B*C)^T = C^T*B^T*A^T                            (4)
        property 3: x, y \in R^{n*1} => x^T*y = trace(x*y^T)           (5)

    Set y = V*V^T*x; by (4), y^T = x^T*V*V^T, so (3) gives:

        ||x - V*V^T*x||^2 = ||x||^2 - 2*x^T*V*V^T*x + x^T*V*V^T*V*V^T*x

    Since V^T*V = I:

        ||x - V*V^T*x||^2 = ||x||^2 - x^T*V*V^T*x

    Applying (5) with x := V^T*x, y := V^T*x:

        ||x - V*V^T*x||^2 = ||x||^2 - trace(V^T*x*x^T*V)

    The ||x_i||^2 terms do not depend on V, and the trace is a linear
    operator, so (2) is equivalent to:

        V* = argmax_{V: V^T*V=I} trace(V^T * (\sum_{i=1}^{m} x_i*x_i^T) * V)  (6)

    A = \sum_{i} x_i*x_i^T \in R^{n*n} is the covariance matrix! It is
    symmetric, so it admits a spectral decomposition A = V*D*V^T.

    Theorem. If A = \sum_{i} x_i*x_i^T, then the solution to (1) is the
    matrix V \in R^{n*k} whose columns are the k eigenvectors of A
    corresponding to the k largest eigenvalues, with W = V^T \in R^{k*n}.
    Proof: theorem 23.2 in "Understanding Machine Learning" by
    Shalev-Shwartz and Ben-David.

    So to find V we need the k largest eigenvalues of A and their
    eigenvectors, which we obtain here via the spectral decomposition of
    X^T@X. Complexity of this vanilla version: O(n^3).

    Another variant (useful when n >> m): get the eigenvectors U of the
    m*m matrix B = X@X^T; then the columns of X^T@U, normalised to unit
    length, are eigenvectors of A = X^T@X (a sketch of this variant
    follows the function below).

    Parameters
    ----------
    x: numpy array [m, n]
        data matrix, assumed to be mean-centered
    n_components: int
        number of principal components to extract

    Returns
    -------
    v: numpy array [n, n_components]
        orthonormal matrix whose columns are the eigenvectors of X^T@X
        with the largest eigenvalues; to compress some input x: x@v,
        to decompress the compressed values x_enc: [email protected]
    """
    # x.T @ x is the covariance matrix A from the docstring
    # (up to a constant factor, and assuming x is mean-centered)
    covariance_matrix = x.T @ x
    # eigh is the appropriate routine for a symmetric matrix: it returns
    # real eigenvalues in ascending order with orthonormal eigenvectors
    # (plain eig does not guarantee any ordering)
    cov_eigen_values, cov_eigen_vectors = np.linalg.eigh(covariance_matrix)
    # reverse the column order to sort eigenvalues in descending order,
    # then keep the top n_components eigenvectors
    v = cov_eigen_vectors[:, ::-1][:, :n_components]
    return v
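

# A sketch of the "n >> m" variant mentioned in the docstring: when the
# number of features n exceeds the number of samples m, it is cheaper to
# decompose the m*m Gram matrix X@X^T and map its eigenvectors back to
# eigenvectors of X^T@X. The helper name pca_gram is ours, not part of the
# original gist, and it assumes k does not exceed the rank of x.
def pca_gram(x, n_components):
    """PCA via the m*m Gram matrix x@x.T, useful when n >> m."""
    gram = x @ x.T  # [m, m] instead of [n, n]
    gram_eigen_values, u = np.linalg.eigh(gram)  # ascending eigenvalues
    u = u[:, ::-1][:, :n_components]  # top-k eigenvectors of x@x.T
    # if B@u = lambda*u for B = x@x.T, then A@(x.T@u) = lambda*(x.T@u)
    # for A = x.T@x, so x.T@u is an (unnormalised) eigenvector of A
    v = x.T @ u
    v = v / np.linalg.norm(v, axis=0, keepdims=True)  # unit-length columns
    return v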
# DEMO

# Housing-style regression data. Note: load_boston was removed from
# scikit-learn in version 1.2, so the diabetes dataset is used here instead;
# the demo works the same way with any [m, n] data matrix.
from sklearn.datasets import load_diabetes
from sklearn.decomposition import PCA

ds = load_diabetes()
x = ds['data']
# center the data: sklearn's PCA subtracts the mean internally, and our
# derivation assumes x.T@x is the covariance matrix, which requires zero mean
x = x - x.mean(axis=0)

k = 2
print("number of components: %d" % k)

pca_sk = PCA(n_components=k)
pca_sk.fit(x)
x_rec_sk = pca_sk.inverse_transform(pca_sk.transform(x))
mse_sk = np.mean(np.sum(np.square(x - x_rec_sk), axis=1))
print("MSE sklearn: %f" % mse_sk)

v = pca(x, k)
x_rec_ours = x @ v @ v.T
mse_ours = np.mean(np.sum(np.square(x - x_rec_ours), axis=1))
print("MSE ours: %f" % mse_ours)
# Synthetic data: principal components visualization
import matplotlib.pyplot as plt

x = np.random.normal(size=[1000, 2])  # isotropic source data
A = np.random.normal(size=[2, 2])  # random linear mixing matrix
print(A)
xA = x @ A  # linearly correlated 2D data

v = pca(xA, 2)

plt.scatter(xA[:, 0], xA[:, 1], label='data')
plt.plot([0, v[0, 0]], [0, v[1, 0]], c='blue', label='first principal component')
plt.plot([0, v[0, 1]], [0, v[1, 1]], c='red', label='second principal component')
plt.legend()
plt.xlim(-3, 3)
plt.ylim(-3, 3)
plt.show()
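
# The docstring notes that a full eigendecomposition costs O(n^3). When only
# the top-k eigenvectors are needed, orthogonal (block power) iteration is a
# common cheaper alternative. This helper is our sketch, not part of the
# original gist; it assumes the top-k eigenvalues are distinct.
def top_k_eigenvectors(a, k, n_iters=100):
    """Approximate the k leading eigenvectors of a symmetric matrix a."""
    q = np.random.normal(size=[a.shape[0], k])
    for _ in range(n_iters):
        q, _ = np.linalg.qr(a @ q)  # multiply by a, then re-orthonormalise
    return q

q = top_k_eigenvectors(xA.T @ xA, 2)
# the columns should match the eigh-based solution up to sign,
# i.e. |q^T v| should be close to the identity matrix
print(np.abs(q.T @ v))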