-
-
Save agramfort/726574 to your computer and use it in GitHub Desktop.
| """ | |
| Author: Oliver Mitevski | |
| References: | |
| A Generalized Linear Model for Principal Component Analysis of Binary Data, | |
| Andrew I. Schein; Lawrence K. Saul; Lyle H. Ungar | |
| The code was translated and adapted from Jakob Verbeek's | |
| "Hidden Markov models and mixtures for Binary PCA" implementation in MATLAB | |
| """ | |
| import sys | |
| import numpy as np | |
| from scipy import linalg | |
| # from scipy import * | |
| # from numpy import * | |
| # from scipy.linalg import diagsvd, svd | |
| # from scipy.sparse import linalg | |
| # import cPickle, gzip | |
| import time | |
| import pylab as pl | |
| def bpca(X, L=2, max_iters=30): | |
| X = np.asanyarray(X, dtype=np.float) | |
| N, D = X.shape | |
| # x = X | |
| X = 2*X - 1 | |
| delta = np.random.permutation(N) | |
| Delta = X[delta[0], :] / 100 | |
| U = 1e-4 * np.random.random( (N, L) ) | |
| V = 1e-4 * np.random.random( (L, D) ) | |
| #for c=1:C; Th(:,:,c) = U(:,:,c)*V(:,:,c) + ones(N,1)*Delta(c,:); end; | |
| Th = np.zeros((N, D)) | |
| Th = np.dot(U, V) + Delta[None,:] # with broadcasting | |
| for iter in range(max_iters): | |
| print iter | |
| # Update U | |
| T = np.tanh(Th/2) / Th | |
| B = np.dot(V, (X - T*Delta[None,:]).T) | |
| for n in range(N): | |
| A = np.dot(V * T[n,:][None,:], V.T) | |
| U[n,:] = linalg.solve(A, B[:,n]).T | |
| Th = np.dot(U, V) + Delta[None,:] # with broadcasting | |
| Q = np.random.random(N) | |
| # normalize it | |
| # Q = np.sqrt(np.dot(Q, Q.conj())) # XXX weird Q is a float | |
| Q = linalg.norm(Q) # gives the same result | |
| # Update V | |
| T = np.tanh(Th/2) / Th | |
| U2 = np.c_[U, np.ones((N,1))] | |
| # U2 = U2 * np.tile(Q, (L+1, 1)).T | |
| U2 *= Q # XXX : equivalent as above if Q is a float | |
| B = np.dot(U2.T, X) | |
| Uo = np.c_[U, np.ones((N, 1))] | |
| for d in range(D): | |
| A = np.dot(U2.T * T[:, d][None,:], Uo) | |
| V2 = linalg.solve(A, B[:, d]) | |
| Delta[d] = V2[-1] | |
| if L > 0: | |
| V[:, d] = V2[0:-1] | |
| Th = np.dot(U, V) + Delta[None,:] # with broadcasting | |
| print U.shape | |
| # plotM1(U[0:10000:1,0:2], labels[0:10000:10]) | |
| U1, S, V = linalg.svd(U) | |
| Vh = V.T | |
| U1 = np.mat(U1[:, 0:L]) | |
| # Vh = np.mat(Vh[0:L, :]) | |
| Vh = Vh[0:L, :] | |
| codes = np.dot(U, Vh.T) | |
| return codes | |
| def main(): | |
| # Load the dataset | |
| from save_load import save_file, load_file | |
| try: | |
| input_data = load_file(sys.argv[1]) | |
| sparse = False | |
| except: | |
| print 'loading sparse' | |
| sparse = True | |
| from scipy.io import mmio as mm | |
| input_data = mm.mmread(sys.argv[1]) | |
| input_data = np.array(input_data.todense()) | |
| N, D = input_data.shape | |
| print N, D | |
| # make data binary | |
| # for i in range(N): | |
| # for j in range(D): | |
| # if input_data[i,j] > 0.0: | |
| # input_data[i,j] = 1.0 | |
| # save_file('news-10-stemmed.index2200/binDict', input_data) | |
| # print 'saved binary' | |
| X = input_data | |
| labels = load_file(sys.argv[2]) | |
| start = time.clock() | |
| codes = bpca(X, L=2, max_iters = 20) | |
| print "Time for computing binary PCA took", time.clock()-start | |
| save_file(sys.argv[3], codes) | |
| if __name__ == "__main__": | |
| n_samples, n_features = 10, 30 | |
| np.random.seed(0) | |
| X = np.random.randn(n_samples, n_features) | |
| codes = bpca(X, L=2, max_iters=20) | |
| print codes | |
| # main() |
great. Double check that I did not introduce any bug.
could you put this code in the same API as PCA in the scikit-learn? with fit and transform? It's not clear from your code what are the components estimated and how you would transform a new dataset ie. compute the PCA scores aka codes on a new X. Do you have access to something similar as the explained variance of each component?
It seems to work. I know what you mean, but it's not clear to me either how the transform could be done, I should go back to the paper. If this method turns to be non-parametric, then it's not so great. I kinda lost interest since it performs better for non-binary data than for binary.
good if it (sill) works. I wanted to use it to find latent components for a binary matrix. How did you evaluate the performance?
just by visualization of the labels of the 20-newsgroups. They seemed kind of clustered, so it's working.
that's a fairly empiric validation :) I would have expected "it reproduces the results form the paper" :)
maybe it should be tested on different datasets. The one I used (tf-idf feature vectors) may not be the most suitable. Nevertheless it didn't fail, so there's hope ;)
thanks perfect!
I'll give it a try soon.