Created
July 18, 2018 02:36
-
-
Save willtownes/b28a410d8a3a6a52b1725ba298149425 to your computer and use it in GitHub Desktop.
Probabilistic Principal Components Analysis with incomplete data via gradient descent
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#gradient descent probabilistic PCA with missing data | |
#Will Townes | |
L<-2 #number of latent dimensions, more dims=more complexity | |
#tune these hyperparameters until get good results | |
penalty<- 1e-8 | |
learn_rate<- .1 | |
niter<-100000 #number of iterations | |
data(iris) | |
Y<-t(as.matrix(iris[,1:4])) | |
#first PC separates the species with complete data | |
pca_factors<-prcomp(t(Y))$x | |
plot(pca_factors[,1],pca_factors[,2],col=factor(iris$Species),main="PCA with complete data") | |
#apply missinginess | |
missing_rate<-.25 | |
N<-ncol(Y) | |
G<-nrow(Y) | |
Z<-rbinom(N*G,1,1-missing_rate) | |
Y[Z==0]<-0 #0 is missing value | |
mean(Y==0) | |
#regular PCA doesn't work anymore with missing data | |
Y2<-Y | |
Y2[Y==0]<-mean(Y2[Y!=0]) | |
pca_factors<-prcomp(t(Y2))$x | |
plot(pca_factors[,1],pca_factors[,2],col=factor(iris$Species),main="PCA with missing data- imputation") | |
#probabilistic PCA (MAP solution to Gaussian bilinear model) | |
#y_ng = w_g + u_n'v_g + random_error | |
Z<-Y!=0 | |
nvals<-sum(Z) | |
U<-matrix(rnorm(N*L),nrow=N) | |
V<-matrix(rnorm(G*L),nrow=G) | |
w<-apply(Y,1,function(x){mean(x[x!=0])}) #intercepts for each variable | |
rmse<-rep(NA,niter) | |
mu<-w + V %*% t(U) #recycles w | |
err<- Y - mu*Z #multiply by Z to mask out the missing values | |
for(t in 1:niter){ | |
#goal is to minimize the mean squared error (with regularization) | |
#update U | |
grad_u<- (-t(err)%*%V + penalty*U)/nvals | |
U<-U-learn_rate*grad_u | |
err<- Y-Z*(w+V%*%t(U)) | |
#update V and w | |
grad_w<- -rowSums(err)/nvals | |
grad_v<- (-err%*%U + penalty*V)/nvals | |
V<-V-learn_rate*grad_v | |
w<-w-learn_rate*grad_w | |
err<- Y-Z*(w+V%*%t(U)) | |
rmse[t]<-sqrt(sum(err^2)/nvals) | |
} | |
plot(rmse,type="l") #decreasing error as optimization goes along | |
#does our latent space represent the species well? | |
plot(U[,1],U[,2],col=factor(iris$Species),main="Probabilistic PCA- gradient descent") | |
#note we did not apply any orthogonalizing or centering post-processing since this is a low-dimensional dataset. So, the loadings vectors are not orthogonal and the factors don't have mean zero. | |
#compare to bioconductor package pcaMethods | |
#this package uses E-M algorithm instead of gradient descent | |
library(pcaMethods) | |
Y[Z==0]<-NA | |
ppca_factors<-scores(ppca(t(Y))) | |
plot(ppca_factors[,1],ppca_factors[,2],col=factor(iris$Species),main="PPCA from pcaMethods package") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment