willtownes · July 18, 2018 02:36
diff --git a/pca_missing.R b/pca_missing.R
 #gradient descent probabilistic PCA with missing data
 #Will Townes

 L<-2 #number of latent dimensions, more dims=more complexity
 #tune these hyperparameters until get good results
 penalty<- 1e-8
 learn_rate<- .1
 niter<-100000 #number of iterations

 data(iris)
 Y<-t(as.matrix(iris[,1:4]))
 #first PC separates the species with complete data
 pca_factors<-prcomp(t(Y))$x
 plot(pca_factors[,1],pca_factors[,2],col=factor(iris$Species),main="PCA with complete data")

 #apply missinginess
 missing_rate<-.25
 N<-ncol(Y)
 G<-nrow(Y)
 Z<-rbinom(N*G,1,1-missing_rate)
 Y[Z==0]<-0 #0 is missing value
 mean(Y==0)
 #regular PCA doesn't work anymore with missing data
 Y2<-Y
 Y2[Y==0]<-mean(Y2[Y!=0])
 pca_factors<-prcomp(t(Y2))$x
 plot(pca_factors[,1],pca_factors[,2],col=factor(iris$Species),main="PCA with missing data- imputation")

 #probabilistic PCA (MAP solution to Gaussian bilinear model)
 #y_ng = w_g + u_n'v_g + random_error
 Z<-Y!=0
 nvals<-sum(Z)
 U<-matrix(rnorm(N*L),nrow=N)
 V<-matrix(rnorm(G*L),nrow=G)
 w<-apply(Y,1,function(x){mean(x[x!=0])}) #intercepts for each variable
 rmse<-rep(NA,niter)
 mu<-w + V %*% t(U) #recycles w
 err<- Y - mu*Z #multiply by Z to mask out the missing values
 for(t in 1:niter){
  #goal is to minimize the mean squared error (with regularization)
  #update U
  grad_u<- (-t(err)%*%V + penalty*U)/nvals
  U<-U-learn_rate*grad_u
  err<- Y-Z*(w+V%*%t(U))
  #update V and w
  grad_w<- -rowSums(err)/nvals
  grad_v<- (-err%*%U + penalty*V)/nvals
  V<-V-learn_rate*grad_v
  w<-w-learn_rate*grad_w
  err<- Y-Z*(w+V%*%t(U))
  rmse[t]<-sqrt(sum(err^2)/nvals)
 }
 plot(rmse,type="l") #decreasing error as optimization goes along
 #does our latent space represent the species well?
 plot(U[,1],U[,2],col=factor(iris$Species),main="Probabilistic PCA- gradient descent")
 #note we did not apply any orthogonalizing or centering post-processing since this is a low-dimensional dataset. So, the loadings vectors are not orthogonal and the factors don't have mean zero. 

 #compare to bioconductor package pcaMethods
 #this package uses E-M algorithm instead of gradient descent
 library(pcaMethods)
 Y[Z==0]<-NA
 ppca_factors<-scores(ppca(t(Y)))
 plot(ppca_factors[,1],ppca_factors[,2],col=factor(iris$Species),main="PPCA from pcaMethods package")
	#gradient descent probabilistic PCA with missing data
	#Will Townes

	L<-2 #number of latent dimensions, more dims=more complexity
	#tune these hyperparameters until get good results
	penalty<- 1e-8
	learn_rate<- .1
	niter<-100000 #number of iterations

	data(iris)
	Y<-t(as.matrix(iris[,1:4]))
	#first PC separates the species with complete data
	pca_factors<-prcomp(t(Y))$x
	plot(pca_factors[,1],pca_factors[,2],col=factor(iris$Species),main="PCA with complete data")

	#apply missinginess
	missing_rate<-.25
	N<-ncol(Y)
	G<-nrow(Y)
	Z<-rbinom(N*G,1,1-missing_rate)
	Y[Z==0]<-0 #0 is missing value
	mean(Y==0)
	#regular PCA doesn't work anymore with missing data
	Y2<-Y
	Y2[Y==0]<-mean(Y2[Y!=0])
	pca_factors<-prcomp(t(Y2))$x
	plot(pca_factors[,1],pca_factors[,2],col=factor(iris$Species),main="PCA with missing data- imputation")

	#probabilistic PCA (MAP solution to Gaussian bilinear model)
	#y_ng = w_g + u_n'v_g + random_error
	Z<-Y!=0
	nvals<-sum(Z)
	U<-matrix(rnorm(N*L),nrow=N)
	V<-matrix(rnorm(G*L),nrow=G)
	w<-apply(Y,1,function(x){mean(x[x!=0])}) #intercepts for each variable
	rmse<-rep(NA,niter)
	mu<-w + V %*% t(U) #recycles w
	err<- Y - mu*Z #multiply by Z to mask out the missing values
	for(t in 1:niter){
	#goal is to minimize the mean squared error (with regularization)
	#update U
	grad_u<- (-t(err)%%V + penaltyU)/nvals
	U<-U-learn_rate*grad_u
	err<- Y-Z(w+V%%t(U))
	#update V and w
	grad_w<- -rowSums(err)/nvals
	grad_v<- (-err%%U + penaltyV)/nvals
	V<-V-learn_rate*grad_v
	w<-w-learn_rate*grad_w
	err<- Y-Z(w+V%%t(U))
	rmse[t]<-sqrt(sum(err^2)/nvals)
	}
	plot(rmse,type="l") #decreasing error as optimization goes along
	#does our latent space represent the species well?
	plot(U[,1],U[,2],col=factor(iris$Species),main="Probabilistic PCA- gradient descent")
	#note we did not apply any orthogonalizing or centering post-processing since this is a low-dimensional dataset. So, the loadings vectors are not orthogonal and the factors don't have mean zero.

	#compare to bioconductor package pcaMethods
	#this package uses E-M algorithm instead of gradient descent
	library(pcaMethods)
	Y[Z==0]<-NA
	ppca_factors<-scores(ppca(t(Y)))
	plot(ppca_factors[,1],ppca_factors[,2],col=factor(iris$Species),main="PPCA from pcaMethods package")