#gradient descent probabilistic PCA with missing data #Will Townes L<-2 #number of latent dimensions, more dims=more complexity #tune these hyperparameters until get good results penalty<- 1e-8 learn_rate<- .1 niter<-100000 #number of iterations data(iris) Y<-t(as.matrix(iris[,1:4])) #first PC separates the species with complete data pca_factors<-prcomp(t(Y))$x plot(pca_factors[,1],pca_factors[,2],col=factor(iris$Species),main="PCA with complete data") #apply missinginess missing_rate<-.25 N<-ncol(Y) G<-nrow(Y) Z<-rbinom(N*G,1,1-missing_rate) Y[Z==0]<-0 #0 is missing value mean(Y==0) #regular PCA doesn't work anymore with missing data Y2<-Y Y2[Y==0]<-mean(Y2[Y!=0]) pca_factors<-prcomp(t(Y2))$x plot(pca_factors[,1],pca_factors[,2],col=factor(iris$Species),main="PCA with missing data- imputation") #probabilistic PCA (MAP solution to Gaussian bilinear model) #y_ng = w_g + u_n'v_g + random_error Z<-Y!=0 nvals<-sum(Z) U<-matrix(rnorm(N*L),nrow=N) V<-matrix(rnorm(G*L),nrow=G) w<-apply(Y,1,function(x){mean(x[x!=0])}) #intercepts for each variable rmse<-rep(NA,niter) mu<-w + V %*% t(U) #recycles w err<- Y - mu*Z #multiply by Z to mask out the missing values for(t in 1:niter){ #goal is to minimize the mean squared error (with regularization) #update U grad_u<- (-t(err)%*%V + penalty*U)/nvals U<-U-learn_rate*grad_u err<- Y-Z*(w+V%*%t(U)) #update V and w grad_w<- -rowSums(err)/nvals grad_v<- (-err%*%U + penalty*V)/nvals V<-V-learn_rate*grad_v w<-w-learn_rate*grad_w err<- Y-Z*(w+V%*%t(U)) rmse[t]<-sqrt(sum(err^2)/nvals) } plot(rmse,type="l") #decreasing error as optimization goes along #does our latent space represent the species well? plot(U[,1],U[,2],col=factor(iris$Species),main="Probabilistic PCA- gradient descent") #note we did not apply any orthogonalizing or centering post-processing since this is a low-dimensional dataset. So, the loadings vectors are not orthogonal and the factors don't have mean zero. #compare to bioconductor package pcaMethods #this package uses E-M algorithm instead of gradient descent library(pcaMethods) Y[Z==0]<-NA ppca_factors<-scores(ppca(t(Y))) plot(ppca_factors[,1],ppca_factors[,2],col=factor(iris$Species),main="PPCA from pcaMethods package")