Created
July 18, 2018 02:36
-
-
Save willtownes/b28a410d8a3a6a52b1725ba298149425 to your computer and use it in GitHub Desktop.
Revisions
-
willtownes created this gist
Jul 18, 2018 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,63 @@ #gradient descent probabilistic PCA with missing data #Will Townes L<-2 #number of latent dimensions, more dims=more complexity #tune these hyperparameters until get good results penalty<- 1e-8 learn_rate<- .1 niter<-100000 #number of iterations data(iris) Y<-t(as.matrix(iris[,1:4])) #first PC separates the species with complete data pca_factors<-prcomp(t(Y))$x plot(pca_factors[,1],pca_factors[,2],col=factor(iris$Species),main="PCA with complete data") #apply missinginess missing_rate<-.25 N<-ncol(Y) G<-nrow(Y) Z<-rbinom(N*G,1,1-missing_rate) Y[Z==0]<-0 #0 is missing value mean(Y==0) #regular PCA doesn't work anymore with missing data Y2<-Y Y2[Y==0]<-mean(Y2[Y!=0]) pca_factors<-prcomp(t(Y2))$x plot(pca_factors[,1],pca_factors[,2],col=factor(iris$Species),main="PCA with missing data- imputation") #probabilistic PCA (MAP solution to Gaussian bilinear model) #y_ng = w_g + u_n'v_g + random_error Z<-Y!=0 nvals<-sum(Z) U<-matrix(rnorm(N*L),nrow=N) V<-matrix(rnorm(G*L),nrow=G) w<-apply(Y,1,function(x){mean(x[x!=0])}) #intercepts for each variable rmse<-rep(NA,niter) mu<-w + V %*% t(U) #recycles w err<- Y - mu*Z #multiply by Z to mask out the missing values for(t in 1:niter){ #goal is to minimize the mean squared error (with regularization) #update U grad_u<- (-t(err)%*%V + penalty*U)/nvals U<-U-learn_rate*grad_u err<- Y-Z*(w+V%*%t(U)) #update V and w grad_w<- -rowSums(err)/nvals grad_v<- (-err%*%U + penalty*V)/nvals V<-V-learn_rate*grad_v w<-w-learn_rate*grad_w err<- Y-Z*(w+V%*%t(U)) rmse[t]<-sqrt(sum(err^2)/nvals) } plot(rmse,type="l") #decreasing error as optimization goes along #does our latent space represent the species well? plot(U[,1],U[,2],col=factor(iris$Species),main="Probabilistic PCA- gradient descent") #note we did not apply any orthogonalizing or centering post-processing since this is a low-dimensional dataset. So, the loadings vectors are not orthogonal and the factors don't have mean zero. #compare to bioconductor package pcaMethods #this package uses E-M algorithm instead of gradient descent library(pcaMethods) Y[Z==0]<-NA ppca_factors<-scores(ppca(t(Y))) plot(ppca_factors[,1],ppca_factors[,2],col=factor(iris$Species),main="PPCA from pcaMethods package")