willtownes · July 18, 2018 02:36 · Jul 18, 2018
diff --git a/pca_missing.R b/pca_missing.R
@@ -0,0 +1,63 @@
+#gradient descent probabilistic PCA with missing data
+#Will Townes
+
+L<-2 #number of latent dimensions, more dims=more complexity
+#tune these hyperparameters until get good results
+penalty<- 1e-8
+learn_rate<- .1
+niter<-100000 #number of iterations
+
+data(iris)
+Y<-t(as.matrix(iris[,1:4]))
+#first PC separates the species with complete data
+pca_factors<-prcomp(t(Y))$x
+plot(pca_factors[,1],pca_factors[,2],col=factor(iris$Species),main="PCA with complete data")
+
+#apply missinginess
+missing_rate<-.25
+N<-ncol(Y)
+G<-nrow(Y)
+Z<-rbinom(N*G,1,1-missing_rate)
+Y[Z==0]<-0 #0 is missing value
+mean(Y==0)
+#regular PCA doesn't work anymore with missing data
+Y2<-Y
+Y2[Y==0]<-mean(Y2[Y!=0])
+pca_factors<-prcomp(t(Y2))$x
+plot(pca_factors[,1],pca_factors[,2],col=factor(iris$Species),main="PCA with missing data- imputation")
+
+#probabilistic PCA (MAP solution to Gaussian bilinear model)
+#y_ng = w_g + u_n'v_g + random_error
+Z<-Y!=0
+nvals<-sum(Z)
+U<-matrix(rnorm(N*L),nrow=N)
+V<-matrix(rnorm(G*L),nrow=G)
+w<-apply(Y,1,function(x){mean(x[x!=0])}) #intercepts for each variable
+rmse<-rep(NA,niter)
+mu<-w + V %*% t(U) #recycles w
+err<- Y - mu*Z #multiply by Z to mask out the missing values
+for(t in 1:niter){
+  #goal is to minimize the mean squared error (with regularization)
+  #update U
+  grad_u<- (-t(err)%*%V + penalty*U)/nvals
+  U<-U-learn_rate*grad_u
+  err<- Y-Z*(w+V%*%t(U))
+  #update V and w
+  grad_w<- -rowSums(err)/nvals
+  grad_v<- (-err%*%U + penalty*V)/nvals
+  V<-V-learn_rate*grad_v
+  w<-w-learn_rate*grad_w
+  err<- Y-Z*(w+V%*%t(U))
+  rmse[t]<-sqrt(sum(err^2)/nvals)
+}
+plot(rmse,type="l") #decreasing error as optimization goes along
+#does our latent space represent the species well?
+plot(U[,1],U[,2],col=factor(iris$Species),main="Probabilistic PCA- gradient descent")
+#note we did not apply any orthogonalizing or centering post-processing since this is a low-dimensional dataset. So, the loadings vectors are not orthogonal and the factors don't have mean zero. 
+
+#compare to bioconductor package pcaMethods
+#this package uses E-M algorithm instead of gradient descent
+library(pcaMethods)
+Y[Z==0]<-NA
+ppca_factors<-scores(ppca(t(Y)))
+plot(ppca_factors[,1],ppca_factors[,2],col=factor(iris$Species),main="PPCA from pcaMethods package")
No results found