Skip to content

Instantly share code, notes, and snippets.

@willtownes
Created July 18, 2018 02:36
Show Gist options
  • Select an option

  • Save willtownes/b28a410d8a3a6a52b1725ba298149425 to your computer and use it in GitHub Desktop.

Select an option

Save willtownes/b28a410d8a3a6a52b1725ba298149425 to your computer and use it in GitHub Desktop.

Revisions

  1. willtownes created this gist Jul 18, 2018.
    63 changes: 63 additions & 0 deletions pca_missing.R
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,63 @@
    #gradient descent probabilistic PCA with missing data
    #Will Townes

    L<-2 #number of latent dimensions, more dims=more complexity
    #tune these hyperparameters until get good results
    penalty<- 1e-8
    learn_rate<- .1
    niter<-100000 #number of iterations

    data(iris)
    Y<-t(as.matrix(iris[,1:4]))
    #first PC separates the species with complete data
    pca_factors<-prcomp(t(Y))$x
    plot(pca_factors[,1],pca_factors[,2],col=factor(iris$Species),main="PCA with complete data")

    #apply missinginess
    missing_rate<-.25
    N<-ncol(Y)
    G<-nrow(Y)
    Z<-rbinom(N*G,1,1-missing_rate)
    Y[Z==0]<-0 #0 is missing value
    mean(Y==0)
    #regular PCA doesn't work anymore with missing data
    Y2<-Y
    Y2[Y==0]<-mean(Y2[Y!=0])
    pca_factors<-prcomp(t(Y2))$x
    plot(pca_factors[,1],pca_factors[,2],col=factor(iris$Species),main="PCA with missing data- imputation")

    #probabilistic PCA (MAP solution to Gaussian bilinear model)
    #y_ng = w_g + u_n'v_g + random_error
    Z<-Y!=0
    nvals<-sum(Z)
    U<-matrix(rnorm(N*L),nrow=N)
    V<-matrix(rnorm(G*L),nrow=G)
    w<-apply(Y,1,function(x){mean(x[x!=0])}) #intercepts for each variable
    rmse<-rep(NA,niter)
    mu<-w + V %*% t(U) #recycles w
    err<- Y - mu*Z #multiply by Z to mask out the missing values
    for(t in 1:niter){
    #goal is to minimize the mean squared error (with regularization)
    #update U
    grad_u<- (-t(err)%*%V + penalty*U)/nvals
    U<-U-learn_rate*grad_u
    err<- Y-Z*(w+V%*%t(U))
    #update V and w
    grad_w<- -rowSums(err)/nvals
    grad_v<- (-err%*%U + penalty*V)/nvals
    V<-V-learn_rate*grad_v
    w<-w-learn_rate*grad_w
    err<- Y-Z*(w+V%*%t(U))
    rmse[t]<-sqrt(sum(err^2)/nvals)
    }
    plot(rmse,type="l") #decreasing error as optimization goes along
    #does our latent space represent the species well?
    plot(U[,1],U[,2],col=factor(iris$Species),main="Probabilistic PCA- gradient descent")
    #note we did not apply any orthogonalizing or centering post-processing since this is a low-dimensional dataset. So, the loadings vectors are not orthogonal and the factors don't have mean zero.

    #compare to bioconductor package pcaMethods
    #this package uses E-M algorithm instead of gradient descent
    library(pcaMethods)
    Y[Z==0]<-NA
    ppca_factors<-scores(ppca(t(Y)))
    plot(ppca_factors[,1],ppca_factors[,2],col=factor(iris$Species),main="PPCA from pcaMethods package")