Search code examples
rggplot2pcaggbiplot

PCA Scaling with ggbiplot


I'm trying to plot a principal component analysis using prcomp and ggbiplot. I'm getting data values outside of the unit circle, and haven't been able to rescale the data prior to calling prcomp in such a way that I can constrain the data to the unit circle.

data(wine)
require(ggbiplot)
wine.pca=prcomp(wine[,1:3],scale.=TRUE)
ggbiplot(wine.pca,obs.scale = 1, 
         var.scale=1,groups=wine.class,ellipse=TRUE,circle=TRUE)

I tried scaling by subtracting mean and dividing by standard deviation before calling prcomp:

wine2=wine[,1:3]
mean=apply(wine2,2,mean)
sd=apply(wine2,2,mean)
for(i in 1:ncol(wine2)){
  wine2[,i]=(wine2[,i]-mean[i])/sd[i]
}
wine2.pca=prcomp(wine2,scale.=TRUE)
ggbiplot(wine2.pca,obs.scale=1, 
         var.scale=1,groups=wine.class,ellipse=TRUE,circle=TRUE)

ggbiplot package installed as follows:

require(devtools)
install_github('ggbiplot','vqv')

Output of either code chunk:

enter image description here

Per @Brian Hanson's comment below, I'm adding an additional image reflecting the output I'm trying to get.

enter image description here


Solution

  • I edited the code for the plot function and was able to get the functionality I wanted.

    ggbiplot2=function(pcobj, choices = 1:2, scale = 1, pc.biplot = TRUE, 
             obs.scale = 1 - scale, var.scale = scale, 
             groups = NULL, ellipse = FALSE, ellipse.prob = 0.68, 
             labels = NULL, labels.size = 3, alpha = 1, 
             var.axes = TRUE, 
             circle = FALSE, circle.prob = 0.69, 
             varname.size = 3, varname.adjust = 1.5, 
             varname.abbrev = FALSE, ...)
    {
      library(ggplot2)
      library(plyr)
      library(scales)
      library(grid)
    
      stopifnot(length(choices) == 2)
    
      # Recover the SVD
      if(inherits(pcobj, 'prcomp')){
        nobs.factor <- sqrt(nrow(pcobj$x) - 1)
        d <- pcobj$sdev
        u <- sweep(pcobj$x, 2, 1 / (d * nobs.factor), FUN = '*')
        v <- pcobj$rotation
      } else if(inherits(pcobj, 'princomp')) {
        nobs.factor <- sqrt(pcobj$n.obs)
        d <- pcobj$sdev
        u <- sweep(pcobj$scores, 2, 1 / (d * nobs.factor), FUN = '*')
        v <- pcobj$loadings
      } else if(inherits(pcobj, 'PCA')) {
        nobs.factor <- sqrt(nrow(pcobj$call$X))
        d <- unlist(sqrt(pcobj$eig)[1])
        u <- sweep(pcobj$ind$coord, 2, 1 / (d * nobs.factor), FUN = '*')
        v <- sweep(pcobj$var$coord,2,sqrt(pcobj$eig[1:ncol(pcobj$var$coord),1]),FUN="/")
      } else {
        stop('Expected a object of class prcomp, princomp or PCA')
      }
    
      # Scores
      df.u <- as.data.frame(sweep(u[,choices], 2, d[choices]^obs.scale, FUN='*'))
    
      # Directions
      v <- sweep(v, 2, d^var.scale, FUN='*')
      df.v <- as.data.frame(v[, choices])
    
      names(df.u) <- c('xvar', 'yvar')
      names(df.v) <- names(df.u)
    
      if(pc.biplot) {
        df.u <- df.u * nobs.factor
      }
    
      # Scale the radius of the correlation circle so that it corresponds to 
      # a data ellipse for the standardized PC scores
      r <- 1
    
      # Scale directions
      v.scale <- rowSums(v^2)
      df.v <- df.v / sqrt(max(v.scale))
    
      ## Scale Scores
      r.scale=sqrt(max(df.u[,1]^2+df.u[,2]^2))
      df.u=.99*df.u/r.scale
    
      # Change the labels for the axes
      if(obs.scale == 0) {
        u.axis.labs <- paste('standardized PC', choices, sep='')
      } else {
        u.axis.labs <- paste('PC', choices, sep='')
      }
    
      # Append the proportion of explained variance to the axis labels
      u.axis.labs <- paste(u.axis.labs, 
                           sprintf('(%0.1f%% explained var.)', 
                                   100 * pcobj$sdev[choices]^2/sum(pcobj$sdev^2)))
    
      # Score Labels
      if(!is.null(labels)) {
        df.u$labels <- labels
      }
    
      # Grouping variable
      if(!is.null(groups)) {
        df.u$groups <- groups
      }
    
      # Variable Names
      if(varname.abbrev) {
        df.v$varname <- abbreviate(rownames(v))
      } else {
        df.v$varname <- rownames(v)
      }
    
      # Variables for text label placement
      df.v$angle <- with(df.v, (180/pi) * atan(yvar / xvar))
      df.v$hjust = with(df.v, (1 - varname.adjust * sign(xvar)) / 2)
    
      # Base plot
      g <- ggplot(data = df.u, aes(x = xvar, y = yvar)) + 
        xlab(u.axis.labs[1]) + ylab(u.axis.labs[2]) + coord_equal()
    
      if(var.axes) {
        # Draw circle
        if(circle) 
        {
          theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50))
          circle <- data.frame(xvar = r * cos(theta), yvar = r * sin(theta))
          g <- g + geom_path(data = circle, color = muted('white'), 
                             size = 1/2, alpha = 1/3)
        }
    
        # Draw directions
        g <- g +
          geom_segment(data = df.v,
                       aes(x = 0, y = 0, xend = xvar, yend = yvar),
                       arrow = arrow(length = unit(1/2, 'picas')), 
                       color = muted('red'))
      }
    
      # Draw either labels or points
      if(!is.null(df.u$labels)) {
        if(!is.null(df.u$groups)) {
          g <- g + geom_text(aes(label = labels, color = groups), 
                             size = labels.size)
        } else {
          g <- g + geom_text(aes(label = labels), size = labels.size)      
        }
      } else {
        if(!is.null(df.u$groups)) {
          g <- g + geom_point(aes(color = groups), alpha = alpha)
        } else {
          g <- g + geom_point(alpha = alpha)      
        }
      }
    
      # Overlay a concentration ellipse if there are groups
      if(!is.null(df.u$groups) && ellipse) {
        theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50))
        circle <- cbind(cos(theta), sin(theta))
    
        ell <- ddply(df.u, 'groups', function(x) {
          if(nrow(x) < 2) {
            return(NULL)
          } else if(nrow(x) == 2) {
            sigma <- var(cbind(x$xvar, x$yvar))
          } else {
            sigma <- diag(c(var(x$xvar), var(x$yvar)))
          }
          mu <- c(mean(x$xvar), mean(x$yvar))
          ed <- sqrt(qchisq(ellipse.prob, df = 2))
          data.frame(sweep(circle %*% chol(sigma) * ed, 2, mu, FUN = '+'), 
                     groups = x$groups[1])
        })
        names(ell)[1:2] <- c('xvar', 'yvar')
        g <- g + geom_path(data = ell, aes(color = groups, group = groups))
      }
    
      # Label the variable axes
      if(var.axes) {
        g <- g + 
          geom_text(data = df.v, 
                    aes(label = varname, x = xvar, y = yvar, 
                        angle = angle, hjust = hjust), 
                    color = 'darkred', size = varname.size)
      }
      # Change the name of the legend for groups
      # if(!is.null(groups)) {
      #   g <- g + scale_color_brewer(name = deparse(substitute(groups)), 
      #                               palette = 'Dark2')
      # }
    
      # TODO: Add a second set of axes
    
      return(g)
    }