Search code examples
rpca

extract principal components from PCA in missMDA


I'm performing a multiple imputation PCA on a dataset that has several missing values in one variable, and I want to extract the first principal component to use in another model, but I can't figure out how to extract it from the results.

# packages
library(tidyverse)
library(missMDA)

# dataset
df <- structure(list(
  A = c(39.64, 39.23, 38.89, 38.63, 38.44, 38.32, 38.27, 38.3, 38.4, 38.56, 38.79, 39.06, 39.36, 
  39.68, 40.01, 40.34, 40.68, 41.05, 41.46, 41.93, 42.48, 43.14, 43.92, 44.84, 45.9, 47.1, 48.4, 
  49.78, 51.2, 52.62, 54.01, 55.31, 56.52, 57.6, 58.54, 59.33, 59.98, 60.46, 60.78, 60.94, 60.92, 
  60.71, 60.3, 59.69, 58.87, 57.86, 56.67, 55.33, 53.87, 52.33, 50.75), 
  B = c(3.29, 3.29, 3.3, 3.32, 3.33, 3.35, 3.36, 3.38, 3.4, 3.42, 3.44, 3.46, 3.48, 3.5, 3.52, 3.53, 
  3.54, 3.55, 3.55, 3.54, 3.53, 3.51, 3.48, 3.44, 3.4, 3.35, 3.3, 3.24, 3.18, 3.12, 3.07, 3.02, 2.98, 
  2.96, 2.94, 2.94, 2.95, 2.98, 3.02, 3.08, 3.14, 3.22, 3.3, 3.4, 3.49, 3.59, 3.7, 3.8, 3.91, 4.02, 
  4.12), 
  C = c(NA, NA, NA, NA, NA, NA, NA, NA, 12.23, 4.3, 3.44, 3.49, 5.59, 0.76, 4.2, 4.61, 4.6, 8.03, 
  4.26, 3.31, 4.16, 0.37, -2.17, -1.93, -3.7, -0.48, -4.63, -2.89, -4.29, -2.35, -2.27, -1.66, -2.82, 
  -3.2, -2.55, -0.9, -2.42, -0.84, -1.46, -5.39, -2.57, -2.12, -0.59, 1.75, 3.7, 0.99, -3.28, -3.2, 
  -0.6, -0.61, -1.94)), class = "data.frame", row.names = c(NA, -51L))

# estimate number of dimensions
pcaDim <- estim_ncpPCA(X = df, method.cv = "Kfold")

# multiple imputation PCA
mi <- MIPCA(X = df, ncp = pcaDim$ncp, scale = TRUE, nboot = 999)

# dataframes that I can extract from mi
# 1. returns the values of one imputed dataset using imputePCA
mi$res.imputePCA
# 2. returns the values of each of the 999 imputed datasets
mi$res.MI
# 3. the kitchen sink - there's so much here and it's really unclear what it all is, but maybe the principal components are here somewhere??
mi$call

Solution

  • I would think if you wanted to have a single principal component from the imputed data, you would probably want to create the component for each imputed dataset, then estimate the model using each imputed component then pool the models together using Rubin's rules. Here's an example with some data I made up that is similar to your data.

    library(mitools)
    library(tidyverse)
    library(missMDA)
    
    ## Make Data
    set.seed(1234)
    X <- MASS::mvrnorm(51, c(40, 3, 12), matrix(c(1,.8, .8, .8 , 1, .85, .8, .85, 1), ncol=3))
    df <- as.data.frame(X)
    names(df) <- c("A", "B", "C")
    df$z <- df$A + rnorm(51, 0, 5)
    pc1 <- princomp(scale(df[,c("A", "B", "C")]), scores = TRUE)
    df$comp1 <- pc1$scores[,1]
    df$y <- with(df, z - comp1 + rnorm(51, 0, 4))
    
    ## induce missingness on C
    df2 <- df
    df2$C[1:8] <- NA
    
    ## estimate dimensionality 
    pcaDim <- estim_ncpPCA(X = df2[,c("A", "B", "C")], method.cv = "Kfold")
    
    # multiple imputation PCA
    mi <- MIPCA(X = df2[,c("A", "B", "C")], ncp = pcaDim$ncp, scale = TRUE, nboot = 2500)
    
    ## Estimate the component for each imputed dataset and save it alongside the other model data
    dat_list<- lapply(1:2500, function(i){
      out <- df2
      out$comp1 <- princomp(scale(mi$res.MI[[i]]), scores=TRUE)$scores[,1]
      out
    })
    
    ## estimate the model for all the different values of `comp1`. 
    mods <- lapply(dat_list, \(d)lm(y ~ z + comp1, data=d))
    
    ## summarise models
    summary(MIcombine(mods))
    #> Multiple imputation results:
    #>       MIcombine.default(mods)
    #>               results         se    (lower     upper) missInfo
    #> (Intercept)  3.509051 4.04150890 -4.412161 11.4302632      0 %
    #> z            0.933240 0.09788341  0.741392  1.1250880      0 %
    #> comp1       -1.180513 0.36088797 -1.887840 -0.4731855      0 %
    

    Created on 2023-06-06 with reprex v2.0.2