extract principal components from PCA in missMDA

I'm performing a multiple imputation PCA on a dataset that has several missing values in one variable, and I want to extract the first principal component to use in another model, but I can't figure out how to extract it from the results.

# packages
library(tidyverse)
library(missMDA)

# dataset
df <- structure(list(
  A = c(39.64, 39.23, 38.89, 38.63, 38.44, 38.32, 38.27, 38.3, 38.4, 38.56, 38.79, 39.06, 39.36, 
  39.68, 40.01, 40.34, 40.68, 41.05, 41.46, 41.93, 42.48, 43.14, 43.92, 44.84, 45.9, 47.1, 48.4, 
  49.78, 51.2, 52.62, 54.01, 55.31, 56.52, 57.6, 58.54, 59.33, 59.98, 60.46, 60.78, 60.94, 60.92, 
  60.71, 60.3, 59.69, 58.87, 57.86, 56.67, 55.33, 53.87, 52.33, 50.75), 
  B = c(3.29, 3.29, 3.3, 3.32, 3.33, 3.35, 3.36, 3.38, 3.4, 3.42, 3.44, 3.46, 3.48, 3.5, 3.52, 3.53, 
  3.54, 3.55, 3.55, 3.54, 3.53, 3.51, 3.48, 3.44, 3.4, 3.35, 3.3, 3.24, 3.18, 3.12, 3.07, 3.02, 2.98, 
  2.96, 2.94, 2.94, 2.95, 2.98, 3.02, 3.08, 3.14, 3.22, 3.3, 3.4, 3.49, 3.59, 3.7, 3.8, 3.91, 4.02, 
  4.12), 
  C = c(NA, NA, NA, NA, NA, NA, NA, NA, 12.23, 4.3, 3.44, 3.49, 5.59, 0.76, 4.2, 4.61, 4.6, 8.03, 
  4.26, 3.31, 4.16, 0.37, -2.17, -1.93, -3.7, -0.48, -4.63, -2.89, -4.29, -2.35, -2.27, -1.66, -2.82, 
  -3.2, -2.55, -0.9, -2.42, -0.84, -1.46, -5.39, -2.57, -2.12, -0.59, 1.75, 3.7, 0.99, -3.28, -3.2, 
  -0.6, -0.61, -1.94)), class = "data.frame", row.names = c(NA, -51L))

# estimate number of dimensions
pcaDim <- estim_ncpPCA(X = df, method.cv = "Kfold")

# multiple imputation PCA
mi <- MIPCA(X = df, ncp = pcaDim$ncp, scale = TRUE, nboot = 999)

# dataframes that I can extract from mi
# 1. returns the values of one imputed dataset using imputePCA
mi$res.imputePCA
# 2. returns the values of each of the 999 imputed datasets
mi$res.MI
# 3. the kitchen sink - there's so much here and it's really unclear what it all is, but maybe the principal components are here somewhere??
mi$call

Solution

I would think if you wanted to have a single principal component from the imputed data, you would probably want to create the component for each imputed dataset, then estimate the model using each imputed component then pool the models together using Rubin's rules. Here's an example with some data I made up that is similar to your data.

library(mitools)
library(tidyverse)
library(missMDA)

## Make Data
set.seed(1234)
X <- MASS::mvrnorm(51, c(40, 3, 12), matrix(c(1,.8, .8, .8 , 1, .85, .8, .85, 1), ncol=3))
df <- as.data.frame(X)
names(df) <- c("A", "B", "C")
df$z <- df$A + rnorm(51, 0, 5)
pc1 <- princomp(scale(df[,c("A", "B", "C")]), scores = TRUE)
df$comp1 <- pc1$scores[,1]
df$y <- with(df, z - comp1 + rnorm(51, 0, 4))

## induce missingness on C
df2 <- df
df2$C[1:8] <- NA

## estimate dimensionality 
pcaDim <- estim_ncpPCA(X = df2[,c("A", "B", "C")], method.cv = "Kfold")

# multiple imputation PCA
mi <- MIPCA(X = df2[,c("A", "B", "C")], ncp = pcaDim$ncp, scale = TRUE, nboot = 2500)

## Estimate the component for each imputed dataset and save it alongside the other model data
dat_list<- lapply(1:2500, function(i){
  out <- df2
  out$comp1 <- princomp(scale(mi$res.MI[[i]]), scores=TRUE)$scores[,1]
  out
})

## estimate the model for all the different values of `comp1`. 
mods <- lapply(dat_list, \(d)lm(y ~ z + comp1, data=d))

## summarise models
summary(MIcombine(mods))
#> Multiple imputation results:
#>       MIcombine.default(mods)
#>               results         se    (lower     upper) missInfo
#> (Intercept)  3.509051 4.04150890 -4.412161 11.4302632      0 %
#> z            0.933240 0.09788341  0.741392  1.1250880      0 %
#> comp1       -1.180513 0.36088797 -1.887840 -0.4731855      0 %

^{Created on 2023-06-06 with reprex v2.0.2}