I'm performing a multiple imputation PCA on a dataset that has several missing values in one variable, and I want to extract the first principal component to use in another model, but I can't figure out how to extract it from the results.
# packages
library(tidyverse)
library(missMDA)
# dataset
df <- structure(list(
A = c(39.64, 39.23, 38.89, 38.63, 38.44, 38.32, 38.27, 38.3, 38.4, 38.56, 38.79, 39.06, 39.36,
39.68, 40.01, 40.34, 40.68, 41.05, 41.46, 41.93, 42.48, 43.14, 43.92, 44.84, 45.9, 47.1, 48.4,
49.78, 51.2, 52.62, 54.01, 55.31, 56.52, 57.6, 58.54, 59.33, 59.98, 60.46, 60.78, 60.94, 60.92,
60.71, 60.3, 59.69, 58.87, 57.86, 56.67, 55.33, 53.87, 52.33, 50.75),
B = c(3.29, 3.29, 3.3, 3.32, 3.33, 3.35, 3.36, 3.38, 3.4, 3.42, 3.44, 3.46, 3.48, 3.5, 3.52, 3.53,
3.54, 3.55, 3.55, 3.54, 3.53, 3.51, 3.48, 3.44, 3.4, 3.35, 3.3, 3.24, 3.18, 3.12, 3.07, 3.02, 2.98,
2.96, 2.94, 2.94, 2.95, 2.98, 3.02, 3.08, 3.14, 3.22, 3.3, 3.4, 3.49, 3.59, 3.7, 3.8, 3.91, 4.02,
4.12),
C = c(NA, NA, NA, NA, NA, NA, NA, NA, 12.23, 4.3, 3.44, 3.49, 5.59, 0.76, 4.2, 4.61, 4.6, 8.03,
4.26, 3.31, 4.16, 0.37, -2.17, -1.93, -3.7, -0.48, -4.63, -2.89, -4.29, -2.35, -2.27, -1.66, -2.82,
-3.2, -2.55, -0.9, -2.42, -0.84, -1.46, -5.39, -2.57, -2.12, -0.59, 1.75, 3.7, 0.99, -3.28, -3.2,
-0.6, -0.61, -1.94)), class = "data.frame", row.names = c(NA, -51L))
# estimate number of dimensions
pcaDim <- estim_ncpPCA(X = df, method.cv = "Kfold")
# multiple imputation PCA
mi <- MIPCA(X = df, ncp = pcaDim$ncp, scale = TRUE, nboot = 999)
# dataframes that I can extract from mi
# 1. returns the values of one imputed dataset using imputePCA
mi$res.imputePCA
# 2. returns the values of each of the 999 imputed datasets
mi$res.MI
# 3. the kitchen sink - there's so much here and it's really unclear what it all is, but maybe the principal components are here somewhere??
mi$call
I would think if you wanted to have a single principal component from the imputed data, you would probably want to create the component for each imputed dataset, then estimate the model using each imputed component then pool the models together using Rubin's rules. Here's an example with some data I made up that is similar to your data.
library(mitools)
library(tidyverse)
library(missMDA)
## Make Data
set.seed(1234)
X <- MASS::mvrnorm(51, c(40, 3, 12), matrix(c(1,.8, .8, .8 , 1, .85, .8, .85, 1), ncol=3))
df <- as.data.frame(X)
names(df) <- c("A", "B", "C")
df$z <- df$A + rnorm(51, 0, 5)
pc1 <- princomp(scale(df[,c("A", "B", "C")]), scores = TRUE)
df$comp1 <- pc1$scores[,1]
df$y <- with(df, z - comp1 + rnorm(51, 0, 4))
## induce missingness on C
df2 <- df
df2$C[1:8] <- NA
## estimate dimensionality
pcaDim <- estim_ncpPCA(X = df2[,c("A", "B", "C")], method.cv = "Kfold")
# multiple imputation PCA
mi <- MIPCA(X = df2[,c("A", "B", "C")], ncp = pcaDim$ncp, scale = TRUE, nboot = 2500)
## Estimate the component for each imputed dataset and save it alongside the other model data
dat_list<- lapply(1:2500, function(i){
out <- df2
out$comp1 <- princomp(scale(mi$res.MI[[i]]), scores=TRUE)$scores[,1]
out
})
## estimate the model for all the different values of `comp1`.
mods <- lapply(dat_list, \(d)lm(y ~ z + comp1, data=d))
## summarise models
summary(MIcombine(mods))
#> Multiple imputation results:
#> MIcombine.default(mods)
#> results se (lower upper) missInfo
#> (Intercept) 3.509051 4.04150890 -4.412161 11.4302632 0 %
#> z 0.933240 0.09788341 0.741392 1.1250880 0 %
#> comp1 -1.180513 0.36088797 -1.887840 -0.4731855 0 %
Created on 2023-06-06 with reprex v2.0.2