The following code generates data (source):
# Set the seed for reproducibility
set.seed(123)
# Generate random data
n <- 490
PTSD <- sample(c(1, 2, NA), n, replace = TRUE) #class(PTSD) = "numeric"
ANX <- sample(c(1, 2, NA), n, replace = TRUE) #class(ANX) = "numeric"
DEP <- sample(c(1, 2, NA), n, replace = TRUE) #class(DEP) = "numeric"
# Create the data frame
df <- data.frame(PTSD, ANX, DEP) #class(df) = "data.frame"
# Label the values: 1 = Low, 2 = High
expss::val_lab(df$PTSD) = expss::num_lab("1 Low
2 High")
expss::val_lab(df$ANX) = expss::num_lab("1 Low
2 High")
expss::val_lab(df$DEP) = expss::num_lab("1 Low
2 High")
# Create a list of tables for each variable to count 1s, 2s, and NAs
count_results <- list(
PTSD = table(df$PTSD, useNA = "ifany"),
ANX = table(df$ANX, useNA = "ifany"),
DEP = table(df$DEP, useNA = "ifany")
)
This portion of the code does some frequency count and summarises data:
# Combine the count tables into a single table
count_table <- do.call(rbind, count_results)
# Initialize empty vectors to store results
variable_names <- character()
sample_sizes <- numeric()
# Loop through the test results and extract relevant information
for (variable_name in names(count_results)) {
sample_sizes <- c(sample_sizes, sum(count_results[[variable_name]]))
variable_names <- c(variable_names, variable_name)
}
# Create summary data frame
summary_df <- data.frame(
Variable = variable_names,
N = sample_sizes
)
# Combine the count table and chi-squared summary table by columns
final_result <- cbind(count_table, summary_df)
# Remove Variable column in the middle of the table
final_result <- subset(final_result, select = -c(Variable))
This portion of the code does what I call "combination analysis" (it is one of the answers of the above mentioned SO thread):
t3 <- c("PTSD","ANX","DEP")
combs <- map(seq_along(t3),\(n)combn(t3,n,simplify = FALSE)) |> flatten()
filts <- parse_exprs(map_chr(combs,\(x)paste0(x ,'== 2',collapse=' & ')))
filtsnames <- parse_exprs(map_chr(combs,\(x)paste0(x ,collapse=' + ')))
names(filts) <- filtsnames
out2 <- map_int(filts,\(x){
df %>%
mutate(id = row_number())%>%
filter(!!(x))%>%
summarise(
n = n())
} |> pull(n)
)
enframe(out2)
The last command generates this (which is what was requested by the author of the question):
# A tibble: 7 × 2
name value
<chr> <int>
1 PTSD 167
2 ANX 156
3 DEP 156
4 PTSD + ANX 56
5 PTSD + DEP 52
6 ANX + DEP 51
7 PTSD + ANX + DEP 23
However, when looking into it, the number of combinations is higher than that, namely (with corrected table generated in MS Excel, function COUNTIFS(criteria_range1, criteria1, ...)
:
Combination n
---------------------------------
PTSD High 167
PTSD High, ANX Low 61
PTSD High, DEP Low 58
PTSD High, ANX High 56
PTSD High, DEP High 52
PTSD High, ANX Low, DEP Low 24
PTSD High, ANX High, DEP Low 14
PTSD High, ANX Low, DEP High 16
PTSD High, ANX High, DEP High 23
ANX High 156
ANX High, PTSD Low 50
ANX High, DEP Low 46
ANX High, DEP High 51
ANX High, PTSD Low, DEP Low 19
ANX High, PTSD Low, DEP High 14
DEP High 156
DEP High, PTSD Low 57
DEP High, ANX Low 52
DEP High, PTSD Low, ANX Low 17
My question:
If you use model.matrix
with interaction effects you can get the combinations you want from df
:
First we set the missing in df
to zero and set the columns to be factors:
> df[is.na(df)] <- 0
> df <- lapply(df, factor) |> as.data.frame()
Then find the model matrix, sum the columns:
> all_combs <- model.matrix(~PTSD*ANX*DEP , df) |> colSums()
Finally I think you only keep the elements with at least one 'high' value (check this is really what you want to do):
> all_combs[grepl("High",names(all_combs))] |> cbind()
[,1]
PTSDHigh 167
ANXHigh 156
DEPHigh 156
PTSDHigh:ANXLow 61
PTSDLow:ANXHigh 50
PTSDHigh:ANXHigh 56
PTSDHigh:DEPLow 58
PTSDLow:DEPHigh 57
PTSDHigh:DEPHigh 52
ANXHigh:DEPLow 46
ANXLow:DEPHigh 52
ANXHigh:DEPHigh 51
PTSDHigh:ANXLow:DEPLow 24
PTSDLow:ANXHigh:DEPLow 19
PTSDHigh:ANXHigh:DEPLow 14
PTSDLow:ANXLow:DEPHigh 17
PTSDHigh:ANXLow:DEPHigh 16
PTSDLow:ANXHigh:DEPHigh 14
PTSDHigh:ANXHigh:DEPHigh 23