The following code generates data:
pacman::p_load(tidyverse, expss)
# Set the seed for reproducibility
set.seed(123)
# Generate random data
n <- 490
PTSD <- sample(c(1, 2, NA), n, replace = TRUE) #class(PTSD) = "numeric"
ANX <- sample(c(1, 2, NA), n, replace = TRUE) #class(ANX) = "numeric"
DEP <- sample(c(1, 2, NA), n, replace = TRUE) #class(DEP) = "numeric"
# Create the data frame
df <- data.frame(PTSD, ANX, DEP) #class(df) = "data.frame"
# Label the values: 1 = Low, 2 = High
expss::val_lab(df$PTSD) = expss::num_lab("1 Low
2 High")
expss::val_lab(df$ANX) = expss::num_lab("1 Low
2 High")
expss::val_lab(df$DEP) = expss::num_lab("1 Low
2 High")
# Create a list of tables for each variable to count 1s, 2s, and NAs
count_results <- list(
PTSD = table(df$PTSD, useNA = "ifany"),
ANX = table(df$ANX, useNA = "ifany"),
DEP = table(df$DEP, useNA = "ifany")
)
This portion of the code does some frequency count and summarises data:
# Combine the count tables into a single table
count_table <- do.call(rbind, count_results)
# Initialize empty vectors to store results
variable_names <- character()
sample_sizes <- numeric()
# Loop through the test results and extract relevant information
for (variable_name in names(count_results)) {
sample_sizes <- c(sample_sizes, sum(count_results[[variable_name]]))
variable_names <- c(variable_names, variable_name)
}
# Create summary data frame
summary_df <- data.frame(
Variable = variable_names,
N = sample_sizes
)
# Combine the count table and chi-squared summary table by columns
final_result <- cbind(count_table, summary_df)
# Remove Variable column in the middle of the table
final_result <- subset(final_result, select = -c(Variable))
This portion of the code does what I call "combination analysis" (it is based on the accepted answer of this SO thread):
library(dplyr)
out <- df %>%
mutate(id = row_number())%>%
tidyr::pivot_longer(PTSD:DEP) %>%
filter(value == 2)%>%
summarise(combination = toString(name),.by=id) %>%
summarise(n = n(), .by = combination)
Printing the frequency count and summary generates this:
> print(final_result)
Low High NA N
PTSD 164 167 159 490
ANX 157 156 177 490
DEP 168 156 166 490
And printing the frequency count and summary generates this:
# A tibble: 7 × 2
combination n
<chr> <int>
1 ANX 72
2 ANX, DEP 28
3 PTSD 82
4 DEP 76
5 PTSD, DEP 29
6 PTSD, ANX 33
7 PTSD, ANX, DEP 23
What I am really interested in is the "High" frequencies and their combinations (i.e., PTSD == 2, ANX == 2, and DEP == 2).
Therefore, I expected that PTSD High, ANX High, and DEP High be the same between both tables, but this is not the case!
In order to check what the second table (i.e., tibble table) should have shown, I exported the df
to a CSV file and imported it in a spreadsheet.
I used the COUNTIFS
function (with the following syntax COUNTIFS(criteria_range1, criteria1, [criteria_range2, criteria2]…)
) and I got the following table:
Combination n
--------------------
PTSD 167
ANX 156
DEP 156
PTSD + ANX 56
PTSD + DEP 52
ANX + DEP 51
PTSD + ANX + DEP 23
My question:
The following is a reproduction of what you must have done in excel with countif
library(tidyverse)
library(rlang)
t3 <- c("PTSD","ANX","DEP")
(combs <- map(seq_along(t3),\(n)combn(t3,n,simplify = FALSE)) |> flatten())
(filts <- parse_exprs(map_chr(combs,\(x)paste0(x ,'== 2',collapse=' & '))))
(filtsnames <- parse_exprs(map_chr(combs,\(x)paste0(x ,collapse=' + '))))
names(filts) <- filtsnames
(out2 <- map_int(filts,\(x){
df %>%
mutate(id = row_number())%>%
filter(!!(x))%>%
summarise(
n = n())
} |> pull(n)
))
enframe(out2)