I am trying to create a new column in my data frame (NEW
) using the case_when
functionality in dplyr
. I am able to get the code below to run, but I am wondering if there is a way to create this new column based on the four columns that start with COL_
as opposed to how it is currently written, looking only at COL_1
. Otherwise, I would have to repeat each case four times (once for each COL_1
, COL_2
, COL_3
, and COL_4
).
library(dplyr)
set.seed(1)
# Make sample data
data <- data.frame(STRATUM_ID = c(rep("C19", 5), rep("C20", 15), rep("C21", 4)),
COL_1 = sample(c(rep("X", 3), rep("T", 2), rep("Y", 7), rep("Z", 5), rep("D", 5), rep("G", 2)), 24, replace = T),
COL_2 = sample(c(rep("T", 4), rep("G", 6), rep("Y", 3), rep("C", 2), rep("W", 6), rep("R", 3)), 24, replace = T),
COL_3 = sample(c(rep("G", 1), rep("F", 5), rep("D", 3), rep("Z", 7), rep("C", 3), rep("E", 5)), 24, replace = T),
COL_4 = sample(c(rep("E", 7), rep("G", 2), rep("Y", 7), rep("D", 5), rep("V", 1), rep("U", 2)), 24, replace = T))
# Create new column based on COL columns
data <- data %>% mutate(NEW = case_when(
STRATUM_ID == "C20" & COL_1 == "X" ~ "Class_A",
STRATUM_ID == "C20" & COL_1 %in% c("C", "D", "E") ~ "Class_B",
STRATUM_ID == "C20" & COL_1 %in% c("U", "V", "W", "Y") ~ "Class_C",
STRATUM_ID == "C20" & COL_1 == "T" ~ "Class_D",
STRATUM_ID == "C20" & COL_1 %in% c("G", "Z", "R") ~ "Class_E",
STRATUM_ID == "C20" & COL_1 == "F" ~ "Class_F",
STRATUM_ID == "C20" & is.na(data$COL_1) ~ "Unknown",
TRUE ~ STRATUM_ID
))
I have tried to use the following to address looking through all COL
columns:
data <- data %>% mutate(test = case_when(
STRATUM_ID == "C20" & grep("COL", colnames(data)) %in% c("C", "D", "E") ~ "CLASS_B"))
data <- data %>% mutate(test = case_when(
STRATUM_ID == "C20" & vars(starts_with("COL")) %in% c("C", "D", "E") ~ "CLASS_B"))
Forgive me as the dataset I'm working with is actually much, much larger and I've tried my best to simplify the issue here.
library(dplyr)
data %>% mutate(NEW = case_when(
STRATUM_ID == "C20" & if_any(starts_with("COL"), ~ . == "X") ~ "Class_A",
STRATUM_ID == "C20" & if_any(starts_with("COL"), ~ . %in% c("C", "D", "E")) ~ "Class_B",
STRATUM_ID == "C20" & if_any(starts_with("COL"), ~ . %in% c("U", "V", "W", "Y")) ~ "Class_C",
STRATUM_ID == "C20" & if_any(starts_with("COL"), ~ . == "T") ~ "Class_D",
STRATUM_ID == "C20" & if_any(starts_with("COL"), ~ . %in% c("G", "Z", "R")) ~ "Class_E",
STRATUM_ID == "C20" & if_any(starts_with("COL"), ~ . == "F") ~ "Class_F",
STRATUM_ID == "C20" & is.na(data$COL_1) ~ "Unknown",
TRUE ~ STRATUM_ID
))
I am not 100% sure what you would be repeating for each "COL" but based on your attempts this looks like what you are trying to do.