Search code examples
rtidyversedplyrtidyselect

R: adapt mutate call from handling three binary variables to n binary variables


I've got a dataframe with 3 binary variables that relate to time period 1 and three corresponding variables that relate to time 2.

df <- data.frame("user" = c("a","b","c","d","e"), "item_1_time_1" = c(1,0,0,0,NA), "item_2_time_1" = c(1,1,1,0,NA), "item_3_time_1" = c(0,0,1,0,0), "item_1_time_2" = c(1,0,0,0,NA), "item_2_time_2" = c(1,0,0,0,NA), "item_3_time_2" = c(0,0,1,0,1))

df

   user item_1_time_1 item_2_time_1 item_3_time_1 item_1_time_2 item_2_time_2 item_3_time_2
1    a             1             1             0             1             1             0
2    b             0             1             0             0             0             0
3    c             0             1             1             0             0             1
4    d             0             0             1             0             0             0
5    e            NA            NA             0            NA            NA             1

I would like to to know if an observation has a 1 for a given item during period 1 but not during period 2. Moreover, I would like to know if an observation has any instance in which an item is 1 during period 1 and not period 2.

So the ideal output would look like

df2 <- data.frame("user" = c("a","b","c","d","e"), "item_1_time_1" = c(1,0,0,0,NA), "item_2_time_1" = c(1,1,1,0,NA), "item_3_time_1" = c(0,0,1,1,0), "item_1_time_2" = c(1,0,0,0,NA), "item_2_time_2" = c(1,0,0,0,NA), "item_3_time_2" = c(0,0,1,0,1), "item_1_check" = c(1,1,1,1,1), "item_2_check" = c(1,0,0,1,1), "item_3_check" = c(1,1,1,0,1), item_check = c(1,0,0,0,1))

df2 

user item_1_time_1 item_2_time_1 item_3_time_1 item_1_time_2 item_2_time_2 item_3_time_2 item_1_check item_2_check item_3_check item_check
1    a             1             1             0             1             1             0            1            1            1          1
2    b             0             1             0             0             0             0            1            0            1          0
3    c             0             1             1             0             0             1            1            0            1          0
4    d             0             0             1             0             0             0            1            1            0          0
5    e            NA            NA             0            NA            NA             1            1            1            1          1

So far I've tried

library(tidyverse)
df2 <- df %>%
   mutate(across(ends_with('time_2'), replace_na, 0)) %>% 
   mutate(across(ends_with('time_1'), replace_na, 0)) %>% 
   mutate(item_1_check = if_else(item_1_time_1 == 1 & item_1_time_2 == 0, 0, 1),
          item_2_check = if_else(item_2_time_1 == 1 & item_2_time_2 == 0, 0, 1),
          item_3_check = if_else(item_3_time_1 == 1 & item_3_time_2 == 0, 0, 1)) %>% 
   mutate(item_check = pmin(item_1_check, item_2_check, item_3_check))

I would like to generalize the above mutate calls so that they can handle n many items rather than just 3. Is there a way that I can use ends_with('check') for the final mutate? The variable names don't vary but for the item number and time period.


Solution

  • An option would be to reshape to 'long' format and do this once

    library(dplyr)
    library(tidyr)
    df %>% 
      pivot_longer(cols = -user, names_to = c('group', '.value'), 
             names_sep="_(?=time)") %>% 
      mutate(across(starts_with('time'), replace_na, 0)) %>% 
      group_by(group) %>% 
      transmute(user, check = !(time_1 & !time_2)) %>% 
      ungroup %>% 
      group_by(user) %>%
      summarise(check = min(check), .groups = 'drop') %>% 
      right_join(df, .) %>%
      select(names(df), check)
    # user item_1_time_1 item_2_time_1 item_3_time_1 item_1_time_2 item_2_time_2 item_3_time_2 check
    #1    a             1             1             0             1             1             0     1
    #2    b             0             1             0             0             0             0     0
    #3    c             0             1             1             0             0             1     0
    #4    d             0             0             0             0             0             0     1
    #5    e            NA            NA             0            NA            NA             1     1
    

    Or using base R

    df$check <-  +( Reduce(`&`, lapply(split.default(replace(df[-1], 
     is.na(df[-1]), 0), sub("time_\\d+", "", names(df)[-1])), 
        function(x)  !(x[[1]] & !x[[2]]))))