Search code examples
rdplyrtibble

Use dynamically generated column names in dplyr


I have a data frame with multiple columns, the user provides a vector with the column names, and I want to count maximum amount of times an element appears

set.seed(42)
df <- tibble(
  var1 = sample(c(1:3),10,replace=T),
  var2 = sample(c(1:3),10,replace=T),
  var3 = sample(c(1:3),10,replace=T)
)
select_vars <- c("var1", "var3")

df %>% 
    rowwise() %>% 
    mutate(consensus=max(table(unlist(c(var1,var3)))))

# A tibble: 10 x 4
# Rowwise: 
    var1  var2  var3 consensus
   <int> <int> <int>     <int>
 1     1     1     1         2
 2     1     1     3         1
 3     1     2     1         2
 4     1     2     1         2
 5     2     2     2         2
 6     2     3     3         1
 7     2     3     2         2
 8     1     1     1         2
 9     3     1     2         1
10     3     3     2         1

This does exactly what I want, but when I try to use a vector of variables i cant get it to work

df %>% 
  rowwise() %>% 
  mutate(consensus=max(unlist(table(select_vars)) )))

Solution

  • In the OP's code, we need select

    library(dplyr)
    df %>% 
      rowwise() %>% 
      mutate(consensus=max(table(unlist(select(cur_data(), select_vars))) ))
    

    -output

    # A tibble: 10 x 4
    # Rowwise: 
        var1  var2  var3 consensus
       <int> <int> <int>     <int>
     1     1     1     1         2
     2     1     1     3         1
     3     1     2     1         2
     4     1     2     1         2
     5     2     2     2         2
     6     2     3     3         1
     7     2     3     2         2
     8     1     1     1         2
     9     3     1     2         1
    10     3     3     2         1
    

    Or just subset from cur_data() which would only return the data keeping the group attributes

    df %>%
         rowwise %>% 
         mutate(consensus = max(table(unlist(cur_data()[select_vars]))))
    # A tibble: 10 x 4
    # Rowwise: 
        var1  var2  var3 consensus
       <int> <int> <int>     <int>
     1     1     1     1         2
     2     1     1     3         1
     3     1     2     1         2
     4     1     2     1         2
     5     2     2     2         2
     6     2     3     3         1
     7     2     3     2         2
     8     1     1     1         2
     9     3     1     2         1
    10     3     3     2         1
    

    Or using pmap

    library(purrr)
    df %>%
         mutate(consensus = pmap_dbl(cur_data()[select_vars], ~ max(table(c(...)))))
    # A tibble: 10 x 4
        var1  var2  var3 consensus
       <int> <int> <int>     <dbl>
     1     1     1     1         2
     2     1     1     3         1
     3     1     2     1         2
     4     1     2     1         2
     5     2     2     2         2
     6     2     3     3         1
     7     2     3     2         2
     8     1     1     1         2
     9     3     1     2         1
    10     3     3     2         1
    

    As these are rowwise operations, can get some efficiency if we use collapse functions

    library(collapse)
    tfm(df, consensus = dapply(slt(df, select_vars), MARGIN = 1,
           FUN = function(x) fmax(tabulate(x))))
    # A tibble: 10 x 4
        var1  var2  var3 consensus
     * <int> <int> <int>     <int>
     1     1     1     1         2
     2     1     1     3         1
     3     1     2     1         2
     4     1     2     1         2
     5     2     2     2         2
     6     2     3     3         1
     7     2     3     2         2
     8     1     1     1         2
     9     3     1     2         1
    10     3     3     2         1
    

    Benchmarks

    As noted above, collapse is faster (run on a slightly bigger dataset)

    df1 <- df[rep(seq_len(nrow(df)), 1e5), ]
    
    system.time({
    tfm(df1, consensus = dapply(slt(df1, select_vars), MARGIN = 1,
           FUN = function(x) fmax(tabulate(x))))
    
    })
    #user  system elapsed 
    #  5.257   0.123   5.323 
    
    system.time({
    df1 %>%
         mutate(consensus = pmap_dbl(cur_data()[select_vars], ~ max(table(c(...)))))
    
    })
    #user  system elapsed 
    # 54.813   0.517  55.246 
    

    The rowwise operation is taking too much time, so stopped the execution

    df1 %>% 
       rowwise() %>% 
       mutate(consensus=max(table(unlist(select(cur_data(), select_vars))) ))
     })
    Timing stopped at: 575.5 3.342 581.3