Use dynamically generated column names in dplyr

I have a data frame with multiple columns, the user provides a vector with the column names, and I want to count maximum amount of times an element appears

set.seed(42)
df <- tibble(
  var1 = sample(c(1:3),10,replace=T),
  var2 = sample(c(1:3),10,replace=T),
  var3 = sample(c(1:3),10,replace=T)
)
select_vars <- c("var1", "var3")

df %>% 
    rowwise() %>% 
    mutate(consensus=max(table(unlist(c(var1,var3)))))

# A tibble: 10 x 4
# Rowwise: 
    var1  var2  var3 consensus
   <int> <int> <int>     <int>
 1     1     1     1         2
 2     1     1     3         1
 3     1     2     1         2
 4     1     2     1         2
 5     2     2     2         2
 6     2     3     3         1
 7     2     3     2         2
 8     1     1     1         2
 9     3     1     2         1
10     3     3     2         1

This does exactly what I want, but when I try to use a vector of variables i cant get it to work

df %>% 
  rowwise() %>% 
  mutate(consensus=max(unlist(table(select_vars)) )))

Solution

In the OP's code, we need select

library(dplyr)
df %>% 
  rowwise() %>% 
  mutate(consensus=max(table(unlist(select(cur_data(), select_vars))) ))

-output

# A tibble: 10 x 4
# Rowwise: 
    var1  var2  var3 consensus
   <int> <int> <int>     <int>
 1     1     1     1         2
 2     1     1     3         1
 3     1     2     1         2
 4     1     2     1         2
 5     2     2     2         2
 6     2     3     3         1
 7     2     3     2         2
 8     1     1     1         2
 9     3     1     2         1
10     3     3     2         1

Or just subset from cur_data() which would only return the data keeping the group attributes

df %>%
     rowwise %>% 
     mutate(consensus = max(table(unlist(cur_data()[select_vars]))))
# A tibble: 10 x 4
# Rowwise: 
    var1  var2  var3 consensus
   <int> <int> <int>     <int>
 1     1     1     1         2
 2     1     1     3         1
 3     1     2     1         2
 4     1     2     1         2
 5     2     2     2         2
 6     2     3     3         1
 7     2     3     2         2
 8     1     1     1         2
 9     3     1     2         1
10     3     3     2         1

Or using pmap

library(purrr)
df %>%
     mutate(consensus = pmap_dbl(cur_data()[select_vars], ~ max(table(c(...)))))
# A tibble: 10 x 4
    var1  var2  var3 consensus
   <int> <int> <int>     <dbl>
 1     1     1     1         2
 2     1     1     3         1
 3     1     2     1         2
 4     1     2     1         2
 5     2     2     2         2
 6     2     3     3         1
 7     2     3     2         2
 8     1     1     1         2
 9     3     1     2         1
10     3     3     2         1

As these are rowwise operations, can get some efficiency if we use collapse functions

library(collapse)
tfm(df, consensus = dapply(slt(df, select_vars), MARGIN = 1,
       FUN = function(x) fmax(tabulate(x))))
# A tibble: 10 x 4
    var1  var2  var3 consensus
 * <int> <int> <int>     <int>
 1     1     1     1         2
 2     1     1     3         1
 3     1     2     1         2
 4     1     2     1         2
 5     2     2     2         2
 6     2     3     3         1
 7     2     3     2         2
 8     1     1     1         2
 9     3     1     2         1
10     3     3     2         1

Benchmarks

As noted above, collapse is faster (run on a slightly bigger dataset)

df1 <- df[rep(seq_len(nrow(df)), 1e5), ]

system.time({
tfm(df1, consensus = dapply(slt(df1, select_vars), MARGIN = 1,
       FUN = function(x) fmax(tabulate(x))))

})
#user  system elapsed 
#  5.257   0.123   5.323 

system.time({
df1 %>%
     mutate(consensus = pmap_dbl(cur_data()[select_vars], ~ max(table(c(...)))))

})
#user  system elapsed 
# 54.813   0.517  55.246

The rowwise operation is taking too much time, so stopped the execution

df1 %>% 
   rowwise() %>% 
   mutate(consensus=max(table(unlist(select(cur_data(), select_vars))) ))
 })
Timing stopped at: 575.5 3.342 581.3