I have a data frame with multiple columns, the user provides a vector with the column names, and I want to count maximum amount of times an element appears
set.seed(42)
df <- tibble(
var1 = sample(c(1:3),10,replace=T),
var2 = sample(c(1:3),10,replace=T),
var3 = sample(c(1:3),10,replace=T)
)
select_vars <- c("var1", "var3")
df %>%
rowwise() %>%
mutate(consensus=max(table(unlist(c(var1,var3)))))
# A tibble: 10 x 4
# Rowwise:
var1 var2 var3 consensus
<int> <int> <int> <int>
1 1 1 1 2
2 1 1 3 1
3 1 2 1 2
4 1 2 1 2
5 2 2 2 2
6 2 3 3 1
7 2 3 2 2
8 1 1 1 2
9 3 1 2 1
10 3 3 2 1
This does exactly what I want, but when I try to use a vector of variables i cant get it to work
df %>%
rowwise() %>%
mutate(consensus=max(unlist(table(select_vars)) )))
In the OP's code, we need select
library(dplyr)
df %>%
rowwise() %>%
mutate(consensus=max(table(unlist(select(cur_data(), select_vars))) ))
-output
# A tibble: 10 x 4
# Rowwise:
var1 var2 var3 consensus
<int> <int> <int> <int>
1 1 1 1 2
2 1 1 3 1
3 1 2 1 2
4 1 2 1 2
5 2 2 2 2
6 2 3 3 1
7 2 3 2 2
8 1 1 1 2
9 3 1 2 1
10 3 3 2 1
Or just subset from cur_data()
which would only return the data keeping the group attributes
df %>%
rowwise %>%
mutate(consensus = max(table(unlist(cur_data()[select_vars]))))
# A tibble: 10 x 4
# Rowwise:
var1 var2 var3 consensus
<int> <int> <int> <int>
1 1 1 1 2
2 1 1 3 1
3 1 2 1 2
4 1 2 1 2
5 2 2 2 2
6 2 3 3 1
7 2 3 2 2
8 1 1 1 2
9 3 1 2 1
10 3 3 2 1
Or using pmap
library(purrr)
df %>%
mutate(consensus = pmap_dbl(cur_data()[select_vars], ~ max(table(c(...)))))
# A tibble: 10 x 4
var1 var2 var3 consensus
<int> <int> <int> <dbl>
1 1 1 1 2
2 1 1 3 1
3 1 2 1 2
4 1 2 1 2
5 2 2 2 2
6 2 3 3 1
7 2 3 2 2
8 1 1 1 2
9 3 1 2 1
10 3 3 2 1
As these are rowwise operations, can get some efficiency if we use collapse
functions
library(collapse)
tfm(df, consensus = dapply(slt(df, select_vars), MARGIN = 1,
FUN = function(x) fmax(tabulate(x))))
# A tibble: 10 x 4
var1 var2 var3 consensus
* <int> <int> <int> <int>
1 1 1 1 2
2 1 1 3 1
3 1 2 1 2
4 1 2 1 2
5 2 2 2 2
6 2 3 3 1
7 2 3 2 2
8 1 1 1 2
9 3 1 2 1
10 3 3 2 1
As noted above, collapse
is faster (run on a slightly bigger dataset)
df1 <- df[rep(seq_len(nrow(df)), 1e5), ]
system.time({
tfm(df1, consensus = dapply(slt(df1, select_vars), MARGIN = 1,
FUN = function(x) fmax(tabulate(x))))
})
#user system elapsed
# 5.257 0.123 5.323
system.time({
df1 %>%
mutate(consensus = pmap_dbl(cur_data()[select_vars], ~ max(table(c(...)))))
})
#user system elapsed
# 54.813 0.517 55.246
The rowwise
operation is taking too much time, so stopped the execution
df1 %>%
rowwise() %>%
mutate(consensus=max(table(unlist(select(cur_data(), select_vars))) ))
})
Timing stopped at: 575.5 3.342 581.3