I have data.frame
df1
:
df1 <- data.frame(apple = c('0', '0', '0', '1', '0', '1', '0', '0', '0', '1'),
banana = c('1', '0', '0', '0', '1', '0', '1', '0', '0', '0'),
cherry = c('0', '1', '0', '0', '0', '0', '0', '1', '0', '0'),
date = c('0', '0', '1', '0', '0', '0', '0', '0', '1', '0'))
rownames(df1) <- c('one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten')
How can I substitute all 'ones' in each column with the column name and then collapse the column into one new column? Expected result is df2
:
df2 <- data.frame(fruit = c('banana', 'cherry', 'date', 'apple', 'banana',
'apple', 'banana', 'cherry', 'date', 'apple'))
rownames(df2) <- c('one', 'two', 'three', 'four', 'five',
'six', 'seven', 'eight', 'nine', 'ten')
I found the second step here (r collapsing data from multiple columns into one) but I don't quite get there yet.
Using max.col
to subset the names
, create a data.frame
out of it with corresponding rownames
.
> (res <- data.frame(fruit=names(df1)[max.col(df1)]) |> `rownames<-`(rownames(df1)))
fruit
one banana
two cherry
three date
four apple
five banana
six apple
seven banana
eight cherry
nine date
ten apple
> all.equal(df2, res)
[1] TRUE
Seems user2974951's solution (denoted as arr.ind) is fastest (despite some strange outliers).
> arr.ind1 <- \(df1) {
+ df2 <- as.data.frame(which(df1 == 1, arr.ind=TRUE))
+ df2 <- df2[order(df2$row), ]
+ df2$fruit <- colnames(df1)[df2$col]
+ df2[, 3, drop=FALSE]
+ }
> arr.ind2 <- \(df1) {
+ df2 <- as.data.frame(which(df1 == 1, arr.ind=TRUE))
+ df2 <- df2[order(df2$row), ]
+ df2$fruit <- colnames(df1)[df2$col]
+ subset(df2, select=fruit)
+ }
> library(dplyr) |> suppressPackageStartupMessages()
> library(tidyr) |> suppressPackageStartupMessages()
> set.seed(42)
> DF1 <- df1[sample.int(nrow(df1), 1e4, replace=TRUE), ]
> options(width=200)
> (mb <- microbenchmark::microbenchmark(
+ arr.ind1=arr.ind1(DF1),
+ arr.ind2=arr.ind2(DF1),
+ max.col=data.frame(fruit=names(DF1)[max.col(DF1)]) |> `rownames<-`(rownames(DF1)),
+ max.col2=data.frame(fruit=names(DF1)[max.col(DF1)], row.names=rownames(DF1)),
+ max.col3=setNames(as.data.frame(names(DF1)[max.col(DF1)], row.names=rownames(DF1)), 'fruit'),
+ max.col4=setNames(as.data.frame(names(DF1)[max.col(DF1)]), 'fruit') |> `rownames<-`(rownames(DF1)),
+ tidy=DF1 %>%
+ transmute(across(apple:date, ~case_when(. == 1 ~ cur_column()), .names = 'new_{col}')) %>%
+ unite(fruit, starts_with('new'), na.rm = TRUE, sep = ' '),
+ check='equivalent'))
$ Rscript --vanilla foo.R
Unit: milliseconds
expr min lq mean median uq max neval cld
arr.ind1 2.838674 2.891762 3.122499 2.944602 3.135042 11.327219 100 a
arr.ind2 3.360616 3.417063 3.731845 3.471011 3.681735 13.014652 100 ab
max.col 4.823824 4.911244 5.049284 5.002576 5.132965 5.571004 100 bc
max.col2 5.209733 5.407884 5.542997 5.453691 5.604758 6.110364 100 c
max.col3 4.502090 4.573133 4.668055 4.610790 4.696504 5.486427 100 bc
max.col4 4.766816 4.832869 5.235595 4.887230 5.033407 32.472803 100 bc
tidy 66.249594 69.793587 73.463189 71.997308 73.727147 145.391572 100 d
> x11(); microbenchmark:::boxplot.microbenchmark(subset(mb, expr != 'tidy') |> transform(expr=droplevels(expr)))
Data:
> dput(df1)
structure(list(apple = c("0", "0", "0", "1", "0", "1", "0", "0",
"0", "1"), banana = c("1", "0", "0", "0", "1", "0", "1", "0",
"0", "0"), cherry = c("0", "1", "0", "0", "0", "0", "0", "1",
"0", "0"), date = c("0", "0", "1", "0", "0", "0", "0", "0", "1",
"0")), class = "data.frame", row.names = c("one", "two", "three",
"four", "five", "six", "seven", "eight", "nine", "ten"))