r dataframe function merge multiple-columns

Collapse multiple columns into one column based on values in R

I have data.frame df1:

 df1 <- data.frame(apple = c('0', '0', '0', '1', '0', '1', '0', '0', '0', '1'),
      banana = c('1', '0', '0', '0', '1', '0', '1', '0', '0', '0'),
      cherry = c('0', '1', '0', '0', '0', '0', '0', '1', '0', '0'),
      date = c('0', '0', '1', '0', '0', '0', '0', '0', '1', '0'))
 rownames(df1) <- c('one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten')

How can I substitute all 'ones' in each column with the column name and then collapse the column into one new column? Expected result is df2:

 df2 <- data.frame(fruit = c('banana', 'cherry', 'date', 'apple', 'banana', 
      'apple', 'banana', 'cherry', 'date', 'apple'))
 rownames(df2) <- c('one', 'two', 'three', 'four', 'five', 
      'six', 'seven', 'eight', 'nine', 'ten')

I found the second step here (r collapsing data from multiple columns into one) but I don't quite get there yet.

Solution

Using max.col to subset the names, create a data.frame out of it with corresponding rownames.

> (res <- data.frame(fruit=names(df1)[max.col(df1)]) |> `rownames<-`(rownames(df1)))
       fruit
one   banana
two   cherry
three   date
four   apple
five  banana
six    apple
seven banana
eight cherry
nine    date
ten    apple
> all.equal(df2, res)
[1] TRUE

Benchmark

Seems user2974951's solution (denoted as arr.ind) is fastest (despite some strange outliers).

> arr.ind1 <- \(df1) {
+   df2 <- as.data.frame(which(df1 == 1, arr.ind=TRUE))
+   df2 <- df2[order(df2$row), ]
+   df2$fruit <- colnames(df1)[df2$col]
+   df2[, 3, drop=FALSE]
+ }
> arr.ind2 <- \(df1) {
+   df2 <- as.data.frame(which(df1 == 1, arr.ind=TRUE))
+   df2 <- df2[order(df2$row), ]
+   df2$fruit <- colnames(df1)[df2$col]
+   subset(df2, select=fruit)
+ }
> library(dplyr) |> suppressPackageStartupMessages()
> library(tidyr) |> suppressPackageStartupMessages()
> set.seed(42)
> DF1 <- df1[sample.int(nrow(df1), 1e4, replace=TRUE), ]
> options(width=200)
> (mb <- microbenchmark::microbenchmark(
+   arr.ind1=arr.ind1(DF1),
+   arr.ind2=arr.ind2(DF1),
+   max.col=data.frame(fruit=names(DF1)[max.col(DF1)]) |> `rownames<-`(rownames(DF1)),
+   max.col2=data.frame(fruit=names(DF1)[max.col(DF1)], row.names=rownames(DF1)),
+   max.col3=setNames(as.data.frame(names(DF1)[max.col(DF1)], row.names=rownames(DF1)), 'fruit'),
+   max.col4=setNames(as.data.frame(names(DF1)[max.col(DF1)]), 'fruit') |> `rownames<-`(rownames(DF1)),
+   tidy=DF1 %>%
+     transmute(across(apple:date, ~case_when(. == 1 ~ cur_column()), .names = 'new_{col}')) %>% 
+     unite(fruit, starts_with('new'), na.rm = TRUE, sep = ' '),
+   check='equivalent'))

$ Rscript --vanilla foo.R
Unit: milliseconds
     expr       min        lq      mean    median        uq        max neval  cld
 arr.ind1  2.838674  2.891762  3.122499  2.944602  3.135042  11.327219   100 a   
 arr.ind2  3.360616  3.417063  3.731845  3.471011  3.681735  13.014652   100 ab  
  max.col  4.823824  4.911244  5.049284  5.002576  5.132965   5.571004   100  bc 
 max.col2  5.209733  5.407884  5.542997  5.453691  5.604758   6.110364   100   c 
 max.col3  4.502090  4.573133  4.668055  4.610790  4.696504   5.486427   100  bc 
 max.col4  4.766816  4.832869  5.235595  4.887230  5.033407  32.472803   100  bc 
     tidy 66.249594 69.793587 73.463189 71.997308 73.727147 145.391572   100    d


> x11(); microbenchmark:::boxplot.microbenchmark(subset(mb, expr != 'tidy') |> transform(expr=droplevels(expr)))

Data:

> dput(df1)
structure(list(apple = c("0", "0", "0", "1", "0", "1", "0", "0", 
"0", "1"), banana = c("1", "0", "0", "0", "1", "0", "1", "0", 
"0", "0"), cherry = c("0", "1", "0", "0", "0", "0", "0", "1", 
"0", "0"), date = c("0", "0", "1", "0", "0", "0", "0", "0", "1", 
"0")), class = "data.frame", row.names = c("one", "two", "three", 
"four", "five", "six", "seven", "eight", "nine", "ten"))