Search code examples
rdataframefunctionmergemultiple-columns

Collapse multiple columns into one column based on values in R


I have data.frame df1:

 df1 <- data.frame(apple = c('0', '0', '0', '1', '0', '1', '0', '0', '0', '1'),
      banana = c('1', '0', '0', '0', '1', '0', '1', '0', '0', '0'),
      cherry = c('0', '1', '0', '0', '0', '0', '0', '1', '0', '0'),
      date = c('0', '0', '1', '0', '0', '0', '0', '0', '1', '0'))
 rownames(df1) <- c('one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten')

How can I substitute all 'ones' in each column with the column name and then collapse the column into one new column? Expected result is df2:

 df2 <- data.frame(fruit = c('banana', 'cherry', 'date', 'apple', 'banana', 
      'apple', 'banana', 'cherry', 'date', 'apple'))
 rownames(df2) <- c('one', 'two', 'three', 'four', 'five', 
      'six', 'seven', 'eight', 'nine', 'ten')

I found the second step here (r collapsing data from multiple columns into one) but I don't quite get there yet.


Solution

  • Using max.col to subset the names, create a data.frame out of it with corresponding rownames.

    > (res <- data.frame(fruit=names(df1)[max.col(df1)]) |> `rownames<-`(rownames(df1)))
           fruit
    one   banana
    two   cherry
    three   date
    four   apple
    five  banana
    six    apple
    seven banana
    eight cherry
    nine    date
    ten    apple
    > all.equal(df2, res)
    [1] TRUE
    

    Benchmark

    Seems user2974951's solution (denoted as arr.ind) is fastest (despite some strange outliers).

    > arr.ind1 <- \(df1) {
    +   df2 <- as.data.frame(which(df1 == 1, arr.ind=TRUE))
    +   df2 <- df2[order(df2$row), ]
    +   df2$fruit <- colnames(df1)[df2$col]
    +   df2[, 3, drop=FALSE]
    + }
    > arr.ind2 <- \(df1) {
    +   df2 <- as.data.frame(which(df1 == 1, arr.ind=TRUE))
    +   df2 <- df2[order(df2$row), ]
    +   df2$fruit <- colnames(df1)[df2$col]
    +   subset(df2, select=fruit)
    + }
    > library(dplyr) |> suppressPackageStartupMessages()
    > library(tidyr) |> suppressPackageStartupMessages()
    > set.seed(42)
    > DF1 <- df1[sample.int(nrow(df1), 1e4, replace=TRUE), ]
    > options(width=200)
    > (mb <- microbenchmark::microbenchmark(
    +   arr.ind1=arr.ind1(DF1),
    +   arr.ind2=arr.ind2(DF1),
    +   max.col=data.frame(fruit=names(DF1)[max.col(DF1)]) |> `rownames<-`(rownames(DF1)),
    +   max.col2=data.frame(fruit=names(DF1)[max.col(DF1)], row.names=rownames(DF1)),
    +   max.col3=setNames(as.data.frame(names(DF1)[max.col(DF1)], row.names=rownames(DF1)), 'fruit'),
    +   max.col4=setNames(as.data.frame(names(DF1)[max.col(DF1)]), 'fruit') |> `rownames<-`(rownames(DF1)),
    +   tidy=DF1 %>%
    +     transmute(across(apple:date, ~case_when(. == 1 ~ cur_column()), .names = 'new_{col}')) %>% 
    +     unite(fruit, starts_with('new'), na.rm = TRUE, sep = ' '),
    +   check='equivalent'))
    

    $ Rscript --vanilla foo.R
    Unit: milliseconds
         expr       min        lq      mean    median        uq        max neval  cld
     arr.ind1  2.838674  2.891762  3.122499  2.944602  3.135042  11.327219   100 a   
     arr.ind2  3.360616  3.417063  3.731845  3.471011  3.681735  13.014652   100 ab  
      max.col  4.823824  4.911244  5.049284  5.002576  5.132965   5.571004   100  bc 
     max.col2  5.209733  5.407884  5.542997  5.453691  5.604758   6.110364   100   c 
     max.col3  4.502090  4.573133  4.668055  4.610790  4.696504   5.486427   100  bc 
     max.col4  4.766816  4.832869  5.235595  4.887230  5.033407  32.472803   100  bc 
         tidy 66.249594 69.793587 73.463189 71.997308 73.727147 145.391572   100    d
    
    
    > x11(); microbenchmark:::boxplot.microbenchmark(subset(mb, expr != 'tidy') |> transform(expr=droplevels(expr)))
    

    enter image description here


    Data:

    > dput(df1)
    structure(list(apple = c("0", "0", "0", "1", "0", "1", "0", "0", 
    "0", "1"), banana = c("1", "0", "0", "0", "1", "0", "1", "0", 
    "0", "0"), cherry = c("0", "1", "0", "0", "0", "0", "0", "1", 
    "0", "0"), date = c("0", "0", "1", "0", "0", "0", "0", "0", "1", 
    "0")), class = "data.frame", row.names = c("one", "two", "three", 
    "four", "five", "six", "seven", "eight", "nine", "ten"))