Search code examples
r dplyr across

How to convert columns to multiple boolean columns with tidyverse


I have one categorical column per time point, and I want to convert them into several boolean columns (one per category) with mutate() and across(), like this:

# Example data: one categorical column per time point (t1, t2, t3);
# NA marks a missing observation.
data <- data.frame(
  category_t1 = c("A", "B", "C", "C", "A", "B"),
  category_t2 = c("A", "C", "B", "B", "B", NA),
  category_t3 = c("C", "C", NA, "B", NA, "A")
)

# One across() call per category level ("A", "B", "C"). Each call adds a
# logical column per time point: TRUE where the value equals the level,
# FALSE for any other non-missing value, and NA where the value is missing
# (no case_when() branch matches).
data %>%
  mutate(
    across(
      starts_with("category"),
      \(col) case_when(col == "A" ~ TRUE, !is.na(col) ~ FALSE),
      .names = "{str_replace(.col, 'category', 'A')}"
    ),
    across(
      starts_with("category"),
      \(col) case_when(col == "B" ~ TRUE, !is.na(col) ~ FALSE),
      .names = "{str_replace(.col, 'category', 'B')}"
    ),
    across(
      starts_with("category"),
      \(col) case_when(col == "C" ~ TRUE, !is.na(col) ~ FALSE),
      .names = "{str_replace(.col, 'category', 'C')}"
    )
  )

Which makes :

  category_t1 category_t2 category_t3  A_t1  A_t2  A_t3  B_t1  B_t2  B_t3  C_t1  C_t2  C_t3
1           A           A           C  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
2           B           C           C FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE
3           C           B        <NA> FALSE FALSE    NA FALSE  TRUE    NA  TRUE FALSE    NA
4           C           B           B FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE
5           A           B        <NA>  TRUE FALSE    NA FALSE  TRUE    NA FALSE FALSE    NA
6           B        <NA>           A FALSE    NA  TRUE  TRUE    NA FALSE FALSE    NA FALSE

It works, but I would like to know if there is a better way, because here I am writing the same code three times instead of once (and imagine if I had to repeat it 10 times...). I thought I could do it with map(), but I didn't manage to make it work. I think the problem is that the .names argument in across() cannot be connected to the string I use in case_when().

I think maybe something can be done with the ... argument, like this:

# Attempt (does NOT work): tries to pass the category levels as an extra
# argument `mod` through across()'s `...` and to reuse that same `mod` both
# in the case_when() comparison and in the `.names` glue specification.
data %>% mutate(across(starts_with("category"),
                       ~case_when(.x == mod ~ TRUE, !is.na(.x) ~ FALSE),
                       mod = levels(as.factor(data$category_t1)),
                       .names = "{str_replace(.col, 'category', mod)}"))

But of course that doesn't work here. Do you know how to do that?

Thanks a lot.


Solution

  • We may use table() inside across()

    library(dplyr)
    library(stringr)
    library(tidyr)
    # For every category column, cross-tabulate row number by value to get
    # one indicator column per category level. NA^(is.na(.x)) is NA where the
    # value is missing and 1 otherwise, so rows with a missing category become
    # NA instead of FALSE. Each across() result is a data-frame column, which
    # unpack() then flattens into "<time>.<level>" columns.
    #
    # Only the category columns are selected (rather than everything()), so
    # the snippet also works when the data frame has other columns.
    data %>%
      mutate(
        across(
          starts_with("category"),
          ~ as.data.frame.matrix(
            (table(row_number(), .x) * NA^(is.na(.x))) > 0
          ),
          .names = "{str_remove(.col, 'category_')}"
        )
      ) %>%
      unpack(where(is.data.frame), names_sep = ".")
    

    -output

    # A tibble: 6 × 12
      category_t1 category_t2 category_t3 t1.A  t1.B  t1.C  t2.A  t2.B  t2.C  t3.A  t3.B  t3.C 
      <chr>       <chr>       <chr>       <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
    1 A           A           C           TRUE  FALSE FALSE TRUE  FALSE FALSE FALSE FALSE TRUE 
    2 B           C           C           FALSE TRUE  FALSE FALSE FALSE TRUE  FALSE FALSE TRUE 
    3 C           B           <NA>        FALSE FALSE TRUE  FALSE TRUE  FALSE NA    NA    NA   
    4 C           B           B           FALSE FALSE TRUE  FALSE TRUE  FALSE FALSE TRUE  FALSE
    5 A           B           <NA>        TRUE  FALSE FALSE FALSE TRUE  FALSE NA    NA    NA   
    6 B           <NA>        A           FALSE TRUE  FALSE NA    NA    NA    TRUE  FALSE FALSE
    

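    Or use model.matrix from base R (note that with this approach missing values yield FALSE rather than NA, as the output below shows)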

    # Recode missing values as the literal string "NA" so they become an
    # ordinary factor level.
    data1 <- data
    data1[is.na(data1)] <- "NA"

    # Give every column the same level set, with "NA" listed first.
    lvls <- lapply(
      data1,
      function(x) levels(factor(x, levels = c("NA", "A", "B", "C")))
    )

    # Indicator (dummy) matrix without an intercept term.
    m1 <- model.matrix(~ 0 + ., data = data1, xlev = lvls)

    # Drop the indicator columns whose name contains "NA", convert the
    # remaining 0/1 values to logical, and bind them to the original data.
    out <- cbind(data, m1[, -grep("NA", colnames(m1))] > 0)
    

    -output

    out
    category_t1 category_t2 category_t3 category_t1A category_t1B category_t1C category_t2A category_t2B category_t2C category_t3A category_t3B category_t3C
    1           A           A           C         TRUE        FALSE        FALSE         TRUE        FALSE        FALSE        FALSE        FALSE         TRUE
    2           B           C           C        FALSE         TRUE        FALSE        FALSE        FALSE         TRUE        FALSE        FALSE         TRUE
    3           C           B        <NA>        FALSE        FALSE         TRUE        FALSE         TRUE        FALSE        FALSE        FALSE        FALSE
    4           C           B           B        FALSE        FALSE         TRUE        FALSE         TRUE        FALSE        FALSE         TRUE        FALSE
    5           A           B        <NA>         TRUE        FALSE        FALSE        FALSE         TRUE        FALSE        FALSE        FALSE        FALSE
    6           B        <NA>           A        FALSE         TRUE        FALSE        FALSE        FALSE        FALSE         TRUE        FALSE        FALSE
    > colnames(out)
     [1] "category_t1"  "category_t2"  "category_t3" 
     [4] "category_t1A" "category_t1B" "category_t1C"
     [7] "category_t2A" "category_t2B" "category_t2C"
    [10] "category_t3A" "category_t3B" "category_t3C"
    
    

    Or another option with table

    # For each column: cross-tabulate element index by value, multiply by
    # NA^is.na(x) so rows with a missing value become NA, and convert the
    # counts to logical. The per-column results are bound side by side and
    # appended to the original data.
    cbind(
      data,
      do.call(
        cbind.data.frame,
        lapply(data, function(x) {
          counts <- table(seq_along(x), x)
          (counts * NA^is.na(x)) > 0
        })
      )
    )
    

    -output

    category_t1 category_t2 category_t3 category_t1.A category_t1.B category_t1.C category_t2.A category_t2.B category_t2.C category_t3.A category_t3.B
    1           A           A           C          TRUE         FALSE         FALSE          TRUE         FALSE         FALSE         FALSE         FALSE
    2           B           C           C         FALSE          TRUE         FALSE         FALSE         FALSE          TRUE         FALSE         FALSE
    3           C           B        <NA>         FALSE         FALSE          TRUE         FALSE          TRUE         FALSE            NA            NA
    4           C           B           B         FALSE         FALSE          TRUE         FALSE          TRUE         FALSE         FALSE          TRUE
    5           A           B        <NA>          TRUE         FALSE         FALSE         FALSE          TRUE         FALSE            NA            NA
    6           B        <NA>           A         FALSE          TRUE         FALSE            NA            NA            NA          TRUE         FALSE
      category_t3.C
    1          TRUE
    2          TRUE
    3            NA
    4         FALSE
    5            NA
    6         FALSE