Search code examples
rtidyversecase-whenmultiple-conditions

Why do I not get my desired category in the new columns with case_when, in R?


I am trying to create a new column, first_attack, with several conditions based on 3 columns. I am trying to achieve this with tidyverse only. Here are my conditions:

  • if I have Yes in one column, regardless of other categories (No/Unknown/NA) in the same id across columns: previous_cabg, previous_pci, previous_ami then assign Yes in test variable
  • if I have No in all columns for the same id then assign NO for the test variable
  • if I have NO for one column and NA/Unknown in the other columns for the same id then assign with No in the test variable
  • if I have Yes in all column for the same id then assign Yes in the test variable
  • if I haveYes in one column and NA/Unknownfor the same id in each column then assignYes`in test variable

This is the type of dataset I have:

structure(list(id = c(112139L, 43919L, 92430L, 87137L, 95417L, 
66955L, 16293L, 61396L, 25379L, 79229L, 27107L, 63243L, 50627L, 
17968L, 83015L, 96549L, 7332L, 4873L, 98131L, 93506L, 52894L, 
59327L, 85003L, 96623L, 82999L, 65769L, 67063L, 21744L, 62961L, 
2229L, 103673L, 9367L, 60215L, 74044L, 58422L, 57530L, 100399L, 
46483L, 108690L, 62017L, 46467L, 79562L, 4800L, 119158L, 103222L, 
32908L, 14491L, 30293L, 52558L, 122304L, 42281L, 1553L, 111771L, 
23087L, 30147L, 37842L, 51552L, 20148L, 28L, 7477L), previous_cabg = structure(c(1L, 
1L, 1L, NA, 1L, NA, NA, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, NA, 1L, 1L, NA, 1L, NA, 1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 3L, 
1L, 1L, NA, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
"Unknown", "Yes"), class = "factor"), previous_pci = structure(c(1L, 
1L, 2L, NA, 1L, NA, NA, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 
2L, NA, 2L, 1L, NA, 2L, NA, 1L, 2L, 1L, 1L, 1L, NA, 2L, 1L, 1L, 
2L, 2L, NA, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 2L, 1L, 1L), .Label = c("No", 
"Yes", "Unknown"), class = "factor"), previous_ami = structure(c(2L, 
2L, 1L, 2L, 2L, NA, 2L, 1L, 2L, 2L, NA, 1L, 2L, 2L, 2L, 2L, 2L, 
1L, NA, 1L, 2L, NA, 1L, NA, 2L, 1L, 2L, 2L, 2L, NA, 1L, 1L, 1L, 
2L, 1L, NA, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 3L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, NA, 2L, 2L, 2L, 1L, 2L), .Label = c("Yes", 
"No", "Unknown"), class = "factor")), row.names = c(NA, -60L), problems = structure(list(
    row = c(34136L, 121773L, 121779L), col = c("1.01 Hospital identifier", 
    "1.01 Hospital identifier", "1.01 Hospital identifier"), 
    expected = c("value in level set", "value in level set", 
    "value in level set"), actual = c("CMH", "CMH", "CMH"), file = c("'../../data/changed/minap_2020_2021_second.csv'", 
    "'../../data/changed/minap_2020_2021_second.csv'", "'../../data/changed/minap_2020_2021_second.csv'"
    )), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
)), class = c("tbl_df", "tbl", "data.frame"))

I thought the best is to use case_when. Thus, I oddly I proceeded with it. This is the code.

  test_first_attack <- sample_n %>%
      dplyr::mutate(first_attack = 
                      dplyr::case_when(  
                                        previous_cabg == 'No'  | previous_pci == 'No'  | previous_ami == 'Yes' ~ 'Yes',
                                        previous_cabg == 'No'  | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
                                        previous_cabg == 'Yes' | previous_pci == 'No'  | previous_ami == 'Yes' ~ 'Yes', 
                                        previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'No' ~  'Yes', 
                                        previous_cabg == 'No'  | previous_pci == 'No'  | previous_ami == 'Yes' ~ 'Yes',
                                        previous_cabg == 'No'  | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
                                        previous_cabg == 'Yes' | previous_pci == 'No'  | previous_ami == 'Yes' ~ 'Yes', 
                                        previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'No' ~  'Yes',
                                        
                                        # deal with the unknown category
                                        previous_cabg == 'Unknown'  | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
                                        previous_cabg == 'Yes' | previous_pci == 'Unknown'  | previous_ami == 'Yes' ~ 'Yes', 
                                        previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'Unknown' ~  'Yes', 
                                        previous_cabg == 'Unknown'  | previous_pci == 'Unknown'  | previous_ami == 'Yes' ~ 'Yes',
                                        previous_cabg == 'Unknown'  | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
                                        previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes', 
                                        previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'Unknown' ~  'Yes', 
                                        
                                        
                                        previous_cabg == 'Yes' |  previous_pci == 'No'  | previous_ami == 'Yes' ~  'Yes', 
                                        previous_cabg == 'Yes' |  previous_pci == 'No'  | previous_ami == 'No'  ~  'Yes',
                                        previous_cabg == 'No'  |  previous_pci == 'No'  | previous_ami == 'Yes' ~  'Yes',
                                        previous_cabg == 'No'  | previous_pci == 'Yes'  | previous_ami == 'No'  ~ 'Yes', 
                                        
                                        
                                        previous_cabg == 'Yes' |  previous_pci == 'Unknown'   | previous_ami == 'Yes' ~  'Yes', 
                                        previous_cabg == 'Yes' |  previous_pci == 'Unknown'   | previous_ami == 'Unknown'   ~  'Yes',
                                        previous_cabg == 'Unknown'   |  previous_pci == 'Unknown'   | previous_ami == 'Yes' ~  'Yes',
                                        previous_cabg == 'Unknown'   | previous_pci == 'Yes'  | previous_ami == 'Unknown'   ~ 'Yes', 
                                        
                                        
                                        previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Unknown' ~ 'Yes', 
                                        previous_cabg == 'Unknown'  | previous_pci == 'Yes'| previous_ami == 'Unknown' ~ 'Yes', 
                                        previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes', 
                                        previous_cabg == 'Unknown'  | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes', 
                                        
                                        previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'No' ~ 'Yes', 
                                        previous_cabg == 'No'  | previous_pci == 'Yes'| previous_ami == 'No' ~ 'Yes', 
                                        previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes', 
                                        previous_cabg == 'No'  | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes', 
                                        
                                        previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Unknown' ~ 'Yes', 
                                        previous_cabg == 'Unknown'  | previous_pci == 'Yes'| previous_ami == 'Unknown' ~ 'Yes', 
                                        previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes', 
                                        previous_cabg == 'Unknown' | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes', 
                                        previous_cabg == 'No'  | previous_pci == 'No'  |  previous_ami == 'No' ~ 'No',
                                        previous_cabg == 'Yes' | previous_pci == 'Yes' |  previous_ami == 'Yes' ~'Yes'
                                        
                                        
                                        
                                        
                      ))

This is the output.

 test_first_attack <-  structure(list(id = c(112139L, 43919L, 92430L, 87137L, 95417L, 
    66955L, 16293L, 61396L, 25379L, 79229L, 27107L, 63243L, 50627L, 
    17968L, 83015L, 96549L, 7332L, 4873L, 98131L, 93506L, 52894L, 
    59327L, 85003L, 96623L, 82999L, 65769L, 67063L, 21744L, 62961L, 
    2229L, 103673L, 9367L, 60215L, 74044L, 58422L, 57530L, 100399L, 
    46483L, 108690L, 62017L, 46467L, 79562L, 4800L, 119158L, 103222L, 
    32908L, 14491L, 30293L, 52558L, 122304L, 42281L, 1553L, 111771L, 
    23087L, 30147L, 37842L, 51552L, 20148L, 28L, 7477L), previous_cabg = structure(c(1L, 
    1L, 1L, NA, 1L, NA, NA, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, NA, 1L, 1L, NA, 1L, NA, 1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 3L, 
    1L, 1L, NA, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Unknown", "Yes"), class = "factor"), previous_pci = structure(c(1L, 
    1L, 2L, NA, 1L, NA, NA, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 
    2L, NA, 2L, 1L, NA, 2L, NA, 1L, 2L, 1L, 1L, 1L, NA, 2L, 1L, 1L, 
    2L, 2L, NA, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 2L, 1L, 1L), .Label = c("No", 
    "Yes", "Unknown"), class = "factor"), previous_ami = structure(c(2L, 
    2L, 1L, 2L, 2L, NA, 2L, 1L, 2L, 2L, NA, 1L, 2L, 2L, 2L, 2L, 2L, 
    1L, NA, 1L, 2L, NA, 1L, NA, 2L, 1L, 2L, 2L, 2L, NA, 1L, 1L, 1L, 
    2L, 1L, NA, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 3L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, NA, 2L, 2L, 2L, 1L, 2L), .Label = c("Yes", 
    "No", "Unknown"), class = "factor"), first_attack = c("Yes", 
    "Yes", "Yes", "Yes", "Yes", NA, "Yes", "Yes", "Yes", "Yes", "Yes", 
    "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", NA, "Yes", "Yes", 
    NA, "Yes", NA, "Yes", "Yes", "Yes", "Yes", "Yes", NA, "Yes", 
    "Yes", "Yes", "Yes", "Yes", NA, "Yes", "Yes", "Yes", "Yes", "Yes", 
    "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", 
    "Yes", "Yes", "Yes", "Yes", NA, "Yes", "Yes", "Yes", "Yes", "Yes"
    )), row.names = c(NA, -60L), problems = structure(list(row = c(34136L, 
    121773L, 121779L), col = c("1.01 Hospital identifier", "1.01 Hospital identifier", 
    "1.01 Hospital identifier"), expected = c("value in level set", 
    "value in level set", "value in level set"), actual = c("CMH", 
    "CMH", "CMH"), file = c("'../../data/changed/minap_2020_2021_second.csv'", 
    "'../../data/changed/minap_2020_2021_second.csv'", "'../../data/changed/minap_2020_2021_second.csv'"
    )), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
    )), class = c("tbl_df", "tbl", "data.frame"))

As you can see, I get 'Yes' as output for the columns with id rows with NO across all the columns. Let alone taking into consideration more complex conditions like 'Unknown\NA\Yes' or other conditions.

Why I cannot get my desired output? Is there another way to achieve it with tidyverse only?


Solution

  • I think all you need is:

    case_when(
        previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'Yes' ~  'Yes',
        previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'No' ~  'No',
        TRUE ~ "Unknown/NA"
    )
    

    The first line will result in a "Yes" for every row where at least one of the three test columns is "Yes". The rows that satisfy the first line of the case_when will not be tested for any subsequent conditions.

    Of the remaining rows, the second line will result in a "No" where any of the three test columns is "No". The rows that meet the second line of the case_when will not be tested for any subsequent conditions.

    The last line assigns "Unknown/NA" to all remaining rows, which should be only those rows that don't have any "Yes" or "No" values in the three test columns.

    For completeness, here's a full reproducible example using your sample data:

    library(tidyverse)
    
    test_first_attack = structure(list(id = c(112139L, 43919L, 92430L, 87137L, 95417L, 
                                              66955L, 16293L, 61396L, 25379L, 79229L, 27107L, 63243L, 50627L, 
                                              17968L, 83015L, 96549L, 7332L, 4873L, 98131L, 93506L, 52894L, 
                                              59327L, 85003L, 96623L, 82999L, 65769L, 67063L, 21744L, 62961L, 
                                              2229L, 103673L, 9367L, 60215L, 74044L, 58422L, 57530L, 100399L, 
                                              46483L, 108690L, 62017L, 46467L, 79562L, 4800L, 119158L, 103222L, 
                                              32908L, 14491L, 30293L, 52558L, 122304L, 42281L, 1553L, 111771L, 
                                              23087L, 30147L, 37842L, 51552L, 20148L, 28L, 7477L), previous_cabg = structure(c(1L, 
                                                                                                                               1L, 1L, NA, 1L, NA, NA, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 
                                                                                                                               1L, NA, 1L, 1L, NA, 1L, NA, 1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 3L, 
                                                                                                                               1L, 1L, NA, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
                                                                                                                               1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
                                                                                                                                                                                       "Unknown", "Yes"), class = "factor"), previous_pci = structure(c(1L, 
                                                                                                                                                                                                                                                        1L, 2L, NA, 1L, NA, NA, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 
                                                                                                                                                                                                                                                        2L, NA, 2L, 1L, NA, 2L, NA, 1L, 2L, 1L, 1L, 1L, NA, 2L, 1L, 1L, 
                                                                                                                                                                                                                                                        2L, 2L, NA, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
                                                                                                                                                                                                                                                        1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 2L, 1L, 1L), .Label = c("No", 
                                                                                                                                                                                                                                                                                                                "Yes", "Unknown"), class = "factor"), previous_ami = structure(c(2L, 
                                                                                                                                                                                                                                                                                                                                                                                 2L, 1L, 2L, 2L, NA, 2L, 1L, 2L, 2L, NA, 1L, 2L, 2L, 2L, 2L, 2L, 
                                                                                                                                                                                                                                                                                                                                                                                 1L, NA, 1L, 2L, NA, 1L, NA, 2L, 1L, 2L, 2L, 2L, NA, 1L, 1L, 1L, 
                                                                                                                                                                                                                                                                                                                                                                                 2L, 1L, NA, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 3L, 2L, 2L, 
                                                                                                                                                                                                                                                                                                                                                                                 2L, 2L, 2L, 2L, 2L, NA, 2L, 2L, 2L, 1L, 2L), .Label = c("Yes", 
                                                                                                                                                                                                                                                                                                                                                                                                                                         "No", "Unknown"), class = "factor")), row.names = c(NA, -60L), problems = structure(list(
                                                                                                                                                                                                                                                                                                                                                                                                                                           row = c(34136L, 121773L, 121779L), col = c("1.01 Hospital identifier", 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      "1.01 Hospital identifier", "1.01 Hospital identifier"), 
                                                                                                                                                                                                                                                                                                                                                                                                                                           expected = c("value in level set", "value in level set", 
                                                                                                                                                                                                                                                                                                                                                                                                                                                        "value in level set"), actual = c("CMH", "CMH", "CMH"), file = c("'../../data/changed/minap_2020_2021_second.csv'", 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         "'../../data/changed/minap_2020_2021_second.csv'", "'../../data/changed/minap_2020_2021_second.csv'"
                                                                                                                                                                                                                                                                                                                                                                                                                                                        )), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
                                                                                                                                                                                                                                                                                                                                                                                                                                                        )), class = c("tbl_df", "tbl", "data.frame"))
    
    
    
    test_first_attack = test_first_attack %>% 
      mutate(first_attack = case_when(
        previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'Yes' ~  'Yes',
        previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'No' ~  'No',
        TRUE ~ "Unknown/NA"
      ))
    
    test_first_attack
    #> # A tibble: 60 x 5
    #>        id previous_cabg previous_pci previous_ami first_attack
    #>     <int> <fct>         <fct>        <fct>        <chr>       
    #>  1 112139 No            No           No           No          
    #>  2  43919 No            No           No           No          
    #>  3  92430 No            Yes          Yes          Yes         
    #>  4  87137 <NA>          <NA>         No           No          
    #>  5  95417 No            No           No           No          
    #>  6  66955 <NA>          <NA>         <NA>         Unknown/NA  
    #>  7  16293 <NA>          <NA>         No           No          
    #>  8  61396 No            Yes          Yes          Yes         
    #>  9  25379 No            Yes          No           Yes         
    #> 10  79229 No            No           No           No          
    #> # … with 50 more rows