Search code examples
rcountmultiple-columnsrowwise

Counting, conditionals and constellation variable for hundred of variables in a data frame in R


I am working with a dataset where I need to evaluate hundreds of columns at the time to create new variables with computations by row. I have three new variables, one needs the "or" operator to decide if there is any "yes" across the ~100 columns. The second one needs to count across the variables how many "yes" I have in total, and the third one needs to create a constellation variable that shows me the name of variables with the "yes" value, all of this by row. I have the code for the first two, but for the third one I am stuck. Also, I am using only a few variables for example purposes but I have ~100 variables that I need to use. My code is below:

#making the data - I am using actually ~100 variables 

test.data <- data.frame(var1 = c("yes", "no", "no", "N/A", NA, NA),
                        var2 = c(NA, NA, "yes", "no", "yes", NA),
                        var3 = c("yes", "yes", "yes", "no", "yes", "N/A"),
                        var4 = c("N/A", "yes", "no", "no", "yes", NA))

# code for the first two variables: is.positive and number.pos - not elegant nor efficient since I #need to work with ~100 vars

final.data <- data.frame(test.data %>%
              mutate(is.positive = ifelse(var1=="yes" | var2=="yes" | var3=="yes" | var4=="yes", 1,
                                       ifelse((is.na(var1) | var1=="N/A") & 
                                                (is.na(var2) | var2=="N/A") & 
                                                (is.na(var3) | var3=="N/A") & 
                                                (is.na(var4) | var4=="N/A"), NA, 0))) %>%
              rowwise() %>%
              mutate(number.pos = sum(c_across(c(var1, var2, var3, var4))=="yes",na.rm=TRUE)))

example


Solution

  • You could do it by making a list column for which ones are positive and then deriving the other values from that.

    library(tidyverse)
    test.data <- data.frame(var1 = c("yes", "no", "no", "N/A", NA, NA),
                            var2 = c(NA, NA, "yes", "no", "yes", NA),
                            var3 = c("yes", "yes", "yes", "no", "yes", "N/A"),
                            var4 = c("N/A", "yes", "no", "no", "yes", NA))
    
    
    nv <- test.data %>% 
      select(var1:var4) %>% 
      names()
    out <- test.data %>% 
      rowwise() %>% 
      mutate(which_pos = list(nv[which(c_across(var1:var4) == "yes")]), 
             num.positive = length(which_pos), 
             is.positive = num.positive > 0) 
    
    out
    #> # A tibble: 6 × 7
    #> # Rowwise: 
    #>   var1  var2  var3  var4  which_pos num.positive is.positive
    #>   <chr> <chr> <chr> <chr> <list>           <int> <lgl>      
    #> 1 yes   <NA>  yes   N/A   <chr [2]>            2 TRUE       
    #> 2 no    <NA>  yes   yes   <chr [2]>            2 TRUE       
    #> 3 no    yes   yes   no    <chr [2]>            2 TRUE       
    #> 4 N/A   no    no    no    <chr [0]>            0 FALSE      
    #> 5 <NA>  yes   yes   yes   <chr [3]>            3 TRUE       
    #> 6 <NA>  <NA>  N/A   <NA>  <chr [0]>            0 FALSE
    out$which_pos
    #> [[1]]
    #> [1] "var1" "var3"
    #> 
    #> [[2]]
    #> [1] "var3" "var4"
    #> 
    #> [[3]]
    #> [1] "var2" "var3"
    #> 
    #> [[4]]
    #> character(0)
    #> 
    #> [[5]]
    #> [1] "var2" "var3" "var4"
    #> 
    #> [[6]]
    #> character(0)
    

    Created on 2022-05-26 by the reprex package (v2.0.1)

    If you wanted a normal column for the variable identifying which ones are positive, you could simply paste the names together to create a string that has comma-separated names:

    library(tidyverse)
    test.data <- data.frame(var1 = c("yes", "no", "no", "N/A", NA, NA),
                            var2 = c(NA, NA, "yes", "no", "yes", NA),
                            var3 = c("yes", "yes", "yes", "no", "yes", "N/A"),
                            var4 = c("N/A", "yes", "no", "no", "yes", NA))
    
    
    nv <- test.data %>% 
      select(var1:var4) %>% 
      names()
    out <- test.data %>% 
      rowwise() %>% 
      mutate(which_pos = paste(nv[which(c_across(var1:var4) == "yes")], collapse=","), 
             num.positive = sum(c_across(var1:var4) == "yes", na.rm=TRUE), 
             is.positive = num.positive > 0) 
    
    out
    #> # A tibble: 6 × 7
    #> # Rowwise: 
    #>   var1  var2  var3  var4  which_pos        num.positive is.positive
    #>   <chr> <chr> <chr> <chr> <chr>                   <int> <lgl>      
    #> 1 yes   <NA>  yes   N/A   "var1,var3"                 2 TRUE       
    #> 2 no    <NA>  yes   yes   "var3,var4"                 2 TRUE       
    #> 3 no    yes   yes   no    "var2,var3"                 2 TRUE       
    #> 4 N/A   no    no    no    ""                          0 FALSE      
    #> 5 <NA>  yes   yes   yes   "var2,var3,var4"            3 TRUE       
    #> 6 <NA>  <NA>  N/A   <NA>  ""                          0 FALSE
    

    Created on 2022-05-26 by the reprex package (v2.0.1)

    The list column might be easier to use in subsequent analyses if needed, but the comma-separated variable maybe easier to use for visual inspection.