Search code examples
rdataframedplyrstringr

How to add new variable according another value of variable?


This is my dataframe:

enter image description here

or you may copy it:

structure(list(A_levels = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 
    College_Uni = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), CSEs = c(0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0), NVQ_HND_HNC = c(0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0), O_levels_GCSEs = c(1, 1, 1, 1, 1, 1, 
    1, 1, 1, 1, 1), Other_prof_qual = c(0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0), Prefer_not_to_answer = c(0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0), None_of_the_above = c(0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0), id = 5000:5010), row.names = c(NA, -11L), class = c("tbl_df", 
"tbl", "data. Frame")) -> df

I want to mutate a edu_level of new variable, it's value according to this:

#my vairabe               edu_level
#College_Uni              high
#A_levels                 medium
#O_levels_GCSEs           medium
#CSEs)                    low
#NVQ_HND_HNC              low
#Other_prof_qual          low
#Prefer_not_to_answer     NA
#None_of_the_above        NA

I think maybe because it is a multiple-choice question, some people have more than two levels, so I don't know how to specify the classification.

I want to choose the priority of its edu_level to be the highest degree first. For example, if it has both College_Uni and A_levels, the value of edu_level is high.


Solution

  • I created a dataframe with different levels of education to showcase the solution;

    library(dplyr)
    library(tidyr)
    
    myvars %>% 
      mutate(num_level = case_when(is.na(edu_level) ~ 0, 
                                   edu_level == "low" ~ 1, 
                                   edu_level == "medium" ~ 2,
                                   edu_level == "high" ~ 3)) -> myvars
    
    df1 %>% 
      pivot_longer(-id, names_to = "degree") %>% 
      filter(value != 0) %>% 
      right_join(., myvars, join_by(degree == variable)) %>% 
      slice(which.max(num_level), .by = id) %>% 
      select(-value, -num_level) %>% ## you can also drop "degree"
      left_join(., df1)
    
    #> Joining with `by = join_by(id)`
    #> # A tibble: 11 x 11
    #>       id degree  edu_level A_levels College_Uni  CSEs NVQ_HND_HNC O_levels_GCSEs
    #>    <int> <chr>   <chr>        <dbl>       <dbl> <dbl>       <dbl>          <dbl>
    #>  1  5000 Colleg~ high             0           1     0           0              1
    #>  2  5001 A_leve~ medium           1           0     0           0              1
    #>  3  5002 A_leve~ medium           1           0     0           0              1
    #>  4  5003 O_leve~ medium           0           0     0           1              1
    #>  5  5004 Colleg~ high             0           1     0           0              1
    #>  6  5005 Other_~ low              0           0     0           0              0
    #>  7  5006 Colleg~ high             0           1     0           1              1
    #>  8  5007 CSEs    low              0           0     1           0              0
    #>  9  5008 Colleg~ high             0           1     0           0              0
    #> 10  5009 Prefer~ <NA>             0           0     0           0              0
    #> 11  5010 None_o~ <NA>             0           0     0           0              0
    #> # i 3 more variables: Other_prof_qual <dbl>, Prefer_not_to_answer <dbl>,
    #> #   None_of_the_above <dbl>
    

    Data:

    data.frame(A_levels =             c(0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), 
               College_Uni =          c(1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0), 
               CSEs =                 c(0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0), 
               NVQ_HND_HNC =          c(0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0), 
               O_levels_GCSEs =       c(1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0), 
               Other_prof_qual =      c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0), 
               Prefer_not_to_answer = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0), 
               None_of_the_above =    c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1), 
               id = 5000:5010) -> df1
    
    
    read.table(text = "variable              edu_level
                       College_Uni           high
                       A_levels              medium
                       O_levels_GCSEs        medium
                       CSEs                  low
                       NVQ_HND_HNC           low
                       Other_prof_qual       low
                       Prefer_not_to_answer  NA
                       None_of_the_above     NA", header = T, stringsAsFactor = F) -> myvars