Search code examples
rdplyr

An efficient way to assign value based on a min-max range and category


I have two df:

the first df contains aptitude data. I need to create a new column sten_score which is based on raw score and the particular subscale. I am currently using case_when to manually go through each subscale and score range to assign sten_score in df#1.

I want to see if there is an efficient way to do this. ChatGPT suggested left_join but that feels error-prone (multiple common elements...

# A tibble: 119 × 5
   response_id       subscale             score grade domain
   <chr>             <chr>                <dbl> <dbl> <chr> 
 1 R_43vyKDjzHx5D7kJ Abstract Reasoning      20    10 AR    
 2 R_43vyKDjzHx5D7kJ Language Aptitude       22    10 LA    
 3 R_43vyKDjzHx5D7kJ Mechanical Reasoning    12    10 MR    
 4 R_43vyKDjzHx5D7kJ Numerical Aptitude      21    10 NA    
 5 R_43vyKDjzHx5D7kJ Perceptual Aptitude     60    10 PA    
 6 R_43vyKDjzHx5D7kJ Spatial Aptitude        21    10 SA    
 7 R_43vyKDjzHx5D7kJ Verbal Reasoning        23    10 VR    
 8 R_45Ut4Vj2cnerN2F Abstract Reasoning      24    10 AR    
 9 R_45Ut4Vj2cnerN2F Language Aptitude       25    10 LA    
10 R_45Ut4Vj2cnerN2F Mechanical Reasoning    18    10 MR

# Dput below ----------------
structure(list(response_id = c("R_43vyKDjzHx5D7kJ", "R_43vyKDjzHx5D7kJ", 
"R_43vyKDjzHx5D7kJ", "R_43vyKDjzHx5D7kJ", "R_43vyKDjzHx5D7kJ", 
"R_43vyKDjzHx5D7kJ", "R_43vyKDjzHx5D7kJ", "R_45Ut4Vj2cnerN2F", 
"R_45Ut4Vj2cnerN2F", "R_45Ut4Vj2cnerN2F"), subscale = c("Abstract Reasoning", 
"Language Aptitude", "Mechanical Reasoning", "Numerical Aptitude", 
"Perceptual Aptitude", "Spatial Aptitude", "Verbal Reasoning", 
"Abstract Reasoning", "Language Aptitude", "Mechanical Reasoning"
), score = c(20, 22, 12, 21, 60, 21, 23, 24, 25, 18), grade = c(10, 
10, 10, 10, 10, 10, 10, 10, 10, 10), domain = c("AR", "LA", "MR", 
"NA", "PA", "SA", "VR", "AR", "LA", "MR")), row.names = c(NA, 
-10L), class = c("tbl_df", "tbl", "data.frame"))

DF #2 contains sten_scores based on grades and subscales:

# A tibble: 140 × 7
   domain min_score max_score  sten  mean    sd grade
   <chr>      <int>     <int> <int> <dbl> <dbl> <dbl>
 1 LA             0         6     1  17.8  5.75    10
 2 LA             7         9     2  17.8  5.75    10
 3 LA            10        12     3  17.8  5.75    10
 4 LA            13        14     4  17.8  5.75    10
 5 LA            15        17     5  17.8  5.75    10
 6 LA            18        20     6  17.8  5.75    10
 7 LA            21        23     7  17.8  5.75    10
 8 LA            24        26     8  17.8  5.75    10
 9 LA            27        28     9  17.8  5.75    10
10 LA            29        30    10  17.8  5.75    10

# dput below ------------------

structure(list(domain = c("LA", "LA", "LA", "LA", "LA", "LA", 
"LA", "LA", "LA", "LA", "AR", "AR", "AR", "AR", "AR", "AR", "AR", 
"AR", "AR", "AR", "VR", "VR", "VR", "VR", "VR", "VR", "VR", "VR", 
"VR", "VR", "MR", "MR", "MR", "MR", "MR", "MR", "MR", "MR", "MR", 
"MR", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", 
"SA", "SA", "SA", "SA", "SA", "SA", "SA", "SA", "SA", "SA", "PA", 
"PA", "PA", "PA", "PA", "PA", "PA", "PA", "PA", "PA", "LA", "LA", 
"LA", "LA", "LA", "LA", "LA", "LA", "LA", "LA", "AR", "AR", "AR", 
"AR", "AR", "AR", "AR", "AR", "AR", "AR", "VR", "VR", "VR", "VR", 
"VR", "VR", "VR", "VR", "VR", "VR", "MR", "MR", "MR", "MR", "MR", 
"MR", "MR", "MR", "MR", "MR", "NA", "NA", "NA", "NA", "NA", "NA", 
"NA", "NA", "NA", "NA", "SA", "SA", "SA", "SA", "SA", "SA", "SA", 
"SA", "SA", "SA", "PA", "PA", "PA", "PA", "PA", "PA", "PA", "PA", 
"PA", "PA"), min_score = c(0L, 7L, 10L, 13L, 15L, 18L, 21L, 24L, 
27L, 29L, 0L, 3L, 6L, 9L, 12L, 16L, 19L, 22L, 25L, 28L, 0L, 5L, 
8L, 11L, 14L, 17L, 20L, 23L, 26L, 29L, 0L, 4L, 7L, 9L, 11L, 13L, 
16L, 18L, 20L, 23L, 0L, 3L, 6L, 10L, 13L, 16L, 20L, 23L, 26L, 
29L, 0L, 3L, 5L, 8L, 11L, 13L, 16L, 19L, 22L, 24L, 0L, 12L, 20L, 
26L, 32L, 38L, 44L, 51L, 57L, 59L, 0L, 5L, 7L, 10L, 13L, 16L, 
19L, 22L, 25L, 28L, 0L, 2L, 5L, 8L, 11L, 14L, 17L, 20L, 23L, 
26L, 0L, 4L, 6L, 9L, 12L, 15L, 18L, 21L, 24L, 27L, 0L, 4L, 6L, 
8L, 10L, 12L, 14L, 16L, 18L, 21L, 0L, 1L, 4L, 7L, 10L, 13L, 16L, 
19L, 23L, 26L, 0L, 2L, 5L, 7L, 10L, 12L, 15L, 17L, 20L, 22L, 
0L, 10L, 17L, 24L, 31L, 38L, 44L, 51L, 57L, 59L), max_score = c(6L, 
9L, 12L, 14L, 17L, 20L, 23L, 26L, 28L, 30L, 2L, 5L, 8L, 11L, 
15L, 18L, 21L, 24L, 27L, 30L, 4L, 7L, 10L, 13L, 16L, 19L, 22L, 
25L, 28L, 30L, 3L, 6L, 8L, 10L, 12L, 15L, 17L, 19L, 22L, 30L, 
2L, 5L, 9L, 12L, 15L, 19L, 22L, 25L, 28L, 30L, 2L, 4L, 7L, 10L, 
12L, 15L, 18L, 21L, 23L, 30L, 11L, 19L, 25L, 31L, 37L, 43L, 50L, 
56L, 58L, 60L, 4L, 6L, 9L, 12L, 15L, 18L, 21L, 24L, 27L, 30L, 
1L, 4L, 7L, 10L, 13L, 16L, 19L, 22L, 25L, 30L, 3L, 5L, 8L, 11L, 
14L, 17L, 20L, 23L, 26L, 30L, 3L, 5L, 7L, 9L, 11L, 13L, 15L, 
17L, 20L, 30L, NA, 3L, 6L, 9L, 12L, 15L, 18L, 22L, 25L, 30L, 
1L, 4L, 6L, 9L, 11L, 14L, 16L, 19L, 21L, 30L, 9L, 16L, 23L, 30L, 
37L, 43L, 50L, 56L, 58L, 60L), sten = c(1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), mean = c(17.84, 
17.84, 17.84, 17.84, 17.84, 17.84, 17.84, 17.84, 17.84, 17.84, 
15.16, 15.16, 15.16, 15.16, 15.16, 15.16, 15.16, 15.16, 15.16, 
15.16, 16.76, 16.76, 16.76, 16.76, 16.76, 16.76, 16.76, 16.76, 
16.76, 16.76, 12.91, 12.91, 12.91, 12.91, 12.91, 12.91, 12.91, 
12.91, 12.91, 12.91, 15.79, 15.79, 15.79, 15.79, 15.79, 15.79, 
15.79, 15.79, 15.79, 15.79, 12.84, 12.84, 12.84, 12.84, 12.84, 
12.84, 12.84, 12.84, 12.84, 12.84, 38.62, 38.62, 38.62, 38.62, 
38.62, 38.62, 38.62, 38.62, 38.62, 38.62, 15.68, 15.68, 15.68, 
15.68, 15.68, 15.68, 15.68, 15.68, 15.68, 15.68, 13.25, 13.25, 
13.25, 13.25, 13.25, 13.25, 13.25, 13.25, 13.25, 13.25, 14.89, 
14.89, 14.89, 14.89, 14.89, 14.89, 14.89, 14.89, 14.89, 14.89, 
11.8, 11.8, 11.8, 11.8, 11.8, 11.8, 11.8, 11.8, 11.8, 11.8, 12.85, 
12.85, 12.85, 12.85, 12.85, 12.85, 12.85, 12.85, 12.85, 12.85, 
11.78, 11.78, 11.78, 11.78, 11.78, 11.78, 11.78, 11.78, 11.78, 
11.78, 37.07, 37.07, 37.07, 37.07, 37.07, 37.07, 37.07, 37.07, 
37.07, 37.07), sd = c(5.75, 5.75, 5.75, 5.75, 5.75, 5.75, 5.75, 
5.75, 5.75, 5.75, 6.34, 6.34, 6.34, 6.34, 6.34, 6.34, 6.34, 6.34, 
6.34, 6.34, 6.05, 6.05, 6.05, 6.05, 6.05, 6.05, 6.05, 6.05, 6.05, 
6.05, 4.56, 4.56, 4.56, 4.56, 4.56, 4.56, 4.56, 4.56, 4.56, 4.56, 
6.66, 6.66, 6.66, 6.66, 6.66, 6.66, 6.66, 6.66, 6.66, 6.66, 5.33, 
5.33, 5.33, 5.33, 5.33, 5.33, 5.33, 5.33, 5.33, 5.33, 12.76, 
12.76, 12.76, 12.76, 12.76, 12.76, 12.76, 12.76, 12.76, 12.76, 
5.83, 5.83, 5.83, 5.83, 5.83, 5.83, 5.83, 5.83, 5.83, 5.83, 6.05, 
6.05, 6.05, 6.05, 6.05, 6.05, 6.05, 6.05, 6.05, 6.05, 5.94, 5.94, 
5.94, 5.94, 5.94, 5.94, 5.94, 5.94, 5.94, 5.94, 4.11, 4.11, 4.11, 
4.11, 4.11, 4.11, 4.11, 4.11, 4.11, 4.11, 6.12, 6.12, 6.12, 6.12, 
6.12, 6.12, 6.12, 6.12, 6.12, 6.12, 4.97, 4.97, 4.97, 4.97, 4.97, 
4.97, 4.97, 4.97, 4.97, 4.97, 13.66, 13.66, 13.66, 13.66, 13.66, 
13.66, 13.66, 13.66, 13.66, 13.66), grade = c(10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 
9, 9, 9, 9, 9, 9, 9, 9, 9, 9)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -140L))

Is there an efficient way to assign sten_scores given this context?


Solution

  • dplyr supports overlap joins, e.g. between(x, y_lower, y_upper), that can be combined with other join specs:

    # "one-to-one" to throw an error if there are other relationships
    left_join(
      df1, df2, 
      by = join_by(domain, grade, between(score, min_score, max_score)), 
      relationship = "one-to-one"
    )
    #> # A tibble: 10 × 10
    #>    response_id subscale score grade domain min_score max_score  sten  mean    sd
    #>    <chr>       <chr>    <dbl> <dbl> <chr>      <int>     <int> <int> <dbl> <dbl>
    #>  1 R_43vyKDjz… Abstrac…    20    10 AR            19        21     7  15.2  6.34
    #>  2 R_43vyKDjz… Languag…    22    10 LA            21        23     7  17.8  5.75
    #>  3 R_43vyKDjz… Mechani…    12    10 MR            11        12     5  12.9  4.56
    #>  4 R_43vyKDjz… Numeric…    21    10 NA            20        22     7  15.8  6.66
    #>  5 R_43vyKDjz… Percept…    60    10 PA            59        60    10  38.6 12.8 
    #>  6 R_43vyKDjz… Spatial…    21    10 SA            19        21     8  12.8  5.33
    #>  7 R_43vyKDjz… Verbal …    23    10 VR            23        25     8  16.8  6.05
    #>  8 R_45Ut4Vj2… Abstrac…    24    10 AR            22        24     8  15.2  6.34
    #>  9 R_45Ut4Vj2… Languag…    25    10 LA            24        26     8  17.8  5.75
    #> 10 R_45Ut4Vj2… Mechani…    18    10 MR            18        19     8  12.9  4.56