Search code examples
rjoindplyrdata-manipulation

R: Random Sampling of Longitudinal Data


I have the following dataset in R (e.g. the same students take an exam each year and their results are recorded):

student_id = c(1,1,1,1,1, 2,2,2, 3,3,3,3)
exam_number = c(1,2,3,4,5,1,2,3,1,2,3,4)
exam_result = rnorm(12, 80,10)
my_data = data.frame(student_id, exam_number, exam_result)

  student_id exam_number exam_result
1          1           1    72.79595
2          1           2    81.12950
3          1           3    93.29906
4          1           4    79.33229
5          1           5    76.64106
6          2           1    95.14271

Suppose I take a random sample from this data:

library(dplyr)
random_sample = sample_n(my_data, 5, replace = TRUE)

  student_id exam_number exam_result
1          3           1    76.19691
2          3           3    87.52431
3          2           2    91.89661
4          2           3    80.05088
5          2           2    91.89661

Now, I can take the highest "exam_number" per student from this random sample:

max_value = random_sample %>%
    group_by(student_id) %>%
    summarize(max = max(exam_number))

# A tibble: 2 x 2
  student_id   max
       <dbl> <dbl>
1          2     3
2          3     3

Based on these results - I want to accomplish the following. For the students that were selected in "random_sample":

  • Create a dataset that contains all rows occurring AFTER the "max exam number" (e.g. call this dataset "data_after")
  • Create a dataset that contains all rows occurring BEFORE (and equal to) the "max exam number" (e.g. call this dataset "data_before")

In the example I have created, this would look something like this:

# after
  student_id exam_number exam_result
1          3           4    105.5805

# before

  student_id exam_number exam_result
1          2           1    95.14000
2          2           2    91.89000
3          2           3    80.05000
4          3           1    76.19691
5          3           2   102.00875
6          3           3    87.52431

Currently, I am trying to do this in a very indirect way using JOINS and ANTI_JOINS:

    max_3 = as.numeric(max_value[2,2])
    max_s3 =  max_3 - 1
    student_3 = seq(1, max_s3 , by = 1)

    before_student_3 =  my_data[is.element(my_data$exam_number, student_3) & my_data$student_id == 3,]

remainder_student_3 = my_data[my_data$student_id == 3,]
after_student_3 =  anti_join(remainder_student_3, before_student_3)

But I don't think I am doing this correctly - can someone please show me how to do this?

Thanks!


Solution

  • The code above also uses a join, like it is said in the question. Then, the wanted data sets are created by filtering the join result.

    student_id = c(1,1,1,1,1, 2,2,2, 3,3,3,3)
    exam_number = c(1,2,3,4,5,1,2,3,1,2,3,4)
    exam_result = rnorm(12, 80,10)
    my_data = data.frame(student_id, exam_number, exam_result)
    
    suppressPackageStartupMessages({
      library(dplyr)
    })
    
    set.seed(2022)
    (random_sample = sample_n(my_data, 5, replace = TRUE))
    #>   student_id exam_number exam_result
    #> 1          1           4    73.97148
    #> 2          1           3    84.77151
    #> 3          2           2    78.76927
    #> 4          3           3    69.35063
    #> 5          1           4    73.97148
    
    
    max_value = random_sample %>%
      group_by(student_id) %>%
      summarize(max = max(exam_number))
    
    # join only once
    max_value %>%
      left_join(my_data, by = "student_id") -> join_data
    join_data
    #> # A tibble: 12 × 4
    #>    student_id   max exam_number exam_result
    #>         <dbl> <dbl>       <dbl>       <dbl>
    #>  1          1     4           1        71.0
    #>  2          1     4           2        69.1
    #>  3          1     4           3        84.8
    #>  4          1     4           4        74.0
    #>  5          1     4           5        80.7
    #>  6          2     2           1        77.4
    #>  7          2     2           2        78.8
    #>  8          2     2           3        69.5
    #>  9          3     3           1        83.9
    #> 10          3     3           2        62.7
    #> 11          3     3           3        69.4
    #> 12          3     3           4       102.
    
    data_before <- join_data %>%
      group_by(student_id) %>%
      filter(exam_number <= max) %>%
      ungroup() %>%
      select(-max)
    
    data_after <- join_data %>%
      group_by(student_id) %>%
      filter(exam_number > max) %>%
      ungroup() %>%
      select(-max)
    
    data_before
    #> # A tibble: 9 × 3
    #>   student_id exam_number exam_result
    #>        <dbl>       <dbl>       <dbl>
    #> 1          1           1        71.0
    #> 2          1           2        69.1
    #> 3          1           3        84.8
    #> 4          1           4        74.0
    #> 5          2           1        77.4
    #> 6          2           2        78.8
    #> 7          3           1        83.9
    #> 8          3           2        62.7
    #> 9          3           3        69.4
    
    data_after
    #> # A tibble: 3 × 3
    #>   student_id exam_number exam_result
    #>        <dbl>       <dbl>       <dbl>
    #> 1          1           5        80.7
    #> 2          2           3        69.5
    #> 3          3           4       102.
    
    # final clean-up
    rm(join_data)
    

    Created on 2022-12-10 with reprex v2.0.2