Search code examples
Tags: r, time, status, longitudinal

In a longitudinal dataset, how to create a status variable 0/1 depending on a second, diagnostic variable that varies with time


I work on sexually transmitted diseases, such as gonorrhoea, and their possible consequences, such as ectopic pregnancy in women. My dataset (df_gono) contains longitudinal epidemiological data by year category, age category, sex and HIV status. It also has a variable, n_gono, that counts (as an integer from 0 to n) the number of gonorrhoea diagnoses per age_cat, year_cat and HIV status. I would like to create a variable, status_gono, that records each patient's "gonorrhoea status": it should take the value 1 as soon as the patient has been diagnosed at least once in a given age_cat and year_cat. In other words, for each patient, from the first row where n_gono is greater than 0, status_gono takes the value 1 for that row and all following rows (and 0 before that). The expected result is shown in df_gono2.

# Example input data: 21 rows for 5 patients (A-E), one patient per line in
# each column below. n_gono is the number of gonorrhoea diagnoses on that row.
df_gono <- data.frame(
  patient = rep(c("A", "B", "C", "D", "E"), times = c(6, 3, 5, 4, 3)),
  sex = rep("Female", times = 21),
  age_current_cat = c(
    5, 5, 6, 6, 6, 7, # A
    7, 8, 8,          # B
    5, 6, 6, 7, 7,    # C
    3, 3, 4, 4,       # D
    6, 7, 7           # E
  ),
  calyear_current_cat = c(
    2, 2, 2, 3, 4, 4, # A
    2, 2, 3,          # B
    2, 2, 3, 3, 4,    # C
    2, 3, 3, 4,       # D
    4, 4, 4           # E
  ),
  age_cat = c(
    "30-34", "30-34", "35-39", "35-39", "35-39", "40-44", # A
    "40-44", "45-49", "45-49",                            # B
    "30-34", "35-39", "35-39", "40-44", "40-44",          # C
    "20-24", "20-24", "25-29", "25-29",                   # D
    "35-39", "40-44", "40-44"                             # E
  ),
  year_cat = c(
    # A
    "2011-2013", "2011-2013", "2011-2013",
    "2014-2017", "2018-2020", "2018-2020",
    # B
    "2011-2013", "2011-2013", "2014-2017",
    # C
    "2011-2013", "2011-2013", "2014-2017", "2014-2017", "2018-2020",
    # D
    "2011-2013", "2014-2017", "2014-2017", "2018-2020",
    # E
    "2018-2020", "2018-2020", "2018-2020"
  ),
  hiv = c(
    0, 1, 1, 1, 1, 1, # A
    0, 0, 0,          # B
    0, 0, 0, 0, 0,    # C
    0, 0, 0, 0,       # D
    0, 0, 1           # E
  ),
  pat_year = c(
    # A
    0.05475702, 2.52635181, 0.41957563, 3.00068446, 1.57973990, 1.91649555,
    # B
    0.08555784, 2.91512663, 2.08076660,
    # C
    0.08418891, 2.91649555, 2.08350445, 0.91718001, 3.49623546,
    # D
    3.00068446, 0.83299110, 2.16769336, 0.99657769,
    # E
    0.25188227, 0.18343600, 2.22861054
  ),
  n_gono = c(
    0, 1, 0, 2, 0, 0, # A
    0, 4, 4,          # B
    0, 0, 0, 0, 0,    # C
    7, 1, 1, 0,       # D
    0, 2, 7           # E
  )
)

# Expected result: the same 21 rows as df_gono plus status_gono, which is 1
# from a patient's first row with n_gono > 0 onwards and 0 before that.
# Each column is laid out one patient (A-E) per line.
df_gono2 <- data.frame(
  patient = rep(c("A", "B", "C", "D", "E"), times = c(6, 3, 5, 4, 3)),
  sex = rep("Female", times = 21),
  age_current_cat = c(
    5, 5, 6, 6, 6, 7, # A
    7, 8, 8,          # B
    5, 6, 6, 7, 7,    # C
    3, 3, 4, 4,       # D
    6, 7, 7           # E
  ),
  calyear_current_cat = c(
    2, 2, 2, 3, 4, 4, # A
    2, 2, 3,          # B
    2, 2, 3, 3, 4,    # C
    2, 3, 3, 4,       # D
    4, 4, 4           # E
  ),
  age_cat = c(
    "30-34", "30-34", "35-39", "35-39", "35-39", "40-44", # A
    "40-44", "45-49", "45-49",                            # B
    "30-34", "35-39", "35-39", "40-44", "40-44",          # C
    "20-24", "20-24", "25-29", "25-29",                   # D
    "35-39", "40-44", "40-44"                             # E
  ),
  year_cat = c(
    # A
    "2011-2013", "2011-2013", "2011-2013",
    "2014-2017", "2018-2020", "2018-2020",
    # B
    "2011-2013", "2011-2013", "2014-2017",
    # C
    "2011-2013", "2011-2013", "2014-2017", "2014-2017", "2018-2020",
    # D
    "2011-2013", "2014-2017", "2014-2017", "2018-2020",
    # E
    "2018-2020", "2018-2020", "2018-2020"
  ),
  hiv = c(
    0, 1, 1, 1, 1, 1, # A
    0, 0, 0,          # B
    0, 0, 0, 0, 0,    # C
    0, 0, 0, 0,       # D
    0, 0, 1           # E
  ),
  pat_year = c(
    # A
    0.05475702, 2.52635181, 0.41957563, 3.00068446, 1.57973990, 1.91649555,
    # B
    0.08555784, 2.91512663, 2.08076660,
    # C
    0.08418891, 2.91649555, 2.08350445, 0.91718001, 3.49623546,
    # D
    3.00068446, 0.83299110, 2.16769336, 0.99657769,
    # E
    0.25188227, 0.18343600, 2.22861054
  ),
  n_gono = c(
    0, 1, 0, 2, 0, 0, # A
    0, 4, 4,          # B
    0, 0, 0, 0, 0,    # C
    7, 1, 1, 0,       # D
    0, 2, 7           # E
  ),
  status_gono = c(
    0, 1, 1, 1, 1, 1, # A
    0, 1, 1,          # B
    0, 0, 0, 0, 0,    # C
    1, 1, 1, 1,       # D
    0, 1, 1           # E
  )
)

Solution

  • Try this solution; you'll need to install the dplyr and tidyr packages:

    library(dplyr)
    library(tidyr)

    # Build status_gono2 and print it next to the expected status_gono.
    # NOTE: fill() carries values down in row order, so this assumes the rows
    # are already in chronological order within each patient.
    df_gono2 |>
      group_by(patient) |>
      # 1 on rows with at least one diagnosis, NA elsewhere so that fill()
      # can carry the 1 forward to the patient's later rows
      mutate(status_gono2 = ifelse(n_gono > 0, 1, NA)) |>
      fill(status_gono2, .direction = "down") |>
      # drop the grouping so later steps are not silently applied per patient
      ungroup() |>
      # rows before a patient's first diagnosis are still NA: set them to 0
      replace_na(list(status_gono2 = 0)) |>
      # name patient explicitly instead of relying on select() re-adding the
      # grouping column with a message
      select(patient, status_gono, status_gono2) |>
      # tibbles with more than 20 rows print only the first 10 by default
      print(n = Inf)
    #> # A tibble: 21 × 3
    #>    patient status_gono status_gono2
    #>    <chr>         <dbl>        <dbl>
    #>  1 A                 0            0
    #>  2 A                 1            1
    #>  3 A                 1            1
    #>  4 A                 1            1
    #>  5 A                 1            1
    #>  6 A                 1            1
    #>  7 B                 0            0
    #>  8 B                 1            1
    #>  9 B                 1            1
    #> 10 C                 0            0
    #> 11 C                 0            0
    #> 12 C                 0            0
    #> 13 C                 0            0
    #> 14 C                 0            0
    #> 15 D                 1            1
    #> 16 D                 1            1
    #> 17 D                 1            1
    #> 18 D                 1            1
    #> 19 E                 0            0
    #> 20 E                 1            1
    #> 21 E                 1            1
    

    Data given:

    # Data used by the pipeline above: 21 rows for 5 patients (A-E), including
    # the expected status_gono column. Each column is laid out one patient
    # per line.
    df_gono2 <- data.frame(
      patient = rep(c("A", "B", "C", "D", "E"), times = c(6, 3, 5, 4, 3)),
      sex = rep("Female", times = 21),
      age_current_cat = c(
        5, 5, 6, 6, 6, 7, # A
        7, 8, 8,          # B
        5, 6, 6, 7, 7,    # C
        3, 3, 4, 4,       # D
        6, 7, 7           # E
      ),
      calyear_current_cat = c(
        2, 2, 2, 3, 4, 4, # A
        2, 2, 3,          # B
        2, 2, 3, 3, 4,    # C
        2, 3, 3, 4,       # D
        4, 4, 4           # E
      ),
      age_cat = c(
        "30-34", "30-34", "35-39", "35-39", "35-39", "40-44", # A
        "40-44", "45-49", "45-49",                            # B
        "30-34", "35-39", "35-39", "40-44", "40-44",          # C
        "20-24", "20-24", "25-29", "25-29",                   # D
        "35-39", "40-44", "40-44"                             # E
      ),
      year_cat = c(
        # A
        "2011-2013", "2011-2013", "2011-2013",
        "2014-2017", "2018-2020", "2018-2020",
        # B
        "2011-2013", "2011-2013", "2014-2017",
        # C
        "2011-2013", "2011-2013", "2014-2017", "2014-2017", "2018-2020",
        # D
        "2011-2013", "2014-2017", "2014-2017", "2018-2020",
        # E
        "2018-2020", "2018-2020", "2018-2020"
      ),
      hiv = c(
        0, 1, 1, 1, 1, 1, # A
        0, 0, 0,          # B
        0, 0, 0, 0, 0,    # C
        0, 0, 0, 0,       # D
        0, 0, 1           # E
      ),
      pat_year = c(
        # A
        0.05475702, 2.52635181, 0.41957563, 3.00068446, 1.57973990, 1.91649555,
        # B
        0.08555784, 2.91512663, 2.08076660,
        # C
        0.08418891, 2.91649555, 2.08350445, 0.91718001, 3.49623546,
        # D
        3.00068446, 0.83299110, 2.16769336, 0.99657769,
        # E
        0.25188227, 0.18343600, 2.22861054
      ),
      n_gono = c(
        0, 1, 0, 2, 0, 0, # A
        0, 4, 4,          # B
        0, 0, 0, 0, 0,    # C
        7, 1, 1, 0,       # D
        0, 2, 7           # E
      ),
      status_gono = c(
        0, 1, 1, 1, 1, 1, # A
        0, 1, 1,          # B
        0, 0, 0, 0, 0,    # C
        1, 1, 1, 1,       # D
        0, 1, 1           # E
      )
    )