Search code examples
r, filter, unique, extract

R: Extract unique data with several conditions


How can I create a new data set that contains one row per unique ID, taken at that ID's maximum time that is ≤ 4 years, together with the status and cancer values recorded at that same time point?

I have data like this: Data example

I want to create a data set like the one in data1: Data that I want to extract

# Example input: one row per (ID, year of follow-up).
#   ID        - subject identifier (factor, levels "1".."3")
#   timeYears - follow-up time in years, stored as a factor so that the
#               solutions below still need as.integer(as.character(...))
#   status    - integer 0/1 status flag at that time point
#   cancer    - cancer code (factor), constant within each ID
# Every column has exactly 18 values (6 rows for ID 1, 4 for ID 2, 8 for ID 3)
# and every factor has unique levels, so the data frame is well-formed.
data <- data.frame(
  ID = factor(rep(c("1", "2", "3"), times = c(6L, 4L, 8L))),
  timeYears = factor(c(0:5, 0:3, 0:7)),
  status = c(0L, 0L, 0L, 0L, 1L, 1L,
             0L, 0L, 0L, 0L,
             0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L),
  cancer = factor(rep(c("1", "2", "1"), times = c(6L, 4L, 8L)))
)
# Desired result: one row per ID, taken at that ID's largest timeYears that is
# <= 4, with the status and cancer values observed at that time point.
# Factors are built with factor() so their levels are unique.
data1 <- data.frame(
  ID = factor(c("1", "2", "3")),
  timeYears = factor(c(4L, 3L, 4L)),
  status = c(1L, 0L, 0L),
  cancer = factor(c("1", "2", "1"))
)

Solution

  • dplyr

    library(dplyr)
    # Keep rows observed at 4 years or earlier, then, within each ID, keep the
    # row(s) with the largest time. timeYears is a factor, so it is converted
    # to integer both for the filter and for the ordering: ranking the factor
    # itself would follow level order, which need not match numeric order
    # (e.g. "10" sorts before "2"). Ties on the maximum time are all kept
    # (slice_max default with_ties = TRUE).
    data %>%
      dplyr::filter(as.integer(as.character(timeYears)) <= 4) %>%
      group_by(ID) %>%
      slice_max(as.integer(as.character(timeYears))) %>%
      ungroup()
    # # A tibble: 3 × 4
    #   ID    timeYears status cancer
    #   <fct> <fct>      <int> <fct> 
    # 1 1     4              1 1     
    # 2 2     3              0 2     
    # 3 3     4              0 1     
    

  • base R

    # Flag, within each ID, the row(s) whose time equals that ID's largest
    # time <= 4, then subset. Groups with no eligible time (or unused ID
    # levels) contribute no rows instead of triggering max()'s empty-input
    # warning, and rows with a missing time are never selected.
    time_years <- as.integer(as.character(data$timeYears))
    is_last_obs <- ave(time_years, data$ID, FUN = function(z) {
      eligible <- z[!is.na(z) & z <= 4L]
      if (length(eligible) == 0L) {
        return(rep(0L, length(z)))
      }
      # %in% never yields NA, so the resulting row mask is strictly TRUE/FALSE
      as.integer(z %in% max(eligible))
    }) > 0L
    data[is_last_obs, ]
    #    ID timeYears status cancer
    # 5   1         4      1      1
    # 10  2         3      0      2
    # 15  3         4      0      1