Search code examples
r dplyr

Create a conditional variable for incomplete data


Attached data:

# Sample data: one subject (id 2) observed at visits 13-17 and 19-23.
# vm (visit number) and GE (0/1 flag) are stored as character strings.
id <- c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2)
vm <- c("13", "14", "15", "16", "17", "19", "20", "21", "22", "23")
GE <- c("0", "0", "0", "0", "0", "0", "1", "0", "1", "0")
fichier <- data.frame(id, vm, GE)

Attached R code:

library(tidyverse)
# Label each row "infection" when GE is 1 at one of the recorded vm values
# (13-17 and 19-23); every other row is "noinfection".
fichier <- fichier %>%
  mutate(
    # Refer to columns directly inside mutate(): `fichier$...` bypasses the
    # data mask and breaks as soon as the pipeline is grouped or filtered.
    statut = case_when(
      vm %in% c(13:17, 19:23) & GE == 1 ~ "infection",
      TRUE ~ "noinfection"
    )
  )

Attached Results:

# Current result: statut is "infection" only on the rows where GE is "1".
id <- c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2)
vm <- c("13", "14", "15", "16", "17", "19", "20", "21", "22", "23")
GE <- c("0", "0", "0", "0", "0", "0", "1", "0", "1", "0")
statut <- c(
  "noinfection", "noinfection", "noinfection", "noinfection", "noinfection",
  "noinfection", "infection", "noinfection", "infection", "noinfection"
)
fichier <- data.frame(id, vm, GE, statut)

I would like to also fill in the "status" variable (named `statut` in the code above) for the two missing visits, vm = 18 and vm = 24.

a) I would like to give this variable at vm = 18

- the value 0 if GE = 0 during the previous visits of 13 to 17.

- the value 1 if GE = 1 during the previous visits of 13 to 17.

b) I would like to give this variable at vm = 24

- the value 0 if GE = 0 during the previous visits of 19 to 23.

- the value 1 if GE = 1 during the previous visits of 19 to 23.


Solution

  • I see two potential scenarios in your description:

    1. "status" is 0 at vm 18 and 1 at vm 24, because at least one previous GE value in the 19–23 window equals 1
    2. "status" is 0 at vm 18 and NA at vm 24, because the previous GE values in the 19–23 window are not all the same

    Either way, I have added solutions for both scenarios.

    Load required packages and your data:

    library(dplyr)
    library(tidyr)
    
    # Sample data from the question: a single subject (id 2) with ten visits;
    # vm and GE are kept as character strings, exactly as posted
    id <- rep(2, 10)
    vm <- as.character(c(13:17, 19:23))
    GE <- c("0", "0", "0", "0", "0", "0", "1", "0", "1", "0")
    
    fichier <- data.frame(id = id, vm = vm, GE = GE)
    

    Scenario 1:

    # Scenario 1: add the visits missing from 13:24 and fill their GE from the
    # other visits of the same id in the same window (vm <= 18 or vm > 18):
    # GE = 0 if the observed values sum to 0, GE = 1 if the sum is positive.
    fichier <- fichier |>
      # vm and GE arrive as character; convert so they can be compared/summed
      mutate(across(c(vm, GE), as.integer)) |>
      # complete() works within each id group, so the rows it adds already
      # carry their id and no separate fill() step is needed
      group_by(id) |> # assuming your data may have multiple id values
      complete(vm = 13:24) |>
      ungroup() |>
      mutate(window = if_else(vm <= 18, 1, 2)) |>
      mutate(
        GE = case_when(
          is.na(GE) & sum(GE, na.rm = TRUE) == 0 ~ 0,
          is.na(GE) & sum(GE, na.rm = TRUE) > 0 ~ 1,
          .default = GE
        ),
        status = if_else(GE == 1, "infection", "noinfection"),
        # per-verb grouping: nothing is left grouped after this step
        .by = c(id, window)
      ) |>
      select(-window)
    
    fichier
    # # A tibble: 12 × 4
    #       id    vm    GE status     
    #    <dbl> <int> <dbl> <chr>      
    #  1     2    13     0 noinfection
    #  2     2    14     0 noinfection
    #  3     2    15     0 noinfection
    #  4     2    16     0 noinfection
    #  5     2    17     0 noinfection
    #  6     2    18     0 noinfection
    #  7     2    19     0 noinfection
    #  8     2    20     1 infection  
    #  9     2    21     0 noinfection
    # 10     2    22     1 infection  
    # 11     2    23     0 noinfection
    # 12     2    24     1 infection
    

    Scenario 2:

    # Scenario 2: add the visits missing from 13:24 and fill their GE only when
    # every observed GE of the same id in the same window (vm <= 18 or vm > 18)
    # agrees; otherwise GE stays NA and status is "mixed".
    # NOTE: start from the original 10-row `fichier` (re-create the sample data
    # first if an earlier step has already overwritten it).
    fichier <- fichier |>
      # vm and GE arrive as character; convert so they can be compared
      mutate(across(c(vm, GE), as.integer)) |>
      # complete() works within each id group, so the rows it adds already
      # carry their id and no separate fill() step is needed
      group_by(id) |> # assuming your data may have multiple id values
      complete(vm = 13:24) |>
      ungroup() |>
      mutate(window = if_else(vm <= 18, 1, 2)) |>
      mutate(
        # TRUE when the window holds exactly one distinct observed GE value
        uniform = n_distinct(GE, na.rm = TRUE) == 1,
        # first non-missing GE, so a window whose first visit is the missing
        # one is still filled
        observed = first(GE, na_rm = TRUE),
        GE = case_when(
          is.na(GE) & uniform & observed == 0 ~ 0,
          is.na(GE) & uniform & observed == 1 ~ 1,
          .default = GE
        ),
        status = case_when(
          GE == 0 ~ "noinfection",
          GE == 1 ~ "infection",
          .default = "mixed"
        ),
        # per-verb grouping: nothing is left grouped after this step
        .by = c(id, window)
      ) |>
      select(-c(window, uniform, observed))
    
    fichier
    # # A tibble: 12 × 4
    #       id    vm    GE status     
    #    <dbl> <int> <dbl> <chr>      
    #  1     2    13     0 noinfection
    #  2     2    14     0 noinfection
    #  3     2    15     0 noinfection
    #  4     2    16     0 noinfection
    #  5     2    17     0 noinfection
    #  6     2    18     0 noinfection
    #  7     2    19     0 noinfection
    #  8     2    20     1 infection  
    #  9     2    21     0 noinfection
    # 10     2    22     1 infection  
    # 11     2    23     0 noinfection
    # 12     2    24    NA mixed