Search code examples
rcriteriarules

How to create new dataset based on existing dataset


I have the following dataset:

individual    sequence_special_drug     all_drugs
A             NA                        A
A             NA                        B
A             1                         C
A             2                         D
A             NA                        B
A             NA                        Z
A             2                         D
A             NA                        Z
A             2                         D
A             NA                        A
A             3                         E

I would like to create the following dataset in R:

Individual  sequence_special_drug  special_drug  prior_special_drug  prior_traditional_drug   during_special_drug
A           1                      C             none                A, B                     none
A           2                      D             C                   none                     B, Z, Z
A           3                      E             C, D                A, B, B, Z, Z, A         none

Is there a quick way to do this? I have many individuals, but these are all the possible scenarios. A special_drug is identified by the sequence number; those with 'NA' are a traditional_drug.

prior_special_drug will contain any special_drug previously identified, so for the first special_drug C there is no previous special_drug, for the second special_drug D, there is one previous special_drug that is C, and for the third special_drug there are two previous special_drugs C and D.

prior_traditional_drug is the same but will contain anything that has been identified in sequence_special_drug as NA. So for the first special_drug (C), the two prior_traditional_drugs are A and B. For the third special_drug, the prior_traditional_drugs are A, B, B, Z, Z, A.

during_special_drug will contain every traditional_drug that have been referenced during the administration of special_drug. This can be identified in the dataset through the repetition of sequence_special_drug (e.g. 2 -> NA NA -> 2 -> NA -> 2) therefore B, Z, Z.

EDIT - For 2 individuals:

dat <- read.table(
  text = "
  individual    sequence_special_drug     all_drugs
A             NA                        A
A             NA                        B
A             1                         C
A             2                         D
A             NA                        B
A             NA                        Z
A             2                         D
A             NA                        Z
A             2                         D
A             NA                        A
A             3                         E
B             1                         D
B             NA                        B
B             NA                        Z
B             1                         D
B             NA                        Z
B             1                         D
B             NA                        A
B             2                         E",
  header = TRUE)

I would expect:

- WRONG "none" line 3 under prior_traditional_drug -

Individual  sequence_special_drug  special_drug  prior_special_drug  prior_traditional_drug   during_special_drug
A           1                      C             none                A, B                     none
A           2                      D             C                   none                     B, Z, Z
A           3                      E             C, D                A, B, B, Z, Z, A         none
B           1                      D             none                none                     B, Z, Z
B           2                      E             D                   B, Z, Z, A               none

- RIGHT "A, B" line 3 under prior_traditional_drug -

Individual  sequence_special_drug  special_drug  prior_special_drug  prior_traditional_drug   during_special_drug
A           1                      C             none                A, B                     none
A           2                      D             C                   A, B                     B, Z, Z
A           3                      E             C, D                A, B, B, Z, Z, A         none
B           1                      D             none                none                     B, Z, Z
B           2                      E             D                   B, Z, Z, A               none

But I obtained: enter image description here

Error message with my own dataset

> special_drug <- example_data %>% 
+   nest_by(individual) %>% 
+   mutate(
+     spec_drug = list(get_all_drugs(data))
+   ) %>% 
+   unnest(spec_drug) %>%
+   select(-data) %>% 
+   ungroup()
`summarise()` has grouped output by 'sequence_special_drug'. You can override using the `.groups` argument.
 Error: Problem with `mutate()` input `spec_drug`.
x Problem with `mutate()` input `flag3`.
x `false` must be a list, not a character vector.
ℹ Input `flag3` is `if_else(flag1 == 1, list(character(0)), flag3)`.
ℹ Input `spec_drug` is `list(get_all_drugs(data))`.
Run `rlang::last_error()` to see where the error occurred.

 > rlang::last_error()
Error in is_rlang_error(parent) : 
  argument "parent" is missing, with no default

My own dataset is more like this:

example_data <- read.table(
  text = "
  individual    sequence_special_drug     all_drugs
77779             NA                      Name1
77779             1                       Name2
77779             1                       Name2
77779             1                       Name2
77779             2                       Name3
4444              NA                      Name1
4444              1                       Name4
4444              2                       Name3
4444              3                       Name7",
  header = TRUE)

But the dataset below also generates the same error message:

example_data <- read.table(
  text = "
  individual    sequence_special_drug     all_drugs
A               NA                        A
A               1                         C
A               2                         D
A               2                         D
A               2                         D
A               3                         E
B               NA                        B
B               1                         D
B               2                         E
B               3                         F",
  header = TRUE)

Solution

  • Here is my suggestion using {tidyverse}. I wrote a function to get each column and then put them together in get_all_drugs(). Then, I ran the function through the nested data by individual, as in the example below.

    library(tidyverse)
    
    example_data <- read.table(
      text = "
      individual    sequence_special_drug     all_drugs
    A             NA                        A
    A             NA                        B
    A             1                         C
    A             2                         D
    A             NA                        B
    A             NA                        Z
    A             2                         D
    A             NA                        Z
    A             2                         D
    A             NA                        A
    A             3                         E
    B             1                         D
    B             NA                        B
    B             NA                        Z
    B             1                         D
    B             NA                        Z
    B             1                         D
    B             NA                        A
    B             2                         E",
    header = TRUE)
     
    get_special_drugs <- function(.data) {
      .data %>% 
        filter(sequence_special_drug != 0) %>% 
        distinct() %>% 
        select(sequence_special_drug, special_drug = all_drugs) %>% 
        mutate(prior_special_drug = as.list(accumulate(special_drug, c))) %>% 
        rowwise() %>% 
        mutate(prior_special_drug = list(
          prior_special_drug[prior_special_drug != special_drug]
        )) %>% 
        ungroup()
    }
    
    fix_drug_sequence <- function(.data) {
      .data %>% 
        mutate(
          seq_drug = replace_na(sequence_special_drug, 0),
          flag = if_else(seq_drug == 0 & seq_drug != lead(seq_drug),
                         lead(seq_drug),
                         seq_drug),
          flag = if_else(flag == 0 & flag != lead(flag),
                         lead(flag),
                         flag)
        ) %>% 
        select(-sequence_special_drug) %>% 
        rename(sequence_special_drug = flag)
    }
    
    get_prior_traditional_drug <- function(...) {
      fix_drug_sequence(...) %>% 
        group_by(sequence_special_drug) %>% 
        mutate(
          flag1 = max(seq_drug == sequence_special_drug & row_number() == 1),
        ) %>% 
        group_by(sequence_special_drug, flag1) %>% 
        summarise(
          flag2 = list(all_drugs[seq_drug == 0])
        ) %>% 
        ungroup() %>% 
        mutate(
          flag3 = as.list(accumulate(flag2, append)),
          flag3 = if_else(flag1 == 1, lag(flag3), flag3)
        ) %>% 
        select(sequence_special_drug, prior_traditional_drug = flag3)
    }
    
    get_during_special_drugs <- function(...) {
      fix_drug_sequence(...) %>% 
        group_by(sequence_special_drug) %>% 
        mutate(
          flag = cumsum(seq_drug == sequence_special_drug)
        ) %>% 
        filter(flag > 0) %>% 
        summarise(
          during_special_drug = list(all_drugs[seq_drug == 0])
        )
    }
    
    get_all_drugs <- function(.data) {
      spec_drug <- get_special_drugs(.data)
      prior_traditional <- get_prior_traditional_drug(.data)
      during_spec <- get_during_special_drugs(.data)
      
      list(spec_drug, prior_traditional, during_spec) %>% 
        reduce(left_join, by = "sequence_special_drug")
    }
    
    special_drug <- example_data %>% 
      nest_by(individual) %>% 
      mutate(
        spec_drug = list(get_all_drugs(data))
      ) %>% 
      unnest(spec_drug) %>%
      select(-data) %>% 
      ungroup()
    
    special_drug
    

    enter image description here