Search code examples
rif-statementrow

Create a new factor level (new row) based on data from other rows with conditional statements


df <- data.frame(PatientID = c("0002" ,"0002", "0005", "0005" ,"0009" ,"0009" ,"0018", "0018" ,"0039" ,"0039" , "0043" ,"0043", "0046", "0046" ,"0048" ,"0048"),
                 Timepoint= c("A", "B", "A", "B", "A", "B", "A", "B", "A", "B",  "A", "B",  "A", "B", "A", "B"),
                 A = c(NA , 977.146 , NA , 964.315 ,NA , 952.311 , NA , 950.797 , 947.465 , 902.852 ,  985.124  ,NA , 930.141 ,1007.790 , 1027.110 , 999.414),
                 B = c(998.988 , NA , 998.680 , NA , 1020.560 ,  955.540 , 911.606 , 964.039   ,  988.087 , 902.367 , 959.338 ,1029.050 , 987.374 ,1066.400  ,957.512 , 917.597),
                 C = c( 987.140 , 961.810 , 929.466 , 978.166, 969.469 , 943.398  ,936.034,  965.292 , 996.404 , 920.610 , 967.047, 913.517 , 893.428 , 921.606 , 929.590  ,950.493), 
                 D = c( 961.810 , 929.466 , 978.166, 1005.820 , 925.752 , 969.469  ,943.398 ,  965.292 , 996.404 ,  967.047 ,  NA , 893.428 , 921.606 , 976.192 , 929.590 , 950.493),
                 E = c(1006.330, 1028.070 ,  954.274 ,1005.910  ,949.969 , 992.820 ,934.407 , 948.913 ,    961.375  ,955.296 , 961.128  ,998.119 ,1009.110 , 994.891 ,1000.170  ,982.763),
                 G= c(NA , 958.990 , 924.680 , 955.927 , NA , 949.384  ,973.348 , 984.392 , 943.894 , 961.468 , 995.368 , 994.997 ,  979.454 , 952.605 ,NA , 956.507), stringsAsFactors = F)

Based on this dataframe, I need to create an extra FACTOR level for the variable (df$TimePoint) that will be filled based on the following conditions - we have already factors A and B in that variable so lets say that we want to create factor level X :

  • For df$A. If df$Timepoint B is >999 then the factor X will be filled with the same value as df$Timepoint level B value, otherwise (if it is ≤999) then it will be filled with the value at df$timepoint A.

  • For df$B. If df$Timepoint B is >986, factor X will be == as df$Timepoint level B value, otherwise X will == df$timepoint A.

  • For df$C. If df$Timepoint B is >1000, factor X will be == as df$Timepoint level B value, otherwise X will == df$timepoint A.

  • For df$D. If df$Timepoint B is >1030, factor X will be == as df$Timepoint level B value, otherwise X will == df$timepoint A.

  • For df$E. If df$Timepoint B is >800, factor X will be == as df$Timepoint level B value, otherwise X will == df$timepoint A.

  • For df$G. If df$Timepoint B is >950, factor X will be == as df$Timepoint level B value, otherwise X will == df$timepoint A.

The new dataframe would look like this:

enter image description here

Thanks in advance! Best


Solution

  • Here is one tidyverse approach.

    library(tidyverse)
    
    df %>% 
      pivot_wider(names_from = Timepoint,
                  values_from = A:G
                  ) %>% 
      mutate(A_X = ifelse(A_B > 999, A_B, A_A),
             B_X = ifelse(B_B > 986, B_B, B_A),
             C_X = ifelse(C_B > 1000, C_B, C_A),
             D_X = ifelse(D_B > 1030, D_B, D_A),
             E_X = ifelse(E_B > 800, E_B, E_A),
             G_X = ifelse(G_B > 950, G_B, G_A)) %>% 
      pivot_longer(cols = A_A:G_X,
                   names_sep = "_",
                   names_to = c("cat","Timepoint")) %>%
      pivot_wider(names_from = cat,
                  values_from = value)
    
    #> # A tibble: 24 x 8
    #>    PatientID Timepoint     A     B     C     D     E     G
    #>    <chr>     <chr>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
    #>  1 0002      A           NA   999.  987.  962. 1006.   NA 
    #>  2 0002      B          977.   NA   962.  929. 1028.  959.
    #>  3 0002      X           NA    NA   987.  962. 1028.  959.
    #>  4 0005      A           NA   999.  929.  978.  954.  925.
    #>  5 0005      B          964.   NA   978. 1006. 1006.  956.
    #>  6 0005      X           NA    NA   929.  978. 1006.  956.
    #>  7 0009      A           NA  1021.  969.  926.  950.   NA 
    #>  8 0009      B          952.  956.  943.  969.  993.  949.
    #>  9 0009      X           NA  1021.  969.  926.  993.   NA 
    #> 10 0018      A           NA   912.  936.  943.  934.  973.
    #> 11 0018      B          951.  964.  965.  965.  949.  984.
    #> 12 0018      X           NA   912.  936.  943.  949.  984.
    #> 13 0039      A          947.  988.  996.  996.  961.  944.
    #> 14 0039      B          903.  902.  921.  967.  955.  961.
    #> 15 0039      X          947.  988.  996.  996.  955.  961.
    #> 16 0043      A          985.  959.  967.   NA   961.  995.
    #> 17 0043      B           NA  1029.  914.  893.  998.  995.
    #> 18 0043      X           NA  1029.  967.   NA   998.  995.
    #> 19 0046      A          930.  987.  893.  922. 1009.  979.
    #> 20 0046      B         1008. 1066.  922.  976.  995.  953.
    #> 21 0046      X         1008. 1066.  893.  922.  995.  953.
    #> 22 0048      A         1027.  958.  930.  930. 1000.   NA 
    #> 23 0048      B          999.  918.  950.  950.  983.  957.
    #> 24 0048      X          999.  958.  930.  930.  983.  957.
    

    Created on 2021-07-29 by the reprex package (v0.3.0)

    And here is an alternative using group_modify and add_row:

    library(tidyverse)
    
    df %>%
      group_by(PatientID) %>% 
      group_modify(.f =
                     
        ~ {
          df_b <- .x[.x$Timepoint == "B",]
          df_a <- .x[.x$Timepoint == "A",]
          
          .x %>% add_row(Timepoint = "X",
                         A = ifelse(df_b$A >  999, df_b$A, df_a$A),
                         B = ifelse(df_b$B >  986, df_b$B, df_a$B),
                         C = ifelse(df_b$C > 1000, df_b$C, df_a$C),
                         D = ifelse(df_b$D > 1030, df_b$D, df_a$D),
                         E = ifelse(df_b$E >  800, df_b$E, df_a$E),
                         G = ifelse(df_b$G >  950, df_b$G, df_a$G)
                         )
        })
    
    #> # A tibble: 24 x 8
    #> # Groups:   PatientID [8]
    #>    PatientID Timepoint     A     B     C     D     E     G
    #>    <chr>     <chr>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
    #>  1 0002      A           NA   999.  987.  962. 1006.   NA 
    #>  2 0002      B          977.   NA   962.  929. 1028.  959.
    #>  3 0002      X           NA    NA   987.  962. 1028.  959.
    #>  4 0005      A           NA   999.  929.  978.  954.  925.
    #>  5 0005      B          964.   NA   978. 1006. 1006.  956.
    #>  6 0005      X           NA    NA   929.  978. 1006.  956.
    #>  7 0009      A           NA  1021.  969.  926.  950.   NA 
    #>  8 0009      B          952.  956.  943.  969.  993.  949.
    #>  9 0009      X           NA  1021.  969.  926.  993.   NA 
    #> 10 0018      A           NA   912.  936.  943.  934.  973.
    #> # ... with 14 more rows
    

    Created on 2021-07-29 by the reprex package (v0.3.0)