Search code examples
r list select tidyverse purrr

Writing the test condition in a map_if call: apply a function to all data frames whose ID column includes specific values


Once again I'm struggling with purrr's map functions.

I've got a list of dataframes, all with ID and Name columns.

I want to perform some recoding and then an aggregation on rows with some specific values. For that purpose, I've got another dataframe with a vector of ID and the new ID (IDagr) that should replace it before doing the aggregation (summing all numeric variables).

I know how to perform this on one df (see II/), but I don't know what test to write in a map_if function to apply those operations to all dataframes whose ID column includes some values of newIDdf$ID (here dataframes B and C).

Any ideas ?

## I/ Input objects: a list of data frames and a recoding table
 # Named list of data frames. Each one has ID and Name columns followed by
 # numeric columns drawn with rnorm(), so the values differ on every run.
 list_df <- list()
 list_df[["A"]] <- data.frame(
   ID   = c("a", "b", "c", "Z", "Y"),
   Name = c("a_name", "b_name", "c_name", "Z_name", "Y_name"),
   Var1 = rnorm(5),
   Var2 = rnorm(5),
   Var3 = rnorm(5)
 )
 list_df[["B"]] <- data.frame(
   ID   = c("a", "b", "z1", "z2", "z3"),
   Name = c("a_name", "b_name", "z1_name", "z2_name", "z3_name"),
   Var1 = rnorm(5),
   Var2 = rnorm(5)
 )
 list_df[["C"]] <- data.frame(
   ID   = c("y1", "y2", "z1", "z2", "z3"),
   Name = c("y1_name", "y2_name", "z1_name", "z2_name", "z3_name"),
   Var1 = rnorm(5),
   Var2 = rnorm(5)
 )

 # Correspondence table used for the aggregation: each detailed ID is mapped
 # to the aggregated ID (IDagr) that replaces it.
 newIDdf <- data.frame(
   ID    = c("y1", "y2", "z1", "z2", "z3"),
   IDagr = rep(c("Y", "Z"), times = c(2, 3))
 )
 
 ## II/ The target transformation, shown on a single data frame
 # Worked example on element "B" only.
 #
 # Steps: (1) replace every detailed ID by its aggregated ID, applying the
 # newIDdf$ID -> newIDdf$IDagr replacements one after another;
 # (2) rename the aggregated rows; (3) sum the numeric columns per ID;
 # (4) keep one row per ID.
 # NOTE(review): the lambda wraps each sum in a one-element list, so the
 # summed columns may come back as list-columns -- confirm this is intended.
 On1df <- list_df[["B"]] %>%
   mutate(
     ID = reduce2(newIDdf$ID, newIDdf$IDagr, str_replace, .init = ID),
     # Evaluated after ID above, so it already sees the recoded IDs.
     Name = case_when(
       ID == "Z" ~ "Z_name",
       ID == "Y" ~ "Y_name",
       TRUE ~ Name
     )
   ) %>%
   group_by(ID) %>%
   mutate_if(is.numeric, ~list(. = sum(.))) %>%
   distinct(ID, .keep_all = TRUE)

## III/ What I really want to achieve
 # Goal: run the same steps on data frames B and C in a single pass,
 # i.e. apply those operations only to the data frames
# whose ID column includes some values of newIDdf$ID.
 # NOTE(review): inside the `.p` formula the current list element is `.x`;
 # the bare `ID` below is not looked up as a column of that element, which
 # is presumably why the test fails (compare with `.x$ID`) -- confirm.
 
 list_df_output <- list_df %>% map_if( .p = ~ any(ID %in% newIDdf$ID), ### which test should go here? (this one doesn't work)
                                       ~ mutate(.x, ID = reduce2(newIDdf$ID, newIDdf$IDagr, 
                                                             .init= ID, 
                                                             str_replace)) %>%
                                         mutate(.,Name = case_when(ID == "Z" ~ "Z_name",
                                                                 ID == "Y" ~ "Y_name",
                                                                 TRUE ~ Name)) %>%
                                         group_by(., ID) %>% 
                                         mutate_if(., is.numeric, ~list(. = sum(.))) %>% 
                                         distinct(., ID, .keep_all = TRUE) )


Solution

  • Do you want this? I also changed your mutate_if call to the more recent version using across() and where():

      # Recode and aggregate only the data frames whose ID column contains at
      # least one ID listed in newIDdf$ID; every other element of list_df is
      # returned unchanged.
      map_if(
        list_df,
        # Predicate: does this data frame hold any ID that must be recoded?
        .p = \(df) any(df$ID %in% newIDdf$ID),
        .f = \(df) {
          df |>
            mutate(
              # Apply each newIDdf$ID -> newIDdf$IDagr replacement in turn.
              ID = reduce2(newIDdf$ID, newIDdf$IDagr, str_replace, .init = ID),
              # Evaluated after ID above, so it already sees the recoded IDs.
              Name = case_when(
                ID == "Z" ~ "Z_name",
                ID == "Y" ~ "Y_name",
                TRUE ~ Name
              )
            ) |>
            group_by(ID) |>
            # Replace every numeric column by its per-ID total ...
            mutate(across(where(is.numeric), \(col) sum(col))) |>
            # ... then keep a single row per ID (the result stays grouped).
            distinct(ID, .keep_all = TRUE)
        }
      )
    

    Output:

    $A
      ID   Name       Var1       Var2       Var3
    1  a a_name  0.1015844  0.6306434  0.5058593
    2  b b_name -0.1420690  0.5152645  0.2497879
    3  c c_name  0.5841423  1.2883330  0.5297098
    4  Z Z_name  1.6645565  0.2307524 -1.0418045
    5  Y Y_name -0.1293767 -2.4152871 -0.1935843
    
    $B
    # A tibble: 3 × 4
    # Groups:   ID [3]
      ID    Name     Var1   Var2
      <chr> <chr>   <dbl>  <dbl>
    1 a     a_name -0.512 -0.119
    2 b     b_name -2.14  -0.834
    3 Z     Z_name  0.468  2.54 
    
    $C
    # A tibble: 2 × 4
    # Groups:   ID [2]
      ID    Name    Var1  Var2
      <chr> <chr>  <dbl> <dbl>
    1 Y     Y_name 1.15  0.162
    2 Z     Z_name 0.790 2.03