I've got a list of df , with different variables.
At the end, I want to get a synthesis dataframe, from a sub-selection of dataframes and variables, joined into one. I have another df with the list of variables of interest and the corresponding tables.
Of course, to make the selection on a simple df, it would be enough to make a select(colnames(.) %in% VarSel$Var) and do it for all df and then make join... but, for now, I can't do that directly in purrr on the list of selected df.
Do you have any idea how to do this properly?
# reprex
## a list of df
list_df <- list(A = data.frame(ID = letters[1:10],
Var1 = rnorm(10),
Var2 = rnorm(10),
Var3 = rnorm(10)),
B = data.frame(ID = letters[1:10],
Var1 = rnorm(10),
VarY = rnorm(10),
VarX = rnorm(10)),
D = data.frame(ID = letters[5:14],
Var1 = rnorm(10),
VarZ = rnorm(10),
Var3 = rnorm(10)),
E = data.frame(ID = letters[3:12],
Var1 = rnorm(10),
VarK = rnorm(10),
Var3 = rnorm(10)))
## a df for variables & df selection
VarSel <- data.frame(Var = c("ID", "VarZ", "Var3", "Var3", "VarK"),
Df = c("all", "D", "A", "D", "E"))
# the 3 operations that I want to perform (I simply managed the first one)
list_df %>%
keep(names(.) %in% unique(VarSel$Df))# keep only df named in VarSel
# then select matching variable names for thoses DF (not forgetting the ID one)
# finally produce a synthesis Df joining A & D and variables of interest (ID, VarZ, Var3, Var3)
# desired output
output <- full_join(list_df[["A"]], list_df[["D"]], by = "ID") %>%
full_join(.,list_df[["E"]], by= "ID") %>%
select(ID, VarZ, Var3_A = Var3.x, Var3_D = Var3.y, VarK)
You could utilize this solution for using the right dataframes names as suffix while joining, e.g.
rmSuffix <- function(x) sub("\\_[^.]*", "", x)
uniq_stem <- function(x) x[ave(x, rmSuffix(x), FUN = length) == 1]
keep_list <- list_df %>%
keep(names(.) %in% unique(VarSel$Df))
map2(keep_list, names(keep_list), ~ rename_with(.x, function(x) paste(x, .y, sep = "_"), -ID)) |>
reduce(full_join, by = "ID") |>
select(contains(VarSel$Var)) %>%
rename_with(rmSuffix, .cols = uniq_stem(names(.))) |>
select(-Var3_E)
Output:
ID VarZ Var3_A Var3_D VarK
1 a NA 0.07013416 NA NA
2 b NA -0.20262710 NA NA
3 c NA -0.12263637 NA 0.905559370
4 d NA -0.24115062 NA -0.220275164
5 e -0.3153504 -1.71601081 0.39305203 0.199025384
6 f 1.0408308 1.63679816 0.52372235 0.977966335
7 g -1.2133189 -1.64706077 0.04712189 -1.415394796
8 h -1.4864888 1.08853254 -0.67286743 -0.920097871
9 i 1.3551027 0.57154103 -0.97045208 -0.750717887
10 j 1.1669465 1.18983409 -0.42142208 -0.873252568
11 k 0.1631669 NA -1.01488606 -0.674366819
12 l 0.2537215 NA -0.13341806 0.005410001
13 m -1.0206924 NA 0.75391722 NA
14 n -0.2618105 NA -0.14547705 NA