I am attempting to modify my code below to a single pipeline using dplyr
. I am calculating a weighted mean across two columns per year where each year has a different weighting. How can this been performed without explicitly splitting the dataset and then combining the results? Thanks
library(tidyverse)
seet.seed = 1
df = data.frame(yr = sample(c(2022,2023,2024), size =20,replace = TRUE),
col_1 = rnorm(20),
col_2 = rnorm(20),
col_3 = rnorm(20))
dfList = split(df,df$yr)
selected_cols = c("col_1","col_2")
df2022 = dfList[[1]] %>% rowwise() %>%
mutate( mean_wt = weighted.mean(c_across(all_of(selected_cols )),c(0.40,0.60)))%>%
ungroup()
df2023 = dfList[[2]] %>% rowwise() %>%
mutate( mean_wt = weighted.mean(c_across(all_of(selected_cols )),c(0.70,0.30)))%>%
ungroup()
df2024 = dfList[[3]] %>% rowwise() %>%
mutate( mean_wt = weighted.mean(c_across(all_of(selected_cols )),c(0.50,0.50)))%>%
ungroup()
newDF = bind_rows(list(df2022 ,df2023,df2024 ))
library(dplyr)
yr_wt = list("2022" = c(.4, .6), "2023" = c(.7, .3), "2024" = c(.5, .5))
df |>
rowwise() |>
mutate(mean_wt = weighted.mean(c_across(col_1:col_2), yr_wt[[as.character(yr)]])) |>
ungroup()
rowwise()
can be a very slow operation on even moderately sized data frames. Here is a faster option still using the yr_wt
list for lookup:
df |>
mutate(mean_wt = rowSums(as.matrix(pick(col_1:col_2)) %*% diag(yr_wt[[as.character(pull(cur_group()))]])),
.by = yr)