Search code examples
rdataframevectorizationweighted-average

Vectorize weighted mean by list of names in each row of a data.frame


I have two dataframes. I would like to calculate a weighted mean for each row of my results dataframe from the values in my values dataframe. Each row of results has two columns with lists. Every possible combination of the lists is a row in the values dataframe. I am doing this with the code below (two options), which is probably clearer than me trying to explain it. What I would like to know is if and how I can vectorize this (my original results dataframe is very large).

library(dplyr)

a = c('a, b, c', 'a, b', 'c') 
f = c('p, q', 'r', 's, t') 
results <- data.frame(a, f)
# > results
# a    f
# 1 a, b, c p, q
# 2    a, b    r
# 3       c s, t

av = c('a','b','c') 
fv = c('p', 'q', 'r', 's', 't')
values <- expand.grid(av, fv)
values$w <- runif(15)
values$x <- runif(15, min=10, max=100)

# > values
# Var1 Var2          w        x
# 1     a    p 0.10710168 62.58004
# 2     b    p 0.89175147 20.26853
# 3     c    p 0.31489520 85.90532
# 4     a    q 0.07263807 89.02293
# 5     b    q 0.87090293 72.17195
# 6     c    q 0.88818599 48.65717
# 7     a    r 0.54076274 39.46479
# 8     b    r 0.08678314 57.99200
# 9     c    r 0.86298554 77.00845
# 10    a    s 0.41778402 23.35626
# 11    b    s 0.70227865 82.76310
# 12    c    s 0.84415123 65.26321
# 13    a    t 0.50651689 75.52230
# 14    b    t 0.37850063 87.41811
# 15    c    t 0.58515251 96.74228

# Option 1 with apply
calc_wa <- function(as, fs){
  as <- unlist(strsplit(as, ", "))
  fs <- unlist(strsplit(fs, ", "))
  valuestokeep <- values %>% filter(Var1 %in% as, Var2 %in% fs)
  wa_res <- weighted.mean(valuestokeep$x, valuestokeep$w)
  return(wa_res)
}

results$res <- apply(results, 1, function(y) calc_wa(y['a'], y['f'])) 

# Option 2 with mutate
calc_wa2 <- function(as, fs){
  as <- unlist(strsplit(as.character(as), ", "))
  fs <- unlist(strsplit(as.character(fs), ", "))
  valuestokeep <- values %>% filter(Var1 %in% as, Var2 %in% fs)
  wa_res <- weighted.mean(valuestokeep$x, valuestokeep$w)
  return(wa_res)
}
results <- results %>% rowwise() %>% mutate(res2= calc_wa2(a, f))
# > results
# Source: local data frame [3 x 4]
# Groups: <by row>
#   
#   # A tibble: 3 x 4
#   a       f       res  res2
# <fct>   <fct> <dbl> <dbl>
#   1 a, b, c p, q   52.3  52.3
# 2 a, b    r      42.0  42.0
# 3 c       s, t   78.2  78.2

(I am afraid I am missing some basic command, I also have no idea how to title/tag the question - suggestions welcome)


Solution

  • Using data.table instead:

    Setup Data (made some slight variations):

    library(data.table)
    
    set.seed(1) # added for reproducability
    a = c('a, b, c', 'a, b', 'c')
    f = c('p, q', 'r', 's, t')
    results <- data.table(a, f) #slight change
    # > results
    # a    f
    # 1 a, b, c p, q
    # 2    a, b    r
    # 3       c s, t
    
    av = c('a','b','c') 
    fv = c('p', 'q', 'r', 's', 't')
    values <- expand.grid(av = av, fv = fv) #slight change
    values$w <- runif(15)
    values$x <- runif(15, min=10, max=100)
    

    Code:

    results[, rowID := 1:.N] # add ID
    results_expand <- results[, expand.grid(as = trimws(unlist(strsplit(a, ","))),fs = trimws(unlist(strsplit(f, ","))), stringsAsFactors = FALSE), by = .(rowID)] # expand results
    # Alternate: results_expand <- results[, CJ(as = trimws(unlist(strsplit(a, ","))),fs = trimws(unlist(strsplit(f, ",")))), by = .(rowID)] # expand results
    
    results_expand <- merge(results_expand, values, by.x = c("as","fs"), by.y = c("av","fv")) # merge to value table
    results_expand <- results_expand[, .(wm = weighted.mean(x, w)), by = rowID] # calculate weight
    
    results <- merge(results, results_expand, by = "rowID")
    
    results
       rowID       a    f       wm
    1:     1 a, b, c p, q 74.56427
    2:     2    a, b    r 45.37445
    3:     3       c s, t 35.14175
    

    This uses merge and grouping functions in data.table, so should be faster than any looping option.