Search code examples
rdplyrzoo

Why is my dplyr code to create multiple variables using mutate and zoo incredibly slow?


I am using dplyr to create multiple variables in my data frame using mutate. At the same time, I am using zoo to calculate a rolling average. As an example, I have my variables set up like so:

vars <- "total_apples", "total_oranges", "total_bananas"

My data has over 100 variables and approx. 40,000 lines, but the above is just an example.

I am using this code below:

library(dplyr)
library(zoo)
data <- data %>%
  group_by(fruit) %>%
  mutate(across(c(all_of(vars)), list(avge_last2 = ~ zoo::rollapplyr(., 2, FUN = mean, partial = TRUE))))

Just for the above to calculate the average over the last 2 records, it takes over 5 mins:

> end.time <- Sys.time()
> time.taken <- end.time - start.time
> time.taken
Time difference of 5.925337 mins

It takes even longer if I want to average over more records, say n= 10 like so:

library(dplyr)
library(zoo)
data <- data %>%
  group_by(fruit) %>%
  mutate(across(c(all_of(vars)), list(avge_last2 = ~ zoo::rollapplyr(., 10, FUN = mean, partial = TRUE))))

Is there an issue with my code or is it something else?

dput(head(data,20)) provides the following:

structure(list(match_id = c(14581L, 14581L, 14581L, 14581L, 14581L, 
14581L, 14581L, 14581L, 14581L, 14581L, 14581L, 14581L, 14581L, 
14581L, 14581L, 14581L, 14581L, 14581L, 14581L, 14581L), match_date = structure(c(16527, 
16527, 16527, 16527, 16527, 16527, 16527, 16527, 16527, 16527, 
16527, 16527, 16527, 16527, 16527, 16527, 16527, 16527, 16527, 
16527), class = "Date"), season = c(2015, 2015, 2015, 2015, 2015, 
2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 
2015, 2015, 2015, 2015), match_round = c(1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), home_team = c(3, 3, 3, 
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3), away_team = c(14, 
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 
14, 14, 14), venue = c(11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 
11, 11, 11, 11, 11, 11, 11, 11, 11, 11), venue_name = c("MCG", 
"MCG", "MCG", "MCG", "MCG", "MCG", "MCG", "MCG", "MCG", "MCG", 
"MCG", "MCG", "MCG", "MCG", "MCG", "MCG", "MCG", "MCG", "MCG", 
"MCG"), opponent = c(14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 
14, 14, 14, 14, 14, 14, 14, 14, 14, 14), player_id = c(11186L, 
11215L, 11285L, 11330L, 11380L, 11388L, 11407L, 11472L, 11473L, 
11490L, 11553L, 11561L, 11573L, 11582L, 11598L, 11601L, 11616L, 
11643L, 11671L, 11737L), player_first_name = c("Chris", "Chris", 
"Kade", "Troy", "Andrew", "Brett", "Cameron", "Marc", "Dale", 
"Ivan", "Bryce", "Shane", "Bachar", "Jack", "Andrejs", "Shaun", 
"Michael", "Lachie", "Trent", "Alex"), player_last_name = c("Judd", 
"Newman", "Simpson", "Chaplin", "Carrazzo", "Deledio", "Wood", 
"Murphy", "Thomas", "Maric", "Gibbs", "Edwards", "Houli", "Riewoldt", 
"Everitt", "Grigg", "Jamison", "Henderson", "Cotchin", "Rance"
), player_team = c("Carlton", "Richmond", "Carlton", "Richmond", 
"Carlton", "Richmond", "Carlton", "Carlton", "Carlton", "Richmond", 
"Carlton", "Richmond", "Richmond", "Richmond", "Carlton", "Richmond", 
"Carlton", "Carlton", "Richmond", "Richmond"), player_team_numeric = c(3, 
14, 3, 14, 3, 14, 3, 3, 3, 14, 3, 14, 14, 14, 3, 14, 3, 3, 14, 
14), guernsey_number = c(5L, 1L, 6L, 25L, 44L, 3L, 36L, 3L, 39L, 
20L, 4L, 10L, 14L, 8L, 33L, 6L, 40L, 23L, 9L, 18L), player_position = c(3, 
14, 14, 1, 17, 13, 16, 12, 20, 16, 14, 5, 10, 8, 13, 14, 6, 7, 
3, 2), disposals = c(21L, 7L, 21L, 13L, 18L, 18L, 11L, 21L, 1L, 
13L, 26L, 21L, 21L, 17L, 18L, 17L, 8L, 10L, 17L, 18L), kicks = c(16L, 
6L, 13L, 9L, 9L, 9L, 8L, 9L, 1L, 8L, 15L, 9L, 15L, 13L, 14L, 
9L, 4L, 9L, 6L, 9L), marks = c(5L, 1L, 8L, 1L, 2L, 3L, 2L, 2L, 
0L, 4L, 4L, 1L, 5L, 8L, 8L, 4L, 2L, 6L, 3L, 4L), handballs = c(5L, 
1L, 8L, 4L, 9L, 9L, 3L, 12L, 0L, 5L, 11L, 12L, 6L, 4L, 4L, 8L, 
4L, 1L, 11L, 9L), tackles = c(6L, 1L, 2L, 2L, 2L, 0L, 1L, 2L, 
0L, 4L, 4L, 3L, 1L, 0L, 2L, 2L, 1L, 2L, 1L, 0L), clearances = c(6L, 
0L, 0L, 0L, 6L, 1L, 6L, 4L, 0L, 4L, 4L, 7L, 0L, 0L, 1L, 3L, 0L, 
0L, 1L, 1L), brownlow_votes = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), effective_disposals = c(15L, 
6L, 16L, 11L, 16L, 13L, 6L, 14L, 1L, 11L, 13L, 16L, 16L, 10L, 
14L, 12L, 5L, 6L, 9L, 17L), disposal_efficiency_percentage = c(71L, 
86L, 76L, 85L, 89L, 72L, 55L, 67L, 100L, 85L, 50L, 76L, 76L, 
59L, 78L, 71L, 63L, 60L, 53L, 94L), contested_possessions = c(11L, 
3L, 5L, 7L, 9L, 6L, 7L, 9L, 1L, 9L, 9L, 15L, 1L, 7L, 3L, 4L, 
3L, 4L, 5L, 5L), uncontested_possessions = c(10L, 4L, 17L, 6L, 
10L, 12L, 4L, 12L, 0L, 4L, 17L, 7L, 18L, 9L, 14L, 11L, 5L, 7L, 
12L, 14L), time_on_ground_percentage = c(79L, 65L, 73L, 100L, 
76L, 69L, 89L, 81L, 1L, 88L, 73L, 83L, 85L, 98L, 95L, 81L, 96L, 
91L, 86L, 96L), afl_fantasy_score = c(93L, 26L, 97L, 42L, 54L, 
53L, 61L, 67L, 4L, 91L, 96L, 67L, 78L, 89L, 80L, 80L, 30L, 54L, 
54L, 58L), contested_marks = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 
0L, 2L, 1L, 0L, 1L, 3L, 0L, 0L, 0L, 1L, 0L, 0L), metres_gained = c(474L, 
231L, 269L, 165L, 128L, 181L, 151L, 227L, -7L, 160L, 466L, 332L, 
709L, 268L, 464L, 283L, 99L, 257L, 203L, 288L), turnovers = c(5L, 
3L, 4L, 2L, 3L, 2L, 2L, 4L, 0L, 1L, 6L, 2L, 5L, 8L, 5L, 2L, 2L, 
3L, 3L, 1L), effective_kicks = c(11L, 5L, 9L, 7L, 7L, 4L, 3L, 
5L, 1L, 6L, 5L, 4L, 11L, 7L, 12L, 5L, 2L, 6L, 1L, 9L), ground_ball_gets = c(8L, 
2L, 4L, 5L, 7L, 4L, 4L, 8L, 0L, 3L, 6L, 9L, 0L, 4L, 3L, 2L, 2L, 
2L, 5L, 3L), cum_rec = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
13, 14, 15, 16, 17, 18, 19, 20), rank_match_kicks = c(2, 34, 
10.5, 20.5, 20.5, 20.5, 28, 20.5, 43, 28, 4.5, 20.5, 4.5, 10.5, 
8, 20.5, 39.5, 20.5, 34, 20.5), rank_match_marks = c(14, 39, 
5, 39, 33, 27.5, 33, 33, 43.5, 20.5, 20.5, 39, 14, 5, 5, 20.5, 
33, 10, 27.5, 20.5)), row.names = c(NA, -20L), class = c("tbl_df", 
"tbl", "data.frame"))

Update:

Consider the example below using the functions suggested in the answer below:

match_id <- c("match_1", "match_1","match_1","match_2","match_2","match_2","match_3","match_3","match_3")
player_id <- c("player_1", "player_2", "player_3", "player_1", "player_2", "player_3", "player_1", "player_2", "player_3")
turnovers <- c(5,10,15,1,2,3,5,7,9)

data <- data.frame(match_id, player_id, turnovers)
    
f <- function(dt, window, vars, byvars, partial=F) {
  res = dt[, lapply(.SD, frollmean, n=window), by=byvars, .SDcols=vars]
  if(partial) {
    res = rbind(
      partials(dt,window-1,vars, byvars),
      res[window:.N, .SD, by=byvars]
    )
  }
  return(res)
}

partials <- function(dt,w,vars,byvars) {
  rbindlist(lapply(1:w, function(p) {
    dt[1:p, lapply(.SD, function(v) Reduce(`+`, shift(v,0:(p-1)))/p),
       .SDcols = vars, by=byvars][p:.N, .SD, by=byvars]
  }))
}

# set the data as data.table
setDT(data)

# define vars of interest
vars = c("turnovers")

# ensure the order is correct for rolling mean
setorder(data, player_id, match_id )

# set the window size
n=3

# get the rolling mean, by grouping variable, for each var in `vars`, and add the partials

f(data, window=n, vars=vars, byvars="player_id", partial=T)

This returns the following:

   player_id turnovers
1:  player_1  5.000000
2:  player_1  3.000000
3:  player_1  3.666667
4:  player_2        NA
5:  player_2        NA
6:  player_2  6.333333
7:  player_3        NA
8:  player_3        NA
9:  player_3  9.000000

What am I doing wrong?


Solution

  • You could try this:

    library(data.table)
    
    setDT(data)
    
    
    data[,paste0(vars, "_avge_last2_"):= lapply(.SD, frollmean, n=2),
         .SDcols=vars,
         by=.(fruit)
    ]
    

    Update

    Here is a more generalized solution for handling the NA(s) at the top of each window (i.e. the partial windows)

    First, start with a function that can take a data table (dt), a window size (window), a set of variables (vars), and a set of grouping variables (byvars), and an optional logical indicator partial

    f <- function(dt, window, vars, byvars, partial=F) {
      res = dt[, lapply(.SD, frollmean, n=window), by=byvars, .SDcols=vars]
      if(partial) {
        res = rbind(
          partials(dt,window-1,vars, byvars),
          res[,.SD[window:.N], by=byvars]
        )
      }
      return(res)
    }
    

    Add, the optional function partials()

    partials <- function(dt,w,vars,byvars) {
      rbindlist(lapply(1:w, function(p) {
        dt[, lapply(.SD[1:p], function(v) Reduce(`+`, shift(v,0:(p-1)))/p),
           .SDcols = vars, by=byvars][, .SD[p:.N], by=byvars]
      }))
    }
    

    Apply the function

    # set the data as data.table
    setDT(data)
    
    # define vars of interest
    vars = c("turnovers", "effective_kicks")
    
    # ensure the order is correct for rolling mean
    setorder(data, match_id, player_id)
    
    # set the window size
    n=3
    
    # get the rolling mean, by grouping variable, for each var in `vars`, and add the partials
    
    f(data, window=n, vars=vars, byvars="player_id", partial=T)