Search code examples
r, vectorization, dplyr, zoo, rollapply

Efficient way to perform running total in the last 365 day window


This is what my data frame looks like:

library(data.table)

# Sample events, one row per customer event. The last column is the target:
# the per-Name total of SalesAmount over the trailing 365 days.
df <- fread('
                Name  EventType  Date  SalesAmount RunningTotal Runningtotal(prior365Days)
                John    Email      1/1/2014      0          0            0
                John    Sale       2/1/2014     10          10           10
                John    Sale       7/1/2014     20          30           30
                John    Sale       4/1/2015     30          60           50 
                John    Webinar    5/1/2015      0          60           50
                Tom     Email      1/1/2014      0          0            0
                Tom     Sale       2/1/2014     15          15           15
                Tom     Sale       7/1/2014     10          25           25
                Tom     Sale       4/1/2015     25          50           35 
                Tom     Webinar    5/1/2015      0          50           35
                ')
    # Replace the month/day/year strings in Date with Date objects (by reference).
    df[,Date:= as.Date(Date, format="%m/%d/%Y")]

The last column is my desired output: the cumulative sum of SalesAmount (for each Name) over a rolling window of the previous 365 days. I computed it with the help of @6pool, whose solution was:

# Parse the event dates as day/month/year.
# NOTE(review): the sample table above names this column `Date` and parses it
# as month/day/year -- confirm which column name and format the real data uses.
df$EventDate <- as.Date(df$EventDate, format="%d/%m/%Y")
# Sort by date, then (per Name) store each event's offset in days from the
# first EventDate of that Name as `day`.
df <- df %>%
   group_by (Name) %>%
   arrange(EventDate) %>% 
   mutate(day = EventDate - EventDate[1])

# f(i): sum of SalesAmount over the rows that share row i's Name and whose
# `day` is between 0 and 365 days before row i's `day` (both ends inclusive).
# Every call compares row i against all rows of df, so evaluating it for each
# row is quadratic in nrow(df) -- this is the slow step.
# `vec=` partially matches Vectorize()'s `vectorize.args` argument.
f <- Vectorize(function(i)
    sum(df[df$Name[i] == df$Name & df$day[i] - df$day >= 0 & 
             df$day[i] - df$day <= 365, "SalesAmount"]), vec="i")
df$RunningTotal365 <- f(1:nrow(df))

However, df$RunningTotal365 <- f(1:nrow(df)) is taking a long time (over 1.5 days so far) because my data frame has over 1.5 million rows. "rollapply" was suggested in my initial question, but I have struggled to figure out how to use it in this instance. Kindly help.


Solution

  • Give this a try:

    # Sample data from the question. With header = TRUE the last column name is
    # made syntactically valid, so it shows up as Runningtotal.prior365Days.
    # in the printed result further down.
    DF <- read.table(text = "Name  EventType  EventDate  SalesAmount RunningTotal Runningtotal(prior365Days)
    John    Email      1/1/2014      0          0            0
    John    Sale       2/1/2014     10          10           10
    John    Sale       7/1/2014     20          30           30
    John    Sale       4/1/2015     30          60           50 
    John    Webinar    5/1/2015      0          60           50
    Tom     Email      1/1/2014      0          0            0
    Tom     Sale       2/1/2014     15          15           15
    Tom     Sale       7/1/2014     10          25           25
    Tom     Sale       4/1/2015     25          50           35 
    Tom     Webinar    5/1/2015      0          50           35", header = TRUE)
    
    
    # Trailing-window running total.
    #
    # For each observation j, returns the sum of x[i] over all observations i
    # that are not later than j and whose date is at most `thresh` units
    # before date[j] (boundary inclusive). Observation j itself is included.
    # Observations sharing the same date are accumulated in input order.
    #
    # Arguments:
    #   x       numeric vector of amounts to accumulate.
    #   date    Date (or numeric) vector, same length as `x`, without NA.
    #           It does not need to be sorted.
    #   thresh  window width in the units of `date` (days for Date).
    #
    # Returns an unnamed numeric vector of length(x), in the input order.
    #
    # Works in O(n log n) time and O(n) memory via a cumulative sum instead of
    # an n-by-n distance matrix. Window sums are differences of cumulative sums,
    # so they are exact for whole-number amounts and accurate to floating-point
    # rounding otherwise.
    fun <- function(x, date, thresh) {
      if (length(x) != length(date)) {
        stop("`x` and `date` must have the same length", call. = FALSE)
      }
      if (anyNA(date)) {
        stop("`date` must not contain NA", call. = FALSE)
      }
      if (length(x) == 0L) {
        return(numeric(0))
      }

      # order() leaves ties in their original order, so same-date rows keep
      # accumulating in the order they were supplied.
      ord <- order(date)
      days <- as.numeric(date)[ord]
      # as.numeric() avoids integer overflow in cumsum() for integer amounts.
      running <- cumsum(as.numeric(x)[ord])

      # Per row: how many rows are strictly older than the window start.
      n_expired <- findInterval(days - thresh, days, left.open = TRUE)

      # Drop the contribution of the expired rows from the running total.
      window_total <- running - c(0, running)[n_expired + 1L]

      # Put the results back into the caller's row order.
      out <- numeric(length(x))
      out[ord] <- window_total
      out
    }
    
    
    library(data.table)

    # Turn DF into a data.table by reference so the := calls below update it
    # in place.
    setDT(DF)

    # Parse the month/day/year strings into Date objects.
    DF[, EventDate := as.Date(EventDate, format = "%m/%d/%Y")]

    # Key (and thereby sort) the table by customer, then chronologically.
    setkeyv(DF, c("Name", "EventDate"))

    # Trailing 365-day sales total, computed separately for each customer.
    DF[, RT365 := fun(SalesAmount, EventDate, thresh = 365), by = "Name"]
    
    #    Name EventType  EventDate SalesAmount RunningTotal Runningtotal.prior365Days. RT365
    # 1: John     Email 2014-01-01           0            0                          0     0
    # 2: John      Sale 2014-02-01          10           10                         10    10
    # 3: John      Sale 2014-07-01          20           30                         30    30
    # 4: John      Sale 2015-04-01          30           60                         50    50
    # 5: John   Webinar 2015-05-01           0           60                         50    50
    # 6:  Tom     Email 2014-01-01           0            0                          0     0
    # 7:  Tom      Sale 2014-02-01          15           15                         15    15
    # 8:  Tom      Sale 2014-07-01          10           25                         25    25
    # 9:  Tom      Sale 2015-04-01          25           50                         35    35
    #10:  Tom   Webinar 2015-05-01           0           50                         35    35