Tags: r, performance, for-loop, vectorization, memory-efficient

Improve efficiency in R (vectorization?)


I have a script in R that takes 8 minutes to run; it basically compares date ranges for 800 records over a multi-year period. This is far too long. I am new to R and fairly sure the problem is my nested loops. Also, when I tried converting my data into a toy problem, it didn't seem to work. I had been dealing with array types which I read in from Excel.

# data vectors
ID <- c("1e", "1f", "1g")
StartDate <- c(1, 2, 4)
EndDate <- c(3, 4, 5)
Type <- c("A", "B", "B")
Qty <- c(.5, 2.5, 1)

# table rows and headers
Days <- c(1, 2, 3, 4, 5)
setOfTypes <- c("A", "B")

# get subset of active IDs for each day in table
ActiveID <- data.frame()
for(d in 1:length(Days)){
  check <- StartDate<=Days[d] & EndDate>=Days[d]
  subsetID <- subset(ID, check)
  strSubsetID <- c()
  for(i in 1:length(subsetID)){
    strSubsetID <- paste(ID, subsetID[i], sep=",")
  }
  ActiveID[d,1] <- strSubsetID
}

# calculate quantity counts by day and type
Count <- matrix(,length(Days),length(setOfTypes))
for(d in 1:length(Days)){
  for(t in 1:length(setOfTypes)){
    check <- Type == setOfTypes[t] & sapply(ID, grepl, x=ActiveID[d,1])
    tempCount <- subset(Types, check)
    Count[t,d] <- sum(tempCount)
  }
}

The result should be a table (days x types) with each element consisting of the sum of Qty for the active IDs on a given day and type.

I am looking to vectorize this code so that it runs faster when I apply it to a much larger data set. Please help, thanks!


Solution

  • Your code doesn't run as is, so I have no way of knowing exactly what you are looking for. Your description suggests that you want the sum of Qty for each day in Days between StartDate and EndDate, grouped by Type. The following will produce such a matrix:

    # build a data frame from the vectors in the question
    df <- data.frame(ID,StartDate,EndDate,Type,Qty,stringsAsFactors=FALSE)
    Days <- min(StartDate):max(EndDate)
    
    # TRUE for rows whose [StartDate, EndDate] range covers day x
    is.between <- function(x,df) with(df,x>=StartDate & x<=EndDate)
    # for each day, sum Qty over the rows of df active on that day
    get.sums   <- function(df) sapply(Days,function(d,df) sum(df[is.between(d,df),"Qty"]),df)
    # split by Type, compute the per-day sums for each group, and bind into a matrix
    do.call(rbind,lapply(split(df,df$Type), get.sums))
    #   [,1] [,2] [,3] [,4] [,5]
    # A  0.5  0.5  0.5  0.0    0
    # B  0.0  2.5  2.5  3.5    1
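
    A small optional addition (not part of the original answer): the matrix above has unlabeled columns, so you may want to attach the day numbers as column names, e.g.

    # label the columns with the day numbers for readability
    result.mat <- do.call(rbind,lapply(split(df,df$Type), get.sums))
    colnames(result.mat) <- Days
    result.mat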
    

    Here's a data.table approach that might be faster. Note the different definitions of is.between(...) and get.sums(...).

    library(data.table)
    
    DT <- data.table(df,key="Type")
    is.between <- function(x,a,b) x>=a & x <= b
    # one row per Type per day, in long format
    get.sums   <- function(day) DT[,list(day,Qty=sum(Qty[is.between(day,StartDate,EndDate)])),by=Type]
    long       <- rbindlist(lapply(Days,get.sums))
    # reshape long (Type, day, Qty) to wide (Type x day)
    result     <- dcast.data.table(long,Type~day,value.var="Qty")
    result
    #    Type   1   2   3   4 5
    # 1:    A 0.5 0.5 0.5 0.0 0
    # 2:    B 0.0 2.5 2.5 3.5 1
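
    As a quick sanity check (assuming the objects above are still in the workspace): each call to get.sums(d) returns one row per Type for day d, so long holds length(Days) times the number of Types rows before dcast.data.table spreads it into the wide Type-by-day layout.

    get.sums(Days[1])   # one row per Type for day 1: columns Type, day, Qty
    nrow(long)          # length(Days) * number of distinct Types (here 5 * 2 = 10)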
    

    Here are some benchmarks with a hopefully more representative example dataset (800 rows, 500 start dates, total date range >900 days), and also testing @Arun's answer.

    # more representative example
    set.seed(1)  # for reproducibility
    StartDate <- sample(1:500,800,replace=TRUE)
    EndDate   <- StartDate + rpois(800,400)
    Type      <- sample(LETTERS[1:20],800,replace=TRUE)
    Qty       <- rnorm(800,10,2)
    Days      <- min(StartDate):max(EndDate)
    df        <- data.frame(StartDate,EndDate,Type,Qty, stringsAsFactors=FALSE)
    

    Comparison of the data frame approach and the two data.table approaches.

    library(data.table)
    library(reshape2)
    DT <- data.table(df,key="Type")
    f.df <- function() {
      is.between <- function(x,df) with(df,x>=StartDate & x<=EndDate)
      get.sums   <- function(df) sapply(Days,function(d,df) sum(df[is.between(d,df),"Qty"]),df)
      do.call(rbind,lapply(split(df,df$Type), get.sums))
    }
    f.dt1 <- function() {
      is.between <- function(x,a,b) x>=a & x <= b
      get.sums   <- function(day) DT[,list(day,Qty=sum(Qty[is.between(day,StartDate,EndDate)])),by=Type]
      long       <- rbindlist(lapply(Days,get.sums))
      dcast.data.table(long,Type~day,value.var="Qty")
    }
    f.dt2 <- function() {
      lookup <- data.table(StartDate=Days, EndDate=Days)
      setkey(lookup)
      j_olaps <- foverlaps(DT, lookup, by.x=c("StartDate", "EndDate"), type="any")
      dcast.data.table(j_olaps, Type ~ StartDate, value.var="Qty", fun.agg=sum, na.rm=TRUE)
    }
    identical(f.dt1(),f.dt2())   # same result? YES!
    # [1] TRUE
    library(microbenchmark)
    microbenchmark(f.df(),f.dt1(),f.dt2(),times=10)
    # Unit: milliseconds
    #     expr        min         lq    median        uq       max neval
    #   f.df() 1199.76370 1212.03787 1222.6558 1243.8743 1275.5526    10
    #  f.dt1() 1634.92675 1664.98885 1689.7812 1714.2662 1798.9121    10
    #  f.dt2()   91.53245   95.19545  129.2789  158.0789  208.1818    10
    

    So @Arun's approach is ~10X faster than the df approach and ~13X faster than the dt approach above (comparing median timings).
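
    For context on how f.dt2 works (my reading of @Arun's approach, not part of the original answer): foverlaps() performs an overlap join. Each day becomes a zero-length interval [d, d] in lookup, each record in DT spans [StartDate, EndDate], and type="any" matches every record to every day it covers; dcast then sums Qty per Type per day. A minimal self-contained sketch on the toy data from the question:

    library(data.table)
    
    # toy records: each row is active over [StartDate, EndDate]
    DT <- data.table(Type      = c("A", "B", "B"),
                     StartDate = c(1, 2, 4),
                     EndDate   = c(3, 4, 5),
                     Qty       = c(0.5, 2.5, 1),
                     key = "Type")
    
    # each day is a zero-length interval [d, d]
    lookup <- data.table(StartDate = 1:5, EndDate = 1:5)
    setkey(lookup)   # foverlaps() requires the lookup's interval columns to be keyed
    
    # one row per (record, overlapping day) pair; StartDate in the result is the day from lookup
    olaps <- foverlaps(DT, lookup, by.x = c("StartDate", "EndDate"), type = "any")
    
    # sum Qty per Type per day -> the same Type-by-day table as above
    dcast.data.table(olaps, Type ~ StartDate, value.var = "Qty", fun.aggregate = sum)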