Search code examples
rperformance

Fast way to fill data.frame line by line


Summary: This question was based on an erroneous benchmark. For the actual comparison of the different methods to fill a data.frame, see the answer by jblood94.

It has been reported here that preallocating a data.frame and assigning values line by line is very slow because the entire data.frame is copied even when changing only a single cell. Example:

# preallocation
df.pre <- function(x) {
  df <- data.frame(x=numeric(length(x)), y=logical(length(x)))
  for (i in 1:length(x)) {
    df[i,] <- data.frame(x=x[i], y=(x[i]>0.5))  # <<<--- entire df is copied
  }
  return(df)
}

Interestingly, this does not seem to happen when the assignment is made to non existing indices, which thereby automatically increases the data.frame:

# automatic extension
df.auto <- function(x) {
  df <- data.frame(x=numeric(0), y=logical(0))
  for (i in 1:length(x)) {
    df[i,] <- data.frame(x=x[i], y=(x[i]>0.5))
  }
  return(df)
}

x <- runif(100)
library(microbenchmark)
microbenchmark(df.pre(x), df.auto) # <<<--- WRONG

which yields the following runtimes:

Unit: nanoseconds
      expr     min      lq       mean  median        uq      max neval cld
 df.pre(x) 5400053 5447409 5889698.24 5827590 5876605.5 18726678   100   b
   df.auto       9      11      75.94      82     109.5     1071   100  a 

I have also noticed that the runtime of this method to fill a data.frame is almost as fast as using a preallocated matrix and much faster than filling lists and combining them afterwards once in a data.frame.

This raises two questions:

  1. Is this behaviour (no copy) guaranteed?
  2. Is this the recommended way to fill a data.frame line by line?

Edit: As pointed out by jblood94, the benchmark was erroneous. When called correctly, there is almost no runtime difference and assigning to unallocated indices has the same abysmal runtime as assigning to a preallocated data.frame. The fastest way, I am aware of is thus either to use a matrix (if all values are of the same type) and converting it later to a data.frame, or to use a list and convert it later to a data.frame, e.g.:

df.pre <- function(x) {
  df <- data.frame(x=numeric(length(x)), y=logical(length(x)))
  for (i in 1:length(x)) {
      df[i,] <- data.frame(x=x[i], y=(x[i]>0.5))
  }
  return(df)
}
ls.pre <- function(x) {
  ls <- list(x=numeric(length(x)), y=logical(length(x)))
  for (i in 1:length(x)) {
    ls$x[i] <- x[i]; ls$y[i] <- (x[i]>0.5)
  }
  return(as.data.frame(ls))
}
matrix.pre <- function(x) {
  mt <- matrix(c(numeric(length(x)*2)), ncol=2)
  for (i in 1:length(x)) {
    mt[i,] <- c(x[i], as.numeric(x[i]>0.5))
  }
  return(as.data.frame(mt))
}

x <- runif(100)
library(microbenchmark)
microbenchmark(df.pre(x), ls.pre(x), matrix.pre(x))

which yields:

Unit: microseconds
          expr      min        lq       mean    median        uq      max neval
     df.pre(x) 5745.687 5789.6780 6073.79760 5812.7450 5888.2840 8714.556   100
     ls.pre(x)   61.209   65.9720  100.35568   70.9705   74.6935 2981.260   100
 matrix.pre(x)   29.641   33.1215   59.85807   38.6615   40.7255 2237.502   100

Solution

  • A couple other possibilities: (1) preallocate vectors, fill them, then create the data.frame at the end, (2) use data.table.

    library(data.table)
    
    # preallocation
    
    df.pre <- function(x) {
      df <- data.frame(x=numeric(length(x)), y=logical(length(x)))
      for (i in 1:length(x)) {
        df[i,] <- data.frame(x=x[i], y=(x[i]>0.5))  # <<<--- entire df is copied
      }
      df
    }
    
    df.pre2 <- function(x) {
      xx <- numeric(length(x))
      y <- logical(length(x))
      for (i in 1:length(x)) {
        xx[i] <- x[i]
        y[i] <- x[i] > 0.5
      }
      data.frame(x, y)
    }
    
    dt.pre <- function(x) {
      dt <- data.table(x=numeric(length(x)), y=logical(length(x)))
      for (i in 1:length(x)) {
        set(dt, i, 1:2, list(x[i], x[i] > 0.5))
      }
      setDF(dt)
    }
    
    ls.pre <- function(x) {
      ls <- list(x=numeric(length(x)), y=logical(length(x)))
      for (i in 1:length(x)) {
        ls$x[i] <- x[i]; ls$y[i] <- (x[i]>0.5)
      }
      as.data.frame(ls)
    }
    
    matrix.pre <- function(x) {
      mt <- matrix(0, length(x), 2, 1, list(NULL, c("x", "y")))
      for (i in 1:length(x)) {
        mt[i,] <- c(x[i], as.numeric(x[i]>0.5))
      }
      df <- as.data.frame(mt)
      df[[2]] <- as.logical(df[[2]])
      df
    }
    
    # automatic extension
    df.auto <- function(x) {
      df <- data.frame(x=numeric(0), y=logical(0))
      for (i in 1:length(x)) {
        df[i,] <- data.frame(x=x[i], y=(x[i]>0.5))
      }
      df
    }
    

    Benchmark:

    x <- runif(100)
    library(microbenchmark)
    microbenchmark(
      df.pre(x),
      df.pre2(x),
      dt.pre(x),
      ls.pre(x),
      matrix.pre(x),
      df.auto(x),
      check = "equal"
    )
    #> Unit: microseconds
    #>           expr     min       lq      mean   median       uq     max neval
    #>      df.pre(x) 11528.3 11932.50 12836.490 12351.35 13560.65 17712.0   100
    #>     df.pre2(x)   119.1   130.40   140.003   135.40   142.30   214.5   100
    #>      dt.pre(x)   231.0   287.40   318.961   326.35   343.05   491.8   100
    #>      ls.pre(x)   126.1   140.15   150.863   146.75   156.75   214.6   100
    #>  matrix.pre(x)    64.8    75.40    87.267    88.30    95.65   145.3   100
    #>     df.auto(x) 13459.3 13935.35 14745.698 14385.75 15532.90 17592.6   100
    

    The matrix approach can be made fast here, but it would not be best generally, especially if there are character types involved. I recommend the list approach if the calculations can't be vectorized.