Search code examples
rperformancevectorization

Vectorize an sapply function


I am trying to vectorize the following function to drop the sapply loop. I am calculating cumulative skewness.

cskewness <- function(.x) {
  skewness <- function(.x) {
    sqrt(length(.x)) * sum((.x - mean(.x))^3) / (sum((.x - mean(.x))^2)^(3 / 2))
  }
  sapply(seq_along(.x), function(k, z) skewness(z[1:k]), z = .x)
}

Not getting my algebra right. Have this which is wrong:

skewness2 <- function(.x) {
  n <- length(.x)
  csum <- cumsum(.x)
  cmu <- csum / 1:length(.x)
  num <- cumsum(.x - cmu)^3
  den <- cumsum((.x - cmu)^2)^(3/2)
  sqrt(n) * num / den
}

Correct code produces:

x <- c(1,2,4,5,8)

> cskewness(x)
[1]       NaN 0.0000000 0.3818018 0.0000000 0.4082483
> skewness2(x)
[1]      NaN 1.000000 1.930591 3.882748 4.928973

Solution

  • Using a single-pass for loop will be efficient:

    cumskewness <- function(x) {
      n <- length(x)
      
      if (n == 0L) {
        return(x)
      } else if (n == 1L) {
        return(0)
      }
      
      m2 <- m3  <- term1 <- 0
      out <- numeric(n)
      out[1] <- NaN
      m1 <- x[1]
      
      for (i in 2:n) {
        n0 <- i - 1
        delta <- x[i] - m1
        delta_n <- delta/i
        m1 <- m1 + delta_n
        term1 <- delta*delta_n*n0
        m3 <- m3 + term1*delta_n*(n0 - 1) - 3*delta_n*m2
        m2 <- m2 + term1
        out[i] <- sqrt(i)*m3/m2^1.5
      }
      
      out
    }
    

    It is also straightforward to write up with Rcpp:

    library(Rcpp)
    
    cppFunction('
      NumericVector cumskewnessCpp(const NumericVector& x) {
        const int n = x.size();
        
        if (n == 0) {
          return(x);
        } else if (n == 1) {
          return(0);
        }
        
        int n1;
        double m1 = x(0);
        double m2, m3, delta, delta_n, term1;
        NumericVector out(n);
        out(0) = R_NaN;
        
        for (int i = 1; i < n; i++) {
          n1 = i + 1;
          delta = x(i) - m1;
          delta_n = delta/n1;
          m1 += delta_n;
          term1 = delta*delta_n*i;
          m3 += term1*delta_n*(i - 1) - 3*delta_n*m2;
          m2 += term1;
          out(i) = sqrt(n1)*m3/pow(m2, 1.5);
        }
        
        return out;
      }
    ')
    

    Testing

    cumskewness(x)
    #> [1] NaN 0.0000000 0.3818018 0.0000000 0.4082483
    cumskewnessCpp(x)
    #> [1] NaN  0.000000e+00  3.818018e-01 -2.808667e-17  4.082483e-01
    

    Benchmarking

    Including the vectorized solution from @ThomisIsCoding:

    cskewness_tic <- function(y) {
      # cumulative length of y
      k <- seq_along(y)
      # cumulative n-th order moments of y
      m3 <- cumsum(y^3)
      m2 <- cumsum(y^2)
      m1 <- cumsum(y)
      u <- m1 / k
      # expand cubic terms and refactor skewness in terms of num/den
      num <- (m3 - 3 * u * m2 + 3 * u^2 * m1 - k * u^3) / k
      den <- sqrt((m2 + k * u^2 - 2 * u * m1) / k)^3
      num / den
    }
    
    set.seed(0)
    x <- sample(1e3)
    
    microbenchmark::microbenchmark(
      cskewness_tic = cskewness_tic(x),
      cumskewness = cumskewness(x),
      cumskewnessCpp = cumskewnessCpp(x),
      check = "equal",
      unit = "relative"
    )
    #> Unit: relative
    #>            expr      min      lq     mean   median       uq       max neval
    #>   cskewness_tic 2.035272 2.07954 2.006118 2.388216 2.282022 0.7228815   100
    #>     cumskewness 3.930424 3.96835 4.003956 3.939762 3.711879 1.1339946   100
    #>  cumskewnessCpp 1.000000 1.00000 1.000000 1.000000 1.000000 1.0000000   100
    

    A note of caution using cskewness_tic

    Catastrophic cancellation can result in precision errors for higher-order moments. It occurs when the standard deviation is small relative to the mean. Demonstrating:

    set.seed(0)
    x <- sample(1e3) + 1e8
    
    y1 <- cskewness(x)
    y2 <- cumskewness(x)
    y3 <- cumskewnessCpp(x)
    y4 <- cskewness_tic(x); y4[1] <- NaN
    
    all.equal(y1, y2)
    #> [1] TRUE
    all.equal(y1, y3)
    #> [1] TRUE
    all.equal(y1, y4)
    #> [1] "Mean relative difference: 270.1872"
    

    Original Answer

    skewness2 <- function(.x) {
      d <- outer(.x, cumsum(.x)/(1:length(.x)), "-")
      d[lower.tri(d)] <- 0
      sqrt(1:length(.x))*colSums(d^3)/colSums(d^2)^(3/2)
    }
    
    x <- c(1,2,4,5,8)
    cskewness(x)
    #> [1]       NaN 0.0000000 0.3818018 0.0000000 0.4082483
    skewness2(x)
    #> [1]       NaN 0.0000000 0.3818018 0.0000000 0.4082483