Vectorize an sapply function

I am trying to vectorize the following function to drop the sapply loop. I am calculating cumulative skewness.

cskewness <- function(.x) {
  skewness <- function(.x) {
    sqrt(length(.x)) * sum((.x - mean(.x))^3) / (sum((.x - mean(.x))^2)^(3 / 2))
  }
  sapply(seq_along(.x), function(k, z) skewness(z[1:k]), z = .x)
}

Not getting my algebra right. Have this which is wrong:

skewness2 <- function(.x) {
  n <- length(.x)
  csum <- cumsum(.x)
  cmu <- csum / 1:length(.x)
  num <- cumsum(.x - cmu)^3
  den <- cumsum((.x - cmu)^2)^(3/2)
  sqrt(n) * num / den
}

Correct code produces:

x <- c(1,2,4,5,8)

> cskewness(x)
[1]       NaN 0.0000000 0.3818018 0.0000000 0.4082483
> skewness2(x)
[1]      NaN 1.000000 1.930591 3.882748 4.928973

Solution

Using a single-pass for loop will be efficient:

cumskewness <- function(x) {
  n <- length(x)
  
  if (n == 0L) {
    return(x)
  } else if (n == 1L) {
    return(0)
  }
  
  m2 <- m3  <- term1 <- 0
  out <- numeric(n)
  out[1] <- NaN
  m1 <- x[1]
  
  for (i in 2:n) {
    n0 <- i - 1
    delta <- x[i] - m1
    delta_n <- delta/i
    m1 <- m1 + delta_n
    term1 <- delta*delta_n*n0
    m3 <- m3 + term1*delta_n*(n0 - 1) - 3*delta_n*m2
    m2 <- m2 + term1
    out[i] <- sqrt(i)*m3/m2^1.5
  }
  
  out
}

It is also straightforward to write up with Rcpp:

library(Rcpp)

cppFunction('
  NumericVector cumskewnessCpp(const NumericVector& x) {
    const int n = x.size();
    
    if (n == 0) {
      return(x);
    } else if (n == 1) {
      return(0);
    }
    
    int n1;
    double m1 = x(0);
    double m2, m3, delta, delta_n, term1;
    NumericVector out(n);
    out(0) = R_NaN;
    
    for (int i = 1; i < n; i++) {
      n1 = i + 1;
      delta = x(i) - m1;
      delta_n = delta/n1;
      m1 += delta_n;
      term1 = delta*delta_n*i;
      m3 += term1*delta_n*(i - 1) - 3*delta_n*m2;
      m2 += term1;
      out(i) = sqrt(n1)*m3/pow(m2, 1.5);
    }
    
    return out;
  }
')

Testing

cumskewness(x)
#> [1] NaN 0.0000000 0.3818018 0.0000000 0.4082483
cumskewnessCpp(x)
#> [1] NaN  0.000000e+00  3.818018e-01 -2.808667e-17  4.082483e-01

Benchmarking

Including the vectorized solution from @ThomisIsCoding:

cskewness_tic <- function(y) {
  # cumulative length of y
  k <- seq_along(y)
  # cumulative n-th order moments of y
  m3 <- cumsum(y^3)
  m2 <- cumsum(y^2)
  m1 <- cumsum(y)
  u <- m1 / k
  # expand cubic terms and refactor skewness in terms of num/den
  num <- (m3 - 3 * u * m2 + 3 * u^2 * m1 - k * u^3) / k
  den <- sqrt((m2 + k * u^2 - 2 * u * m1) / k)^3
  num / den
}

set.seed(0)
x <- sample(1e3)

microbenchmark::microbenchmark(
  cskewness_tic = cskewness_tic(x),
  cumskewness = cumskewness(x),
  cumskewnessCpp = cumskewnessCpp(x),
  check = "equal",
  unit = "relative"
)
#> Unit: relative
#>            expr      min      lq     mean   median       uq       max neval
#>   cskewness_tic 2.035272 2.07954 2.006118 2.388216 2.282022 0.7228815   100
#>     cumskewness 3.930424 3.96835 4.003956 3.939762 3.711879 1.1339946   100
#>  cumskewnessCpp 1.000000 1.00000 1.000000 1.000000 1.000000 1.0000000   100

A note of caution using `cskewness_tic`

Catastrophic cancellation can result in precision errors for higher-order moments. It occurs when the standard deviation is small relative to the mean. Demonstrating:

set.seed(0)
x <- sample(1e3) + 1e8

y1 <- cskewness(x)
y2 <- cumskewness(x)
y3 <- cumskewnessCpp(x)
y4 <- cskewness_tic(x); y4[1] <- NaN

all.equal(y1, y2)
#> [1] TRUE
all.equal(y1, y3)
#> [1] TRUE
all.equal(y1, y4)
#> [1] "Mean relative difference: 270.1872"

Original Answer

skewness2 <- function(.x) {
  d <- outer(.x, cumsum(.x)/(1:length(.x)), "-")
  d[lower.tri(d)] <- 0
  sqrt(1:length(.x))*colSums(d^3)/colSums(d^2)^(3/2)
}

x <- c(1,2,4,5,8)
cskewness(x)
#> [1]       NaN 0.0000000 0.3818018 0.0000000 0.4082483
skewness2(x)
#> [1]       NaN 0.0000000 0.3818018 0.0000000 0.4082483

Vectorize an sapply function

Testing

Benchmarking

A note of caution using cskewness_tic

Original Answer

A note of caution using `cskewness_tic`