I am trying to vectorize the following function to drop the sapply loop. I am calculating cumulative skewness.
cskewness <- function(.x) {
  skewness <- function(.x) {
    sqrt(length(.x)) * sum((.x - mean(.x))^3) / (sum((.x - mean(.x))^2)^(3 / 2))
  }
  sapply(seq_along(.x), function(k, z) skewness(z[1:k]), z = .x)
}
I can't get the algebra right. This is my attempt, which is wrong:
skewness2 <- function(.x) {
  n <- length(.x)
  csum <- cumsum(.x)
  cmu <- csum / 1:length(.x)
  num <- cumsum(.x - cmu)^3
  den <- cumsum((.x - cmu)^2)^(3/2)
  sqrt(n) * num / den
}
Comparing the correct output with my attempt:
x <- c(1, 2, 4, 5, 8)
cskewness(x)
#> [1] NaN 0.0000000 0.3818018 0.0000000 0.4082483
skewness2(x)
#> [1] NaN 1.000000 1.930591 3.882748 4.928973
Using a single-pass for loop will be efficient:
cumskewness <- function(x) {
  n <- length(x)
  if (n == 0L) {
    return(x)
  } else if (n == 1L) {
    # single observation: return 0 (the sapply reference would give NaN)
    return(0)
  }
  # running central moments, updated one observation at a time
  m2 <- m3 <- term1 <- 0
  out <- numeric(n)
  out[1] <- NaN
  m1 <- x[1]
  for (i in 2:n) {
    n0 <- i - 1
    delta <- x[i] - m1
    delta_n <- delta / i
    m1 <- m1 + delta_n
    term1 <- delta * delta_n * n0
    m3 <- m3 + term1 * delta_n * (n0 - 1) - 3 * delta_n * m2
    m2 <- m2 + term1
    out[i] <- sqrt(i) * m3 / m2^1.5
  }
  out
}
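As a quick sanity check (my addition, not part of the original answer), the single-pass version can be compared against the sapply-based cskewness from the question on an arbitrary test vector:
# Sanity check: both functions should agree element-wise
# (NaN in the first slot is expected on both sides).
set.seed(42)
z <- rnorm(50)
all.equal(cskewness(z), cumskewness(z))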
It is also straightforward to write up with Rcpp:
library(Rcpp)
cppFunction('
NumericVector cumskewnessCpp(const NumericVector& x) {
  const int n = x.size();
  if (n == 0) {
    return x;
  } else if (n == 1) {
    // a single 0; plain return(0) would construct an empty vector
    return NumericVector::create(0);
  }
  int n1;
  double m1 = x(0);
  // running moments must start at zero, not be left uninitialized
  double m2 = 0, m3 = 0, delta, delta_n, term1;
  NumericVector out(n);
  out(0) = R_NaN;
  for (int i = 1; i < n; i++) {
    n1 = i + 1;
    delta = x(i) - m1;
    delta_n = delta/n1;
    m1 += delta_n;
    term1 = delta*delta_n*i;
    m3 += term1*delta_n*(i - 1) - 3*delta_n*m2;
    m2 += term1;
    out(i) = sqrt(n1)*m3/pow(m2, 1.5);
  }
  return out;
}
')
cumskewness(x)
#> [1] NaN 0.0000000 0.3818018 0.0000000 0.4082483
cumskewnessCpp(x)
#> [1] NaN 0.000000e+00 3.818018e-01 -2.808667e-17 4.082483e-01
Including the vectorized solution from @ThomasIsCoding:
cskewness_tic <- function(y) {
  # cumulative length of y
  k <- seq_along(y)
  # cumulative n-th order moments of y
  m3 <- cumsum(y^3)
  m2 <- cumsum(y^2)
  m1 <- cumsum(y)
  u <- m1 / k
  # expand cubic terms and refactor skewness in terms of num/den
  num <- (m3 - 3 * u * m2 + 3 * u^2 * m1 - k * u^3) / k
  den <- sqrt((m2 + k * u^2 - 2 * u * m1) / k)^3
  num / den
}
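To spell out the algebra behind the num/den refactoring (my note, not from the original answer): write S3 = sum_{i<=k} (y_i - u)^3 and S2 = sum_{i<=k} (y_i - u)^2, with u = m1/k the running mean. Expanding the cubes and squares in terms of the cumulative power sums gives

S3 = m3 - 3*u*m2 + 3*u^2*m1 - k*u^3
S2 = m2 - 2*u*m1 + k*u^2

and the question's definition sqrt(k) * S3 / S2^(3/2) is identical to (S3/k) / (S2/k)^(3/2), i.e. num / den.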
set.seed(0)
x <- sample(1e3)
microbenchmark::microbenchmark(
  cskewness_tic = cskewness_tic(x),
  cumskewness = cumskewness(x),
  cumskewnessCpp = cumskewnessCpp(x),
  check = "equal",
  unit = "relative"
)
#> Unit: relative
#> expr min lq mean median uq max neval
#> cskewness_tic 2.035272 2.07954 2.006118 2.388216 2.282022 0.7228815 100
#> cumskewness 3.930424 3.96835 4.003956 3.939762 3.711879 1.1339946 100
#> cumskewnessCpp 1.000000 1.00000 1.000000 1.000000 1.000000 1.0000000 100
A caveat with cskewness_tic: catastrophic cancellation can cause precision errors in higher-order moments. It occurs when the standard deviation is small relative to the mean. Demonstrating:
set.seed(0)
x <- sample(1e3) + 1e8
y1 <- cskewness(x)
y2 <- cumskewness(x)
y3 <- cumskewnessCpp(x)
y4 <- cskewness_tic(x); y4[1] <- NaN
all.equal(y1, y2)
#> [1] TRUE
all.equal(y1, y3)
#> [1] TRUE
all.equal(y1, y4)
#> [1] "Mean relative difference: 270.1872"
Another vectorized option is to build the full matrix of deviations with outer():
skewness2 <- function(.x) {
  # d[i, j] = .x[i] - mean(.x[1:j]); zero the below-diagonal entries so that
  # column j only sums over the first j deviations
  d <- outer(.x, cumsum(.x)/(1:length(.x)), "-")
  d[lower.tri(d)] <- 0
  sqrt(1:length(.x))*colSums(d^3)/colSums(d^2)^(3/2)
}
x <- c(1,2,4,5,8)
cskewness(x)
#> [1] NaN 0.0000000 0.3818018 0.0000000 0.4082483
skewness2(x)
#> [1] NaN 0.0000000 0.3818018 0.0000000 0.4082483
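One trade-off worth noting (my addition): outer() materialises the full length(x) x length(x) matrix of deviations, so memory grows quadratically with the input length. A rough estimate, assuming 8-byte doubles:
# Memory for the n x n deviation matrix alone (8-byte doubles assumed)
n <- 1e5
n^2 * 8 / 2^30  # roughly 74.5 GiB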