I have a large time series dataset of pot weights automatically recorded by scales. Pots are measured every 3 minutes and receive watering pulses once a day. Over time, plants grow and so does the pot weight. Here's a recreation of my dataset as a reproducible example with an object containing all pots an object containing only one pot and a shorter time:
The data
library(dplyr)
library(ggplot2)
library(scattermore)
library(pracma)
library(purrr)
library(furrr)
####### CREATING DATASET #######
#Generating time series vector with 3 minute intervals
interval <- 60
start <- as.POSIXct("2023-04-07")
end <- start + as.difftime(80, units="days")
date_time <- seq(from=start, by=interval*3, to=end)
#Generating pot IDs
pot <- rep(seq(1,100,1),each = length(date_time))
#Building dataframe
pots <- data.frame(pot = as.factor(pot),
date_time = date_time)
#Generating simulated pot weight changes
slope <- 0.001 #slope
a = 3500 #intercept
freq <- 480 #watering frequency -24 h time 60 min, divided by 3 min lapses-
pulse <- 3 #size of watering pulse
l <- length(date_time) #length of time series
weight_gen <- function(slope,a,freq,pulse,l){
x <- as.numeric(seq(1,l,1)) #time axis
y <- ((sin(2*(pi)/freq*x)+cos(2*pi/freq*x)))+a #fluctuation
y <- y+(rnorm(y)*0.05) #adding drift
y <- y+(seq(1,length(y),1)*(slope*abs(rnorm(1)))) #adding slope to fluctuation
y[seq(1,length(y),freq)] <- y[(seq(1,length(y),freq))]+pulse #adding pulses at beginning of day
for (i in 1:30) {
y[seq(1,length(y),freq)+rep(i,length(y))] <- y[seq(1,length(y),freq)+rep(i,length(y))]+(pulse-(sqrt(i/3.6))) #adding pulses at beginning of day
}
y <- y[complete.cases(y)]
weight <- data.frame(x=x,y=y)
# # All
ggplot(weight,aes(x=x,y=y))+
geom_line()
# # Daily
ggplot(weight[1:480,],aes(x=x,y=y))+
geom_line()
return(y)
}
#Generating weight time series with watering pulses for all pots
pots <- pots %>%
dplyr::group_by(pot) %>%
dplyr::mutate(weight = weight_gen(slope,a,freq,pulse,l)) %>%
dplyr::ungroup()
#Extracting one pot and shorter time range for speed testing
single_pot <- pots %>% filter(pot == "1", date_time < "2023-05-04 00:00:00")
#Data all pots
ggplot(pots,aes(x=date_time, y=weight))+
scattermore::geom_scattermore(aes(color=pot),pointsize = .0) #faster plotting
#Data one pot
ggplot(single_pot,aes(x=date_time, y=weight))+
geom_line()#+
# scattermore::geom_scattermore(aes(color=pot),pointsize = .0) #faster plotting
The problem
I need to apply a smoothing function to the raw weights, which is then used to calculate other variables. The accuracy of the derived variables depends on how well the smoothing function reproduces the patterns between watering periods (spikes). By default, the software of the scales uses a Savitzky-Golay algorythm (pracma::savgol()
) as a smoothing function that is very fast to calculate, but this algorythm is very sensitive to sudden spikes. This leads to artifactual declines in weight right before a spike (see below) and other issues. I would like to replace this algorythm by the Whittaker-Henderson algorythm (pracma::whittaker()
) which does a better job at fitting the data,which results in substantially less error propagation in subsequent variables.
However, the time it takes to run pracma::whittaker()
is bastly greater than that of pracma::savgol()
. As you can see below:
####One pot####
#Smoothed using Savitzky-Golay algorythm
system.time(
single_pot <- single_pot %>%
dplyr::group_by(pot) %>%
dplyr::mutate(weight_smooth_sav=savgol(weight,fl=61,forder = 2,dorder = 0)) %>%
dplyr::ungroup()
)
# user system elapsed
# 0.00 0.00 0.02
#Smoothed using Whittaker-Henderson algorythm
system.time(
single_pot <- single_pot %>%
dplyr::group_by(pot) %>%
dplyr::mutate(weight_smooth_wh=pracma::whittaker(weight,lambda = 3200,d=2)) %>%
dplyr::ungroup()
)
# user system elapsed
# 1666.36 0.65 1682.07
#Method comparison
min_x <- as.POSIXct("2023-04-08 00:00:00")
max_x <- as.POSIXct("2023-04-10 00:00:00")
ggplot(single_pot,aes(x=date_time, y=weight))+
geom_line(aes(y=weight, color="Raw data"),size=1)+
geom_line(aes(y=weight_smooth_sav, color="Savitzky-Golay"),size=1)+
geom_line(aes(y=weight_smooth_wh, color="Whittaker-Henderson"),size=1)+
scale_y_continuous(limits = c(min(single_pot$weight),max(3510)))+
scale_x_datetime(limits = c(min_x,max_x))
This becomes a real problem when I try to apply the code across the whole dataset:
#All pots
#Smoothed using Savitzky-Golay algorythm
system.time(
pots <- pots %>%
dplyr::group_by(pot) %>%
dplyr::mutate(weight_smooth_sav=savgol(weight,fl=61,forder = 2,dorder = 0)) %>%
dplyr::ungroup()
)
# user system elapsed
# 121.45 0.20 123.68
#Smoothed using Whittaker-Henderson algorythm (don't even try to run this)
#pots <- pots %>%
# dplyr::group_by(pot) %>%
# dplyr::mutate(weight_smooth_wh=pracma::whittaker(weight,lambda = 3200,d=2)) %>%
# dplyr::ungroup()
My attempts
I tried improving the speed of the approach using parallel processing using the following code:
#Smoothed using Whittaker-Henderson algorythm (don't even try to run this)
#future::plan("future::multisession",workers=5)
# pots <- pots %>%
# dplyr::group_by(pot) %>%
# dplyr::group_modify(~dplyr::mutate(., weight_smooth_wh = furrr::future_map(list(weight),
# ~pracma::whittaker(.x, lambda = 3200, d = 2))[[1]]))
But I don't see all my processors being fully used and the code still takes infinite hours to run.
Here's the pracma::whittaker()
function in case it is useful to understand the root of the problem:
function (y, lambda = 1600, d = 2)
{
m <- length(y)
E <- eye(m)
D <- diff(E, lag = 1, differences = d)
B <- E + (lambda * t(D) %*% D)
z <- solve(B, y)
return(z)
}
What I need
I'm looking for one of the following solutions:
dplyr::group_by()
.pracma::whittaker()
that makes it usable.pracma
should really be using a sparse matrix for whittaker
:
library(Matrix)
sparseWhittaker <- function (y, lambda = 1600, d = 2) {
E <- Diagonal(length(y))
as.numeric(solve(E + (lambda * crossprod(diff(E, 1, d))), y))
}
Let's do an even smaller example.
single_pot <- pots %>% filter(pot == "1", date_time < "2023-04-14 00:00:00")
system.time(
single_pot <- single_pot %>%
dplyr::group_by(pot) %>%
dplyr::mutate(weight_smooth_wh=pracma::whittaker(weight,lambda = 3200,d=2)) %>%
dplyr::ungroup()
)
#> user system elapsed
#> 28.84 0.03 28.87
system.time(
single_pot <- single_pot %>%
dplyr::group_by(pot) %>%
dplyr::mutate(weight_smooth_wh_sparse=sparseWhittaker(weight,lambda = 3200,d=2)) %>%
dplyr::ungroup()
)
#> user system elapsed
#> 0.02 0.00 0.01
all.equal(single_pot$weight_smooth_wh, single_pot$weight_smooth_wh_sparse)
#> [1] TRUE
The full dataset. No need to go parallel.
system.time({
pots <- pots %>%
dplyr::group_by(pot) %>%
dplyr::mutate(weight_smooth_wh=sparseWhittaker(weight,lambda = 3200,d=2)) %>%
dplyr::ungroup()
})
#> user system elapsed
#> 4.58 0.48 5.06