I have a time series covering several days of data. I need to find the day with the maximum number of outliers and plot only that day's data.
Here is how I do it:
# generate sample data
# NOTE(review): "English" is a Windows-style locale name; presumably set so
# that date labels come out in English - confirm on other platforms.
Sys.setlocale("LC_ALL", "English")

# Time grid: one observation every `interval` seconds from `start` to `end`.
start <- as.POSIXct("2012-01-15 06:10:00")
interval <- 15
end <- start + as.difftime(4, units = "days") + as.difftime(5, units = "hours")
DateTimes <- seq(from = start, by = interval, to = end)

# Draw exactly one Poisson value per timestamp. Sizing the sample from the
# grid (instead of a hard-coded count) keeps both columns the same length if
# start, end or interval are changed.
Values <- rpois(length(DateTimes), lambda = 75)

cpu_df <- tibble(datetime = DateTimes, Value = Values)
# find and plot outliers of all days ========================================
# Threshold over the whole series: the 0.975 quantile of all values.
upper_bound <- quantile(cpu_df[["Value"]], probs = 0.975)

# Rows whose value lies strictly above the threshold.
outlier_ind <- which(cpu_df[["Value"]] > upper_bound)
cpu_df_susp <- cpu_df[outlier_ind, ]

# All points in green, outliers over-plotted in red, threshold as dashed line.
alldays_plot <- ggplot(data = cpu_df, aes(x = datetime, y = Value)) +
  geom_point(size = 0.9, color = "darkgreen") +
  geom_point(data = cpu_df_susp, color = "red", size = 1) +
  geom_hline(yintercept = upper_bound, linetype = "dashed", color = "red") +
  theme_bw() +
  labs(
    x = "",
    title = paste0(
      "% Processor Time, _Total, Percentile: 0.975, Threshold: ",
      round(upper_bound, 2)
    )
  )
# ========== convert to xts ====================================================
suppressMessages(library(xts))
cpu_df_xts <- xts(x = cpu_df$Value, order.by = cpu_df$datetime)
# List with one xts object per day.
days <- split(cpu_df_xts, f = "days")
if (length(days) == 0L) {
  stop("No data available to split into days.", call. = FALSE)
}

#========= find worst day - with biggest number of outliers
# Each day is judged against its own threshold (the 0.975 quantile of that
# day's values). Starting the running maximum below zero guarantees that a day
# is always selected, even when no day has a value above its threshold.
outliers_number <- -1L
worstday_index <- 0L
worst_day_outliers_ind <- integer(0)
worst_day_upper_bound <- NA_real_
for (i in seq_along(days)) {
  day_values <- coredata(days[[i]])
  day_upper_bound <- quantile(day_values, 0.975)
  day_outlier_ind <- which(day_values > day_upper_bound)
  outlier_day_number <- length(day_outlier_ind)
  # Strict comparison: on ties the earliest day wins.
  if (outlier_day_number > outliers_number) {
    worstday_index <- i
    outliers_number <- outlier_day_number
    worst_day_outliers_ind <- day_outlier_ind
    worst_day_upper_bound <- day_upper_bound
  }
}
WorstDay <- days[[worstday_index]]

# upper_bound is read again when the worst day is plotted, so leave it holding
# the worst day's threshold rather than the threshold of the last day visited.
upper_bound <- worst_day_upper_bound
# find outliers of worst day ====================================================
worst_day_outliers <- WorstDay[worst_day_outliers_ind, ]

# convert xts back to tibble
# coredata() on an xts object returns a one-column matrix; as.vector() drops
# the dimensions so Value is a plain vector column, as it is in cpu_df.
WorstDayTibble <- tibble(
  datetime = index(WorstDay),
  Value = as.vector(coredata(WorstDay))
)
outliersTibble <- tibble(
  datetime = index(worst_day_outliers),
  Value = as.vector(coredata(worst_day_outliers))
)
# plot worst day ====================================================
# Recompute the threshold from the worst day's own values so that the dashed
# line and the title always match the red points plotted for this day.
worst_day_bound <- quantile(coredata(WorstDay), 0.975)

# Worst day's points in green, its outliers in red, its threshold dashed.
worstDay_Plot <- ggplot(data = WorstDayTibble, aes(x = datetime, y = Value)) +
  geom_point(size = 0.9, color = "darkgreen") +
  geom_point(data = outliersTibble, color = "red", size = 1) +
  geom_hline(yintercept = worst_day_bound, linetype = "dashed", color = "red") +
  theme_bw() +
  labs(
    x = "",
    title = paste0(
      "% Processor Time, _Total, Percentile: 0.975, Threshold: ",
      round(worst_day_bound, 2)
    )
  )

# Show the all-days plot and the worst-day plot together.
library(ggpubr)
ggpubr::ggarrange(alldays_plot, worstDay_Plot)
Here is the result:
What I don't like about my code: to split the data into days and search through them, I need to convert it to xts. To plot the data with ggplot2, I have to convert it back to a tibble. Is it possible to avoid that double conversion and make the code simpler?
You don't need to convert your data to xts and back. Keeping the data in a data frame/tibble, you can get the worst day using:
library(dplyr)
# Add a date column.
# as.Date() on a POSIXct defaults to tz = "UTC", which can assign timestamps
# near midnight to the wrong day. Formatting first takes the calendar date in
# the timestamps' own time zone.
cpu_df <- cpu_df %>%
  mutate(date = as.Date(format(datetime, "%Y-%m-%d")))

# For each date, count the values above that date's 0.975 quantile, keep the
# date with the most such values (the first one on ties), and join back to
# cpu_df to recover every row of that date.
WorstDay <- cpu_df %>%
  group_by(date) %>%
  summarise(n = sum(Value > quantile(Value, 0.975))) %>%
  slice(which.max(n)) %>%
  left_join(cpu_df, by = "date")
You can use this data for plotting.