I have some data which looks like:
date ID var1 var2 var3
<date> <chr> <dbl> <dbl> <dbl>
1 2005-02-22 5D0EAE -0.682 -0.682 -0.682
2 2005-04-29 5D0EAE 0.458 0.458 0.458
3 2005-06-28 80D368 0.178 0.0276 0.0435
4 2005-06-29 80D368 0.563 0.54 0.575
5 2005-07-06 7CCD06 0.36 0.36 0.36
6 2005-07-08 7CCD06 0.64 0.64 0.64
7 2005-07-12 7CCD06 -0.74 NA NA
I have 3 different variables which report data on infrequent dates. I want to use the 'rollapply' function to take the last n
reported values and apply a time series computation to it.
i.e. running:
df %>%
group_by(ID) %>%
summarise(n = n())
Gives,
# A tibble: 3 × 2
ID n
<chr> <int>
1 5D0EAE 2
2 7CCD06 71
3 80D368 29
So the first 'ID' will be "ignored" if n = 5
, the other 2 ID's will apply the rolling function for 5 time periods.
I can pad
the dates using the following:
df2 <- df %>%
complete(date = seq.Date(min(date), max(date), by = "day")) %>% # pad the dates so all companies have daily time series (make the time series complete, filling missing obs with NA's
na_if(0) %>%
arrange(date, ID)
I don't necessarily want to take the last 5 days as is usually in the rollapply function but I want to take the last 5 reported values (by group)
df2 %>%
group_by(ID) %>%
mutate(
myOut = zoo::rollapply(., width = 5, FUN = mean, by = 1, by.column = FALSE)
) # gives an error
So, my question is, how can I apply the zoo
rollapply function to the last 5 reported values for a given ID and not the last 5 reported time periods. Ignoring cases when the number of reported values is less than a threshold (i.e. 5).
Each of the ID's have different lengths.
Data:
df <- structure(list(date = structure(c(12836, 12902, 12962, 12963,
12970, 12972, 12976, 12986, 12989, 12991, 12999, 13000, 13004,
13011, 13020, 13021, 13024, 13032, 13033, 13047, 13049, 13053,
13053, 13054, 13062, 13063, 13068, 13069, 13070, 13073, 13074,
13087, 13090, 13090, 13091, 13096, 13101, 13110, 13117, 13117,
13118, 13119, 13126, 13138, 13139, 13152, 13153, 13154, 13157,
13181, 13182, 13193, 13207, 13207, 13228, 13229, 13230, 13245,
13245, 13250, 13251, 13271, 13272, 13273, 13280, 13300, 13307,
13313, 13315, 13320, 13321, 13339, 13363, 13364, 13383, 13391,
13399, 13410, 13437, 13438, 13440, 13445, 13455, 13459, 13516,
13522, 13552, 13553, 13557, 13558, 13566, 13567, 13571, 13572,
13573, 13574, 13577, 13578, 13580, 13581, 13584, 13585), class = "Date"),
ID = c("5D0EAE", "5D0EAE", "80D368", "80D368", "7CCD06",
"7CCD06", "7CCD06", "80D368", "7CCD06", "7CCD06", "7CCD06",
"7CCD06", "7CCD06", "7CCD06", "80D368", "80D368", "80D368",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "80D368",
"80D368", "7CCD06", "7CCD06", "80D368", "80D368", "80D368",
"80D368", "80D368", "7CCD06", "7CCD06", "80D368", "7CCD06",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "80D368", "80D368",
"7CCD06", "7CCD06", "80D368", "7CCD06", "80D368", "80D368",
"80D368", "7CCD06", "7CCD06", "7CCD06", "80D368", "7CCD06",
"80D368", "80D368", "80D368", "7CCD06", "7CCD06", "80D368",
"7CCD06", "7CCD06", "80D368", "7CCD06", "7CCD06", "7CCD06",
"80D368", "7CCD06", "7CCD06", "7CCD06", "80D368", "80D368",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06",
"7CCD06"), var1 = c(-0.681666666666667, 0.458, 0.1784375,
0.563333333333333, 0.36, 0.64, -0.74, 0.94, 0.95, 0.95, -0.0876923076923077,
-0.633333333333333, 0, -0.58, 0.52, 0.61, -0.74, 0.55, 0.55,
0.01, -0.478823529411765, -0.58, -0.74, 0.32, -0.74, 0, 0.32,
0.246666666666667, 0.0000000000000000130678250083694, 0,
0, -0.58, -0.0248, 0.95, -0.428, 0.94, -0.74, 0.94, 0.39,
0.25, 0.61, 0.01, 0, 0.32, 0.65, 0.32, 0.228888888888889,
0.18, 0, 0.112962962962963, 0.256923076923077, 0.94, 0.63,
0, 0.262380952380952, 0.7, 0.7, 0, 0, 0, 0.46, -0.58, 0.27,
0.648, 0.61, 0.305, 0.64, 0.035, 0.7, 0.18037037037037, 0.413333333333333,
0, 0.23, 0.656, 0.55, -0.9, -0.98, -0.58, -0.98, -0.98, -0.58,
-0.98, -0.272068965517241, 0.88, 0.554, 0, -0.30125, -0.4025,
0.62, -0.67, 0.62, 0.62, -0.67, -0.825, 0.62, 0.24, -0.5336364,
-0.08, 0.61, -0.9, -0.5146154, 0.16), var2 = c(-0.681666666666667,
0.458, 0.0276470588235294, 0.54, 0.36, 0.64, NA, 0.94, 0.95,
0.95, -0.0744444444444445, -0.633333333333333, NA, -0.58,
0.514, NA, -0.74, NA, NA, 0.01, -0.37, -0.58, NA, 0.32, -0.74,
0, NA, -0.0825, 0.04625, 0, 0, -0.58, -0.14875, 0.95, -0.35,
0.94, NA, 0.94, 0.295, 0.5, NA, 0.01, NA, 0.32, 0.95, NA,
0.126315789473684, 0.18, 0, 0.281111111111111, 0.256923076923077,
0.94, 0.63, 0, 0.295714285714286, 0.7, 0.7, NA, NA, NA, 0.46,
-0.58, 0.5725, 0.648, 0.61, 0.305, 0.64, 0.69, NA, 0.218846153846154,
0.7, 0, 0.26, 0.656, NA, -0.9, -0.98, -0.58, NA, NA, -0.58,
-0.98, -0.192857142857143, 0.88, 0.56, 0, -0.363913, -0.475,
0.62, NA, 0.62, 0.62, NA, NA, 0.62, 0.2966667, -0.53875,
0.08666667, 0.61, NA, -0.4157143, 0.2), var3 = c(-0.681666666666667,
0.458, 0.0435, 0.575, 0.36, 0.64, NA, 0.94, 0.95, 0.95, -0.00500000000000001,
-0.633333333333333, NA, -0.58, 0.514, NA, -0.74, 0.55, NA,
0.01, -0.317142857142857, -0.58, NA, 0.32, -0.74, 0, NA,
0.123076923076923, 0.04625, 0, 0, -0.58, -0.1025, 0.95, -0.35,
0.94, NA, 0.94, 0.295, 0.5, NA, 0.01, NA, 0.32, 0.75, NA,
0.126315789473684, 0.18, 0, 0.177857142857143, 0.256923076923077,
0.94, 0.63, 0, 0.28551724137931, 0.7, 0.7, 0, NA, NA, 0.46,
-0.58, 0.234285714285714, 0.648, 0.61, 0.305, 0.64, -0.156666666666667,
NA, 0.221333333333333, 0.62, 0, 0.26, 0.656, 0.55, -0.9,
-0.98, -0.58, NA, NA, -0.58, -0.98, -0.18, 0.88, 0.56, 0,
-0.3584615, -0.475, 0.62, NA, 0.62, 0.62, NA, NA, 0.62, 0.2966667,
-0.53875, 0.08666667, 0.61, -0.82, -0.36375, 0.2)), row.names = c(NA,
-102L), class = c("tbl_df", "tbl", "data.frame"))
We may use across
- after the complete
the 'ID' was not fill
ed. Maybe, we do the fill
and then grouped by 'ID', loop across
the 'var' columns and apply the rollapply
library(zoo)
library(dplyr)
library(tidyr)
df2 %>%
fill(ID) %>%
group_by(ID) %>%
mutate(across(starts_with('var'),
~ zoo::rollapply(., width = 5, FUN = mean, na.rm = TRUE,
fill = NA, by = 1))) %>%
ungroup
Or make a correction to the above by only doing the rollapply
on the last 5 non-NA elements (i1
)
library(tidyr)
df2 %>%
fill(ID) %>%
group_by(ID) %>%
mutate((across(starts_with('var'), ~ {
i1 <- tail(row_number()[!is.na(.)], 5)
replace(., i1, rollapply(.[i1], width = 5, FUN = mean, partial = TRUE))
})))