Search code examples
rrollapply

rollapply for last reported values and not last reported time periods


I have some data which looks like:

   date       ID       var1    var2    var3
   <date>     <chr>   <dbl>   <dbl>   <dbl>
 1 2005-02-22 5D0EAE -0.682 -0.682  -0.682 
 2 2005-04-29 5D0EAE  0.458  0.458   0.458 
 3 2005-06-28 80D368  0.178  0.0276  0.0435
 4 2005-06-29 80D368  0.563  0.54    0.575 
 5 2005-07-06 7CCD06  0.36   0.36    0.36  
 6 2005-07-08 7CCD06  0.64   0.64    0.64  
 7 2005-07-12 7CCD06 -0.74  NA      NA    

I have 3 different variables which report data on infrequent dates. I want to use the 'rollapply' function to take the last n reported values and apply a time series computation to it.

i.e. running:

df %>% 
  group_by(ID) %>% 
  summarise(n = n())

Gives,

# A tibble: 3 × 2
  ID         n
  <chr>  <int>
1 5D0EAE     2
2 7CCD06    71
3 80D368    29

So the first 'ID' will be "ignored" if n = 5, the other 2 ID's will apply the rolling function for 5 time periods.

I can pad the dates using the following:

df2 <- df %>% 
  complete(date = seq.Date(min(date), max(date), by = "day")) %>% # pad the dates so all companies have daily time series (make the time series complete, filling missing obs with NA's 
  na_if(0) %>% 
  arrange(date, ID)

I don't necessarily want to take the last 5 days as is usually in the rollapply function but I want to take the last 5 reported values (by group)

df2 %>% 
  group_by(ID) %>% 
  mutate(
    myOut = zoo::rollapply(., width = 5, FUN = mean, by = 1, by.column = FALSE)
  ) # gives an error

So, my question is, how can I apply the zoo rollapply function to the last 5 reported values for a given ID and not the last 5 reported time periods. Ignoring cases when the number of reported values is less than a threshold (i.e. 5).

Each of the ID's have different lengths.

Data:

df <- structure(list(date = structure(c(12836, 12902, 12962, 12963, 
12970, 12972, 12976, 12986, 12989, 12991, 12999, 13000, 13004, 
13011, 13020, 13021, 13024, 13032, 13033, 13047, 13049, 13053, 
13053, 13054, 13062, 13063, 13068, 13069, 13070, 13073, 13074, 
13087, 13090, 13090, 13091, 13096, 13101, 13110, 13117, 13117, 
13118, 13119, 13126, 13138, 13139, 13152, 13153, 13154, 13157, 
13181, 13182, 13193, 13207, 13207, 13228, 13229, 13230, 13245, 
13245, 13250, 13251, 13271, 13272, 13273, 13280, 13300, 13307, 
13313, 13315, 13320, 13321, 13339, 13363, 13364, 13383, 13391, 
13399, 13410, 13437, 13438, 13440, 13445, 13455, 13459, 13516, 
13522, 13552, 13553, 13557, 13558, 13566, 13567, 13571, 13572, 
13573, 13574, 13577, 13578, 13580, 13581, 13584, 13585), class = "Date"), 
    ID = c("5D0EAE", "5D0EAE", "80D368", "80D368", "7CCD06", 
    "7CCD06", "7CCD06", "80D368", "7CCD06", "7CCD06", "7CCD06", 
    "7CCD06", "7CCD06", "7CCD06", "80D368", "80D368", "80D368", 
    "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "80D368", 
    "80D368", "7CCD06", "7CCD06", "80D368", "80D368", "80D368", 
    "80D368", "80D368", "7CCD06", "7CCD06", "80D368", "7CCD06", 
    "7CCD06", "7CCD06", "7CCD06", "7CCD06", "80D368", "80D368", 
    "7CCD06", "7CCD06", "80D368", "7CCD06", "80D368", "80D368", 
    "80D368", "7CCD06", "7CCD06", "7CCD06", "80D368", "7CCD06", 
    "80D368", "80D368", "80D368", "7CCD06", "7CCD06", "80D368", 
    "7CCD06", "7CCD06", "80D368", "7CCD06", "7CCD06", "7CCD06", 
    "80D368", "7CCD06", "7CCD06", "7CCD06", "80D368", "80D368", 
    "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", 
    "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", 
    "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", 
    "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", 
    "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", 
    "7CCD06"), var1 = c(-0.681666666666667, 0.458, 0.1784375, 
    0.563333333333333, 0.36, 0.64, -0.74, 0.94, 0.95, 0.95, -0.0876923076923077, 
    -0.633333333333333, 0, -0.58, 0.52, 0.61, -0.74, 0.55, 0.55, 
    0.01, -0.478823529411765, -0.58, -0.74, 0.32, -0.74, 0, 0.32, 
    0.246666666666667, 0.0000000000000000130678250083694, 0, 
    0, -0.58, -0.0248, 0.95, -0.428, 0.94, -0.74, 0.94, 0.39, 
    0.25, 0.61, 0.01, 0, 0.32, 0.65, 0.32, 0.228888888888889, 
    0.18, 0, 0.112962962962963, 0.256923076923077, 0.94, 0.63, 
    0, 0.262380952380952, 0.7, 0.7, 0, 0, 0, 0.46, -0.58, 0.27, 
    0.648, 0.61, 0.305, 0.64, 0.035, 0.7, 0.18037037037037, 0.413333333333333, 
    0, 0.23, 0.656, 0.55, -0.9, -0.98, -0.58, -0.98, -0.98, -0.58, 
    -0.98, -0.272068965517241, 0.88, 0.554, 0, -0.30125, -0.4025, 
    0.62, -0.67, 0.62, 0.62, -0.67, -0.825, 0.62, 0.24, -0.5336364, 
    -0.08, 0.61, -0.9, -0.5146154, 0.16), var2 = c(-0.681666666666667, 
    0.458, 0.0276470588235294, 0.54, 0.36, 0.64, NA, 0.94, 0.95, 
    0.95, -0.0744444444444445, -0.633333333333333, NA, -0.58, 
    0.514, NA, -0.74, NA, NA, 0.01, -0.37, -0.58, NA, 0.32, -0.74, 
    0, NA, -0.0825, 0.04625, 0, 0, -0.58, -0.14875, 0.95, -0.35, 
    0.94, NA, 0.94, 0.295, 0.5, NA, 0.01, NA, 0.32, 0.95, NA, 
    0.126315789473684, 0.18, 0, 0.281111111111111, 0.256923076923077, 
    0.94, 0.63, 0, 0.295714285714286, 0.7, 0.7, NA, NA, NA, 0.46, 
    -0.58, 0.5725, 0.648, 0.61, 0.305, 0.64, 0.69, NA, 0.218846153846154, 
    0.7, 0, 0.26, 0.656, NA, -0.9, -0.98, -0.58, NA, NA, -0.58, 
    -0.98, -0.192857142857143, 0.88, 0.56, 0, -0.363913, -0.475, 
    0.62, NA, 0.62, 0.62, NA, NA, 0.62, 0.2966667, -0.53875, 
    0.08666667, 0.61, NA, -0.4157143, 0.2), var3 = c(-0.681666666666667, 
    0.458, 0.0435, 0.575, 0.36, 0.64, NA, 0.94, 0.95, 0.95, -0.00500000000000001, 
    -0.633333333333333, NA, -0.58, 0.514, NA, -0.74, 0.55, NA, 
    0.01, -0.317142857142857, -0.58, NA, 0.32, -0.74, 0, NA, 
    0.123076923076923, 0.04625, 0, 0, -0.58, -0.1025, 0.95, -0.35, 
    0.94, NA, 0.94, 0.295, 0.5, NA, 0.01, NA, 0.32, 0.75, NA, 
    0.126315789473684, 0.18, 0, 0.177857142857143, 0.256923076923077, 
    0.94, 0.63, 0, 0.28551724137931, 0.7, 0.7, 0, NA, NA, 0.46, 
    -0.58, 0.234285714285714, 0.648, 0.61, 0.305, 0.64, -0.156666666666667, 
    NA, 0.221333333333333, 0.62, 0, 0.26, 0.656, 0.55, -0.9, 
    -0.98, -0.58, NA, NA, -0.58, -0.98, -0.18, 0.88, 0.56, 0, 
    -0.3584615, -0.475, 0.62, NA, 0.62, 0.62, NA, NA, 0.62, 0.2966667, 
    -0.53875, 0.08666667, 0.61, -0.82, -0.36375, 0.2)), row.names = c(NA, 
-102L), class = c("tbl_df", "tbl", "data.frame"))

Solution

  • We may use across - after the complete the 'ID' was not filled. Maybe, we do the fill and then grouped by 'ID', loop across the 'var' columns and apply the rollapply

    library(zoo)
    library(dplyr)
    library(tidyr)
    df2 %>%
       fill(ID) %>%
       group_by(ID) %>% 
       mutate(across(starts_with('var'),
        ~ zoo::rollapply(., width = 5, FUN = mean, na.rm = TRUE, 
            fill = NA, by = 1))) %>%
       ungroup
    

    Or make a correction to the above by only doing the rollapply on the last 5 non-NA elements (i1)

    library(tidyr)
    df2 %>% 
       fill(ID) %>% 
       group_by(ID) %>% 
       mutate((across(starts_with('var'), ~ {
         i1 <- tail(row_number()[!is.na(.)], 5)
        replace(., i1, rollapply(.[i1], width = 5, FUN = mean, partial = TRUE))
       })))