Generate list of lead values

I have the below piece of code in R/Dplyr, which finds the lead value at t+5 for column VALUE. If that is not available, the latest value between t+1 and t+5 is used:

data <- data %>%
  group_by(id) %>%
  mutate(
    lead_value = coalesce(lead(VALUE, n = 5), 
                           lead(VALUE, n = 4),
                           lead(VALUE, n = 3),
                           lead(VALUE, n = 2),
                           lead(VALUE, n = 1))
)

Is there a way to generalize this so I could call it as a function, passing the column name and the n into it?

Example data with desired output:

data <- data.frame(
  yyyymm = c(202401, 202402, 202403, 202404, 202405, 202406, 202407, 202408, 202409, 202410, 202411, 202412,
             202401, 202402, 202403, 202404, 202405, 202406, 202407),
  id = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2),
  value = c(1,2,3,4,5,6,7,8,9,10,11,12,1,2,3,4,5,6,7),
  lead = c(6,7,8,9,10,11,12,12,12,12,12,NA,6,7,7,7,7,7,NA)
)

Example data with two id:

data2 <- data.frame(
  yyyymm = c(202401, 202402, 202403, 202404, 202405, 202406, 202407, 202408, 202409, 202410, 202411, 202412,
             202401, 202402, 202403, 202404, 202405, 202406, 202407,202401, 202402, 202403, 202404, 202405, 202406, 202407, 202408, 202409, 202410, 202411, 202412),
  id = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,3,3,3,3,3,3,3,3,3,3,3,3),
  id2 = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1,1,1,1,2,2,2,2,2,2,2,2),
  value = c(1,2,3,4,5,6,7,8,9,10,11,12,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8,9,10,11,12),
  lead = c(6,7,8,9,10,11,12,12,12,12,12,NA,6,7,7,7,7,7,NA,4,4,4,NA,10,11,12,12,12,12,12,NA)
)

Solution

The problem is that dplyr::lead does not accept a vector of integer in its n argument, so you have to rely on other functions, like collapse::flag, or data.table::shift.

The other problem is that dplyr::coalesce does not accept matrices or data.frames as input, contrary to data.table::fcoalesce. However, fcoalesce only accepts data.frames, not matrices, hence the use collapse::qDF, which is a quick conversion to DF.

And here you go, a quick and versatile solution!

library(collapse)
library(data.table)

data$lead_value <- fcoalesce(qDF(flag(data$VALUE, g = data$id, -(5:1))))

#Or, using a more dplyr-friendly writing:
data |>
  mutate(lead_value = fcoalesce(qDF(flag(VALUE, -(5:1)))), .by = id)

Output

data
#    yyyymm id VALUE lead lead_value
# 1  202401  1     1    6          6
# 2  202402  1     2    7          7
# 3  202403  1     3    8          8
# 4  202404  1     4    9          9
# 5  202405  1     5   10         10
# 6  202406  1     6   11         11
# 7  202407  1     7   12         12
# 8  202408  1     8   12         12
# 9  202409  1     9   12         12
# 10 202410  1    10   12         12
# 11 202411  1    11   12         12
# 12 202412  1    12   NA         NA
# 13 202401  2     1    6          6
# 14 202402  2     2    7          7
# 15 202403  2     3    7          7
# 16 202404  2     4    7          7
# 17 202405  2     5    7          7
# 18 202406  2     6    7          7
# 19 202407  2     7   NA         NA

When the size of the lag exceeds the length of the vector, flag throws an error, but shift does not:

data2 |>
  mutate(lead_value = fcoalesce(shift(value, -(5:1))), .by = c(id, id2))