Search code examples
rdplyrdata.tablelead

dplyr::lead or data.table::shift refer to variable value rather than scalar


Given:

library(tidyverse)
df <- data.frame(id = c(1, 1, 1, 1, 1,
                        rep(2, 5), rep(3, 3)),
                 dates = as.Date(c("2015-01-01",
                                   "2015-01-02",
                                   "2015-01-02",
                                   "2015-01-03",
                                   "2015-01-04",
                                   "2015-02-22",
                                   "2015-02-23",
                                   "2015-02-23",
                                   "2015-02-23",
                                   "2015-02-25",
                                   "2015-03-13",
                                   "2015-03-14",
                                   "2015-03-15")),
                 indicator = c(0, 1, 0, 0, 0,
                               0, 1, 0, 0, 0,
                               0, 1, 0),
                 final_date = as.Date(rep(NA, 13))) %>% 
  group_by(id, dates) %>% 
  mutate(repeat_days = n())
df
#       id dates      indicator final_date repeat_days
#    <dbl> <date>         <dbl> <date>           <int>
#  1     1 2015-01-01         0 NA                   1
#  2     1 2015-01-02         1 NA                   2
#  3     1 2015-01-02         0 NA                   2
#  4     1 2015-01-03         0 NA                   1
#  5     1 2015-01-04         0 NA                   1
#  6     2 2015-02-22         0 NA                   1
#  7     2 2015-02-23         1 NA                   3
#  8     2 2015-02-23         0 NA                   3
#  9     2 2015-02-23         0 NA                   3
# 10     2 2015-02-25         0 NA                   1
# 11     3 2015-03-13         0 NA                   1
# 12     3 2015-03-14         1 NA                   1
# 13     3 2015-03-15         0 NA                   1

Based on a condition (indicator == 1), I want to lead dates by a value in a variable (repeat_days) rather than supplying a scaler value so my desired output looks like:

#df_final
#       id dates      indicator final_date repeat_days
#    <dbl> <date>         <dbl> <date>           <int>
#  1     1 2015-01-01         0 NA                   1
#  2     1 2015-01-02         1 2015-01-03           2
#  3     1 2015-01-02         0 NA                   2
#  4     1 2015-01-03         0 NA                   1
#  5     1 2015-01-04         0 NA                   1
#  6     2 2015-02-22         0 NA                   1
#  7     2 2015-02-23         1 2015-02-25           3
#  8     2 2015-02-23         0 NA                   3
#  9     2 2015-02-23         0 NA                   3
# 10     2 2015-02-25         0 NA                   1
# 11     3 2015-03-13         0 NA                   1
# 12     3 2015-03-14         1 2015-03-15           1
# 13     3 2015-03-15         0 NA                   1

If we wanted to lead by a scalar e.g. 1, this works:

df %>% 
  group_by(id) %>% 
  mutate(final_date = case_when(is.na(final_date) & indicator == 1 ~ 
                                  lead(dates, n = 1), TRUE ~ final_date)) 

But when I supply a variable it won't work as expected as its not a scalar:

df %>% 
  group_by(id) %>% 
  mutate(final_date = case_when(is.na(final_date) & indicator == 1 ~ 
                                  lead(dates, repeat_days), TRUE ~ final_date)) 
# Error: Problem with `mutate()` column `final_date`.
# i `final_date = case_when(...)`.
# x `n` must be a nonnegative integer scalar, not an integer vector of length 5.
# i The error occurred in group 1: id = 1.

This won't work either as it refers to the first occurrence of repeat_days by group which is 1 in all these cases:

df %>% 
  group_by(id) %>% 
  mutate(final_date = case_when(is.na(final_date) & indicator == 1 ~ 
                                  lead(dates, repeat_days[1]), TRUE ~ final_date))

Is there a way to refer to the row level value of repeat_days directly without creating an additional variable?

thanks


EDIT thanks to @Maël nice answer:

df %>% 
  group_by(id) %>% 
  mutate(final_date = case_when(is.na(final_date) & indicator == 1 ~ 
                                  lead(dates, repeat_days[indicator == 1]), 
                                TRUE ~ final_date))

I should have made it clear that I could also have repeat indicator == 1 per group so it would need to work on this dataset too:

df <- data.frame(id = c(1, 1, 1, 1, 1,
                        rep(2, 5), rep(3, 3), 4, 4),
                 dates = as.Date(c("2015-01-01",
                                   "2015-01-02",
                                   "2015-01-02",
                                   "2015-01-03",
                                   "2015-01-04",
                                   "2015-02-22",
                                   "2015-02-23",
                                   "2015-02-23",
                                   "2015-02-23",
                                   "2015-02-25",
                                   "2015-03-13",
                                   "2015-03-14",
                                   "2015-03-15",
                                   "2015-04-15",
                                   "2015-04-16")),
                 indicator = c(0, 1, 0, 1, 0,
                               0, 1, 0, 0, 0,
                               0, 1, 0, 0, 1),
                 final_date = as.Date(c("2015-01-01", rep(NA, 14)))) %>% 
  group_by(id, dates) %>% 
  mutate(repeat_days = n()) %>% 
  ungroup()
df
#       id dates      indicator final_date repeat_days
#    <dbl> <date>         <dbl> <date>           <int>
#  1     1 2015-01-01         0 2015-01-01           1
#  2     1 2015-01-02         1 NA                   2
#  3     1 2015-01-02         0 NA                   2
#  4     1 2015-01-03         1 NA                   1
#  5     1 2015-01-04         0 NA                   1
#  6     2 2015-02-22         0 NA                   1
#  7     2 2015-02-23         1 NA                   3
#  8     2 2015-02-23         0 NA                   3
#  9     2 2015-02-23         0 NA                   3
# 10     2 2015-02-25         0 NA                   1
# 11     3 2015-03-13         0 NA                   1
# 12     3 2015-03-14         1 NA                   1
# 13     3 2015-03-15         0 NA                   1
# 14     4 2015-04-15         0 NA                   1
# 15     4 2015-04-16         1 NA                   1

Note for id == 4, there is no lead date, so I want it to default to their current line in that case. Also the first row now already has a final_date value in it, hence the requirement to use case_when or something similar.

Desired output:

#       id dates      indicator final_date repeat_days
#    <dbl> <date>         <dbl> <date>           <int>
#  1     1 2015-01-01         0 2015-01-01           1
#  2     1 2015-01-02         1 2015-01-03           2
#  3     1 2015-01-02         0 NA                   2
#  4     1 2015-01-03         1 2015-01-04           1
#  5     1 2015-01-04         0 NA                   1
#  6     2 2015-02-22         0 NA                   1
#  7     2 2015-02-23         1 2015-02-25           3
#  8     2 2015-02-23         0 NA                   3
#  9     2 2015-02-23         0 NA                   3
# 10     2 2015-02-25         0 NA                   1
# 11     3 2015-03-13         0 NA                   1
# 12     3 2015-03-14         1 2015-03-15           1
# 13     3 2015-03-15         0 NA                   1
# 14     4 2015-04-15         0 NA                   1
# 15     4 2015-04-16         1 2015-04-16           1

Related links here, here and here but I couldn't implement something similar on this particular case with conditions. Happy to see data.table (shift?) solutions too.


Solution

  • It might be simpler to write a lead function that takes a vector of ns. Below I call this function lead2. The rest of your code remains the same.

    Update: You further clarify that, if indicator = 1 but there is no lead date, the final_date should be filled in with the current date. This can be implemented with dplyr::coalesce which finds the first non-null element in a vector. It's an analogue to the SQL COALESCE operator.

    library("tidyverse")
    
    df <- data.frame(
      id = c(
        1, 1, 1, 1, 1,
        rep(2, 5), rep(3, 3), 4, 4
      ),
      dates = as.Date(c(
        "2015-01-01",
        "2015-01-02",
        "2015-01-02",
        "2015-01-03",
        "2015-01-04",
        "2015-02-22",
        "2015-02-23",
        "2015-02-23",
        "2015-02-23",
        "2015-02-25",
        "2015-03-13",
        "2015-03-14",
        "2015-03-15",
        "2015-04-15",
        "2015-04-16"
      )),
      indicator = c(
        0, 1, 0, 1, 0,
        0, 1, 0, 0, 0,
        0, 1, 0, 0, 1
      ),
      final_date = as.Date(c("2015-01-01", rep(NA, 14)))
    ) %>%
      group_by(id, dates) %>%
      mutate(repeat_days = n()) %>%
      ungroup()
    
    lead2 <- function(x, ns) {
      # x: vector of values
      # ns: vector of leads
    
      # Compute the target position for each element
      is <- seq_along(x) + ns
      x[is]
    }
    
    xs <- c("a", "b", "c", "d", "e", "f")
    ns <- c(1, 1, 2, 3, 1, 2)
    lead2(xs, ns)
    #> [1] "b" "c" "e" NA  "f" NA
    
    df %>%
      group_by(id) %>%
      mutate(
        final_date = if_else(
          is.na(final_date) & indicator == 1,
            coalesce(lead2(dates, repeat_days), dates),
            final_date
        )
      )
    #> # A tibble: 15 × 5
    #> # Groups:   id [4]
    #>       id dates      indicator final_date repeat_days
    #>    <dbl> <date>         <dbl> <date>           <int>
    #>  1     1 2015-01-01         0 2015-01-01           1
    #>  2     1 2015-01-02         1 2015-01-03           2
    #>  3     1 2015-01-02         0 NA                   2
    #>  4     1 2015-01-03         1 2015-01-04           1
    #>  5     1 2015-01-04         0 NA                   1
    #>  6     2 2015-02-22         0 NA                   1
    #>  7     2 2015-02-23         1 2015-02-25           3
    #>  8     2 2015-02-23         0 NA                   3
    #>  9     2 2015-02-23         0 NA                   3
    #> 10     2 2015-02-25         0 NA                   1
    #> 11     3 2015-03-13         0 NA                   1
    #> 12     3 2015-03-14         1 2015-03-15           1
    #> 13     3 2015-03-15         0 NA                   1
    #> 14     4 2015-04-15         0 NA                   1
    #> 15     4 2015-04-16         1 2015-04-16           1
    

    Created on 2022-03-14 by the reprex package (v2.0.1)