Search code examples
rdplyrdatatabletidytable

How to generate list objects within tidytable mutate rowwise?


I have the following want table, which are generated with rowwise() and mutate(). The main issue is that this solution is too slow.

The actual dataset contains around 15,000,000 rows, and took around 6 hours to complete.

What I hope to do is to convert the rowwise() and mutate() operations into tidytable so that it can leverage on data.table speed. Or maybe I just need a base R solution?

set.seed(1990)
mydf <- tibble(id = as.vector(outer(letters, letters, paste0))[1:10]
               , open_week = rep(1:5,2)) %>%
  mutate(close_week = open_week + sample(1:5,10, replace = T)) %>%
  arrange(open_week)
mydf
# some are closed, some are not closed # if not closed, set to NA
mydf$close_week[sample(c(TRUE, FALSE),10, replace = T, prob = c(0.1,0.9))] <- NA


mydf

# A tibble: 10 x 3
   id    open_week close_week
   <chr>     <int>      <int>
 1 aa            1          2
 2 fa            1          4
 3 ba            2          4
 4 ga            2         NA
 5 ca            3          7
 6 ha            3          6
 7 da            4          6
 8 ia            4          5
 9 ea            5          7
10 ja            5          9

# calculate up to the last week
week_last <- max(mydf$close_week, na.rm = T)

# create complete week grid
df <- as_tibble(data.frame(week = seq(from = min(mydf$open_week, na.rm = T)
                                     , to = max(mydf$close_week, na.rm = T), by = 1)))

have <- df %>% 
  rowwise() %>% 
  mutate( # which IDs are active - for the rowwise week?
         active_id_list = list(mydf$id[week >= mydf$open_week & 
                                                          week < ifelse(is.na(mydf$close_week),
                                                                        week_last +1,
                                                                        mydf$close_week)]),
         # what are the ages of the IDs - for the rowwise week?
         active_id_age_list = list(week - mydf$open_week[week >= mydf$open_week & 
                                                            week < ifelse(is.na(mydf$close_week),
                                                                                   week +1,
                                                                          mydf$close_week)]),
         # which IDs have age less than 1 week, more than 1 week - for the rowwise week?
         active_id_less_1_week_list = list(active_id_list[active_id_age_list < 1]),
         active_id_above_1_week_list = list(active_id_list[active_id_age_list >= 1]),
         
         # how many active IDs based on age less than 1 week, age more than 1 week - for the rowwise week?
         active_id_less_1_week = sum(active_id_age_list < 1, na.rm = T),
         active_id_above_1_week = sum(active_id_age_list >= 1, na.rm = T),

         # how many active IDs in total?
         active_id_count = length(active_id_age_list)) %>% 
  ungroup() %>% 
  dplyr::select(!where(is.list)) # remove the list object, unless want to inspect the actual ID list

have

# A tibble: 9 x 4
   week active_id_less_1_week active_id_above_1_week active_id_count
  <dbl>                 <int>                  <int>           <int>
1     1                     2                      0               2
2     2                     2                      1               3
3     3                     2                      3               5
4     4                     2                      3               5
5     5                     2                      4               6
6     6                     0                      4               4
7     7                     0                      2               2
8     8                     0                      2               2
9     9                     0                      1               1

I tried replacing the rowwise() and mutate() with tidytable::mutate_rowwise.() as per https://markfairbanks.github.io/tidytable/reference/mutate_rowwise..html

But I'm not sure how to interpret the following error


have <- df %>% 
  tidytable::mutate_rowwise.( # which IDs are active - for the rowwise week?
    active_id_list = list(mydf$id[week >= mydf$open_week & 
                                    week < ifelse(is.na(mydf$close_week),
                                                  week_last +1,
                                                  mydf$close_week)]),
    # what are the ages of the IDs - for the rowwise week?
    active_id_age_list = list(week - mydf$open_week[week >= mydf$open_week & 
                                                      week < ifelse(is.na(mydf$close_week),
                                                                    week +1,
                                                                    mydf$close_week)]),
    # which IDs have age less than 1 week, more than 1 week - for the rowwise week?
    active_id_less_1_week_list = list(active_id_list[active_id_age_list < 1]),
    active_id_above_1_week_list = list(active_id_list[active_id_age_list >= 1]),
    
    # how many active IDs based on age less than 1 week, age more than 1 week - for the rowwise week?
    active_id_less_1_week = sum(active_id_age_list < 1, na.rm = T),
    active_id_above_1_week = sum(active_id_age_list >= 1, na.rm = T),
    
    # how many active IDs in total?
    active_id_count = length(active_id_age_list)) %>% 
  ungroup() %>% 
  dplyr::select(!where(is.list)) # remove the list object, unless want to inspect the actual ID list

Error in `[.data.table`(list(week = c(1, 2, 3, 4, 5, 6, 7, 8, 9), .rowwise_id = 1:9),  : 
  'list' object cannot be coerced to type 'double'

Solution

  • The error occurs in subsetting the list element i.e. we are not extracting the list element. It can be done with [[

    df %>% 
      tidytable::mutate_rowwise.( # which IDs are active - for the rowwise week?
        active_id_list = list(mydf$id[week >= mydf$open_week & 
                                        week < ifelse(is.na(mydf$close_week),
                                                      week_last +1,
                                                      mydf$close_week)]),
        # what are the ages of the IDs - for the rowwise week?
        active_id_age_list = list(week - mydf$open_week[week >= mydf$open_week & 
                                                          week < ifelse(is.na(mydf$close_week),
                                                                        week +1,
                                                                        mydf$close_week)]), active_id_less_1_week_list = list(active_id_list[active_id_age_list[[1]] < 1]))