Search code examples
sqlrdplyrdbplyr

How to use `last()` when mutating by group with {dbplyr}?


Consider the following remote table:

library(dbplyr)
library(dplyr, w = F)
remote_data <- memdb_frame(
  grp = c(2, 2, 2, 1, 3, 1, 1),
  win = c("B", "C", "A", "B", "C", "A", "C"),
  id = c(1,3,5,7,2,4,6),
)

I wish to group by grp, order by win and take the last id. This is fairly straightforward if I collect first

# intended output when collecting first
remote_data %>% 
  collect() %>% 
  arrange(grp, win) %>% 
  group_by(grp) %>% 
  mutate(last_id = last(id)) %>% 
  ungroup()
#> # A tibble: 7 × 4
#>     grp win      id last_id
#>   <dbl> <chr> <dbl>   <dbl>
#> 1     1 A         4       6
#> 2     1 B         7       6
#> 3     1 C         6       6
#> 4     2 A         5       3
#> 5     2 B         1       3
#> 6     2 C         3       3
#> 7     3 C         2       2

However I cannot directly convert this to {dbplyr} code by removing collect(), though the SQL code doesn't look bad, what's happening here ?

remote_data %>% 
  arrange(grp, win) %>% 
  group_by(grp) %>% 
  mutate(last_id = last(id)) %>% 
  ungroup() %>% 
  print() %>% 
  show_query()
#> # Source:     SQL [7 x 4]
#> # Database:   sqlite 3.39.4 [:memory:]
#> # Ordered by: grp, win
#>     grp win      id last_id
#>   <dbl> <chr> <dbl>   <dbl>
#> 1     1 A         4       4
#> 2     1 B         7       7
#> 3     1 C         6       6
#> 4     2 A         5       5
#> 5     2 B         1       1
#> 6     2 C         3       3
#> 7     3 C         2       2
#> <SQL>
#> SELECT
#>   *,
#>   LAST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `grp`, `win`) AS `last_id`
#> FROM `dbplyr_001`
#> ORDER BY `grp`, `win`

dbplyr::window_order() allows us to override th ORDER BY clause created by the group_by(), I tried window_order(,win), but no cookie:

remote_data %>% 
  arrange(grp, win) %>% 
  group_by(grp) %>% 
  window_order(win) %>% 
  mutate(last_id = last(id)) %>% 
  ungroup() %>% 
  print() %>% 
  show_query()
#> # Source:     SQL [7 x 4]
#> # Database:   sqlite 3.39.4 [:memory:]
#> # Ordered by: win
#>     grp win      id last_id
#>   <dbl> <chr> <dbl>   <dbl>
#> 1     1 A         4       4
#> 2     1 B         7       7
#> 3     1 C         6       6
#> 4     2 A         5       5
#> 5     2 B         1       1
#> 6     2 C         3       3
#> 7     3 C         2       2
#> <SQL>
#> SELECT *, LAST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `win`) AS `last_id`
#> FROM `dbplyr_001`
#> ORDER BY `grp`, `win`

For some reason window_order(,grp) does trigger a window calculation but not with the expected order:

remote_data %>% 
  arrange(grp, win) %>% 
  group_by(grp) %>% 
  window_order(grp) %>% 
  mutate(last_id = last(id)) %>% 
  ungroup() %>% 
  print() %>% 
  show_query()
#> # Source:     SQL [7 x 4]
#> # Database:   sqlite 3.39.4 [:memory:]
#> # Ordered by: grp
#>     grp win      id last_id
#>   <dbl> <chr> <dbl>   <dbl>
#> 1     1 A         4       6
#> 2     1 B         7       6
#> 3     1 C         6       6
#> 4     2 A         5       5
#> 5     2 B         1       5
#> 6     2 C         3       5
#> 7     3 C         2       2
#> <SQL>
#> SELECT *, LAST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `grp`) AS `last_id`
#> FROM `dbplyr_001`
#> ORDER BY `grp`, `win`

What can I do to keep my initial output with only remote computations, preferably {dbplyr} code ?


Solution

  • It seems like you need to use window_frame():

    library(dbplyr)
    library(dplyr, w = F)
    remote_data <- memdb_frame(
      grp = c(2, 2, 2, 1, 3, 1, 1),
      win = c("B", "C", "A", "B", "C", "A", "C"),
      id = c(1,3,5,7,2,4,6),
    )
    
    remote_data %>% 
      group_by(grp) %>% 
      window_order(win) %>% 
      window_frame() |> 
      mutate(last_id = last(id)) %>% 
      ungroup() %>% 
      print() %>% 
      show_query()
    #> # Source:     SQL [7 x 4]
    #> # Database:   sqlite 3.39.4 [:memory:]
    #> # Ordered by: win
    #>     grp win      id last_id
    #>   <dbl> <chr> <dbl>   <dbl>
    #> 1     1 A         4       6
    #> 2     1 B         7       6
    #> 3     1 C         6       6
    #> 4     2 A         5       3
    #> 5     2 B         1       3
    #> 6     2 C         3       3
    #> 7     3 C         2       2
    #> <SQL>
    #> SELECT
    #>   *,
    #>   LAST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `win` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `last_id`
    #> FROM `dbplyr_001`
    

    Created on 2022-12-05 with reprex v2.0.2

    This feels like a bug in dbplyr to me. Can you please open an issue in the dbplyr repo? Then I can fix this for the next dbplyr release.