Search code examples
r dplyr multidplyr

R: workaround for summarise_at in multidplyr?


I want to use multidplyr, but it does not yet support summarise_at. I have hundreds, if not thousands, of variables to summarise, so summarise_at is necessary, but unfortunately it is not available in multidplyr.

I am looking for an alternative that works around this limitation.

# Attach the tidyverse: tibble(), the %>% pipe and the dplyr verbs used below.
library("tidyverse")

# Example data: a grouping key with repeated values plus four numeric
# columns, each filled with ten floored uniform draws between 0 and 100.
df <- tibble(
  ID = c("a", "a", "b", "c", "c", "e", "e", "f", "g", "g"),
  var1 = floor(runif(10, min = 0, max = 100)),
  var2 = floor(runif(10, min = 0, max = 100)),
  var3 = floor(runif(10, min = 0, max = 100)),
  var4 = floor(runif(10, min = 0, max = 100))
)

# Attach multidplyr and create the cluster the examples partition onto.
library("multidplyr")
cluster <- new_cluster(5)

# Works: summarise_at() on the ordinary grouped tibble, summing every column
# whose name starts with "var". The partition()/collect() steps are commented
# out so that nothing is sent to the cluster.
df %>% 
  group_by(ID) %>% 
  #partition(cluster) %>% 
  summarise_at(.vars = vars(starts_with('var')),sum) 
  #collect()

# Works: after partition(cluster), a plain summarise() with every output
# column written out by hand (only var1 to var3 are listed here), followed by
# collect() to bring the result back.
df %>% 
  group_by(ID) %>% 
  partition(cluster) %>% 
  summarise(var1 = sum(var1),
            var2 = sum(var2),
            var3 = sum(var3)) %>% 
  collect()

# Does not work: the same summarise_at() call as the first example, but
# applied after partition(cluster). This is the failure being asked about.
df %>% 
  group_by(ID) %>% 
  partition(cluster) %>%
  summarise_at(.vars = vars(starts_with('var')),sum)  %>% 
  collect()

I have even tried building the summarise() call as a string and evaluating it:

# Collect the names of the columns to sum, turn each one into the text
# "<name> = sum(<name>)", and join the pieces with ", " so the result can be
# spliced into the argument list of a summarise() call.
sum_var <- select(df,starts_with('var')) %>% names()
sum_var_str <- paste0(sum_var," = sum(",sum_var,")")
sum_var_str <- str_c(sum_var_str, collapse = ", ")
> sum_var
[1] "var1" "var2" "var3" "var4"
> sum_var_str
[1] "var1 = sum(var1), var2 = sum(var2), var3 = sum(var3), var4 = sum(var4)"

# Works: on the ordinary grouped tibble, the summarise() call is assembled as
# text from sum_var_str, parsed, and evaluated inside the braces, where `.`
# refers to the piped-in data.
df %>% 
  group_by(ID) %>% 
  { eval(parse(text = sprintf("summarise(., %s, .groups = 'drop')", sum_var_str))) }

# Does not work: the same parse/eval construction as the previous example,
# but applied after partition(cluster) and followed by collect().
df %>% 
  group_by(ID) %>% 
  partition(cluster) %>%
  { eval(parse(text = sprintf("summarise(., %s, .groups = 'drop')", sum_var_str))) } %>%
  collect()


Solution

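  • Found the solution: assign the vector of column names to the cluster, load dplyr on the workers, and use across() with all_of() inside summarise():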

    library("dplyr")
    library("multidplyr")
    library("parallel")
    
    # detectCores() returns NA when the core count cannot be determined, and
    # using every core leaves none for the main session. Keep one core free
    # and always fall back to at least one worker.
    n_workers <- max(1L, detectCores() - 1L, na.rm = TRUE)
    cluster <- new_cluster(n_workers)
    
    # Example data: a grouping key with repeated values plus four numeric
    # columns, each filled with ten floored uniform draws between 0 and 100.
    df <- tibble(
      ID = c("a", "a", "b", "c", "c", "e", "e", "f", "g", "g"),
      var1 = floor(runif(10, min = 0, max = 100)),
      var2 = floor(runif(10, min = 0, max = 100)),
      var3 = floor(runif(10, min = 0, max = 100)),
      var4 = floor(runif(10, min = 0, max = 100))
    )
    
    # Names of every column to aggregate.
    sum_var <- df %>%
      select(starts_with("var")) %>%
      names()
    
    # Make the column-name vector and dplyr available on every worker, so
    # that across() and all_of(sum_var) can be resolved there.
    cluster_assign(cluster, sum_var = sum_var)
    cluster_library(cluster, "dplyr")
    
    # Sum the selected columns per ID on the cluster, then gather the
    # per-worker results back into a single tibble.
    df %>%
      group_by(ID) %>%
      partition(cluster) %>%
      summarise(across(all_of(sum_var), sum)) %>%
      collect()
    
    # A tibble: 6 x 5
      ID     var1  var2  var3  var4
      <chr> <dbl> <dbl> <dbl> <dbl>
    1 a        57    72    85   118
    2 b        46    50    80    33
    3 c        82   156    96   154
    4 e       122   107    93   120
    5 f        33     7    49    36
    6 g        99    79    83    56