Search code examples
r dplyr

Custom function with dplyr::summarise with conditions


I want to create a function named ratio_function that does the same as the following code:

# Per-ID ratio restricted to category "A":
# sum(surface) / sum(total_area) * mean(MEAN), with NA values ignored.
data <- data %>%
  group_by(ID) %>%
  summarise(
    sum_ratio = sum(surface[category == "A"], na.rm = TRUE) /
      sum(total_area[category == "A"], na.rm = TRUE) *
      mean(MEAN[category == "A"], na.rm = TRUE)
  )

but that can be called directly inside `summarise()`, like this:

# Desired usage: the helper is called directly inside summarise(),
# receiving only the category to keep.
data <- data %>%
  group_by(ID) %>%
  summarise(sum_ratio = ratio_function("A"))

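The problem is that `surface`, `total_area` and `category` are not recognized as column names once they are referenced inside the function: the function body is evaluated outside of `summarise()`'s data mask, so the columns are not in scope there.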


Solution

  • If you do not want to pass each of the relevant columns to the function individually, you have to pass the whole (per-group) data frame for it to work on, e.g. with `pick(everything())`:

    library(tidyverse)
    
    # build the example data column by column (three rows per ID)
    data <- tibble(
      ID         = rep(c(1, 2, 3, 4), each = 3),
      surface    = c(50, 30, 20, 70, 60, 80, 40, 20, 30, 55, 45, 25),
      total_area = c(200, 150, 100, 300, 250, 350, 180, 90, 130, 220, 180, 90),
      category   = c("A", "A", "B", "A", "B", "A", "A", "A", "B", "A", "A", "B"),
      MEAN       = c(1.5, 1.2, 0.8, 2.0, 1.0, 1.8, 1.4, 1.1, 0.9, 1.6, 1.3, 0.7)
    )
    
    # old approach: inline computation, restricted to category "A" per group
    data |>
      group_by(ID) |>
      summarise(
        sum_ratio = sum(surface[category == "A"], na.rm = TRUE) /
          sum(total_area[category == "A"], na.rm = TRUE) *
          mean(MEAN[category == "A"], na.rm = TRUE)
      )
    #> # A tibble: 4 × 2
    #>      ID sum_ratio
    #>   <dbl>     <dbl>
    #> 1     1     0.309
    #> 2     2     0.438
    #> 3     3     0.278
    #> 4     4     0.363
    
    # define function
    #' Area ratio scaled by the mean of `MEAN`, for the selected category.
    #'
    #' Computes `sum(surface) / sum(total_area) * mean(MEAN)` over the rows of
    #' `df` whose `category` column matches `category`; `NA` values are ignored.
    #'
    #' @param df A data frame with columns `surface`, `total_area`, `category`
    #'   and `MEAN`. Inside `summarise()`, pass `pick(everything())`.
    #' @param category Category value(s) to keep, e.g. `"A"`.
    #' @return A single numeric value; `NaN` when no row matches.
    ratio_function <- function(df, category) {
      # Use the `category` argument rather than a hard-coded "A".
      # `%in%` never returns NA, so rows with a missing category are dropped.
      keep <- df$category %in% category
      sum(df$surface[keep], na.rm = TRUE) /
        sum(df$total_area[keep], na.rm = TRUE) *
        mean(df$MEAN[keep], na.rm = TRUE)
    }
    
    # new approach
    # hand each group's columns to the helper as a data frame via pick()
    data |>
      group_by(ID) |>
      summarize(
        new = ratio_function(df = pick(everything()), category = "A")
      )
    #> # A tibble: 4 × 2
    #>      ID   new
    #>   <dbl> <dbl>
    #> 1     1 0.309
    #> 2     2 0.438
    #> 3     3 0.278
    #> 4     4 0.363
    

    Created on 2024-07-19 with reprex v2.1.1