Search code examples
rdplyrcustom-function

How do I take a custom function and use it with dplyr group_by in r?


Sorry for asking the same type of question, but I just can't wrap my head around how to take my custom function and iterate it over grouped data. So bonus points if anyone can point me to some in-depth resources.

This works as intended:

library(dplyr)

a_df <- tibble(team = "A",
             cluster = list(
               c(0.01, 0.01, 0.09, 0.03, 0.14, 0.28, 0.09, 0.25, 0.18, 0.17, 0.54, 0.41, 0.16, 0.18, 0.25, 0.02, 0.2, 0.69, 0.02, 0.01, 0.02, 0.07, 0.07, 0.21)),
             tot_matches = 20,
             tot_ck = 121)

#Function that will take a list of xG and return the probability of n goals being scored.Normalized
calculate_goals_norm <- function(shot_xg_list, n_matches, norm_matches){
  
  #set seed only for stackoverflow question!!!!
  set.seed(123)
  
  #Start goal total at 0
  goals_total <- 0
  
  #Function to take an xG number and simulate whether it was a goal or not
  xg_to_goals_sim <- function(shot_xg){

    #Start goal count at 0
    Goals <- 0

    #For each shot, if it goes in, add a goal
    for (shot in shot_xg){
      if (runif(1)<=shot){
        Goals <- Goals + 1
      }
    }

    #Finally, return the number of goals
    return(Goals)

  }
  
  #Run xG calculator 10000 times to test winner %
  sim_goal_list <- c()
  # Create a for statement to populate the list
  for (i in 1:10000) {
    #Run the above formula for xG list
    #But first we need to normalize the number of xG by randomly picking from xG
    
    if(n_matches == norm_matches){
      new_shot_xg_list <- shot_xg_list
      goals_total <- xg_to_goals_sim(new_shot_xg_list)
      sim_goal_list[[i]] <- goals_total
    }
    
    if(n_matches < norm_matches){
      new_shot_xg_list <- c(shot_xg_list, 
                            sample(shot_xg_list, 
                                   round(length(shot_xg_list)/n_matches*norm_matches,0)-length(shot_xg_list),
                                   replace=FALSE))
      goals_total <- xg_to_goals_sim(new_shot_xg_list)
      sim_goal_list[[i]] <- goals_total
    }
    
    if(n_matches > norm_matches){
      
      new_shot_xg_list <- sample(shot_xg_list, 
                                 round(length(shot_xg_list)/n_matches*norm_matches,0),
                                 replace=FALSE)
      goals_total <- xg_to_goals_sim(new_shot_xg_list)
      sim_goal_list[[i]] <- goals_total
    }
    
  }
  
  
  sim_goal_dat <- data.frame(value = unlist(sim_goal_list))
  
  goal_prob <- sim_goal_dat %>% 
    count(value) %>% 
    summarise(goals = value,
              prob = round(n/1000*10,1)) %>% 
    arrange(goals)
  
  return(goal_prob)
  
  
}

#apply function to a single team dataframe (1 obs. of 4 variables)
calculate_goals_norm(shot_xg_list= unlist(a_df$cluster), n_matches = a_df$tot_matches, norm_matches = 20)

# A tibble: 12 x 2
   goals  prob
   <dbl> <dbl>
 1     0   0.6
 2     1   4  
 3     2  12.4
 4     3  20.6
 5     4  23.8
 6     5  19.1
 7     6  11.6
 8     7   5.6
 9     8   1.8
10     9   0.4
11    10   0.1
12    12   0 

Adding a second team to create a full_df with the intended use of the custom function:

full_df <- add_row(a_df, tibble(team = "B",
                                cluster = list(
                                  c(0.06, 0.01, 0.11, 0.18, 0.75, 0.04, 0.23, 0.07, 0.1, 0.05, 0.24, 0.12, 0.28, 0.02, 0.09, 0.16, 0.64, 0.03, 0.1, 0.19, 0.09, 0.01, 0.02, 0.12, 0.01, 0.11, 0.18, 0.05, 0.02, 0.8, 0.08)),
                                tot_matches = 19,
                                tot_ck = 83) )

#final function should look like this?
full_df %>% 
  group_by(team) %>% 
  calculate_goals_norm(shot_xg_list = unlist(cluster), n_matches = tot_matches, norm_matches = 20)

I'm not married to the custom function looking like the above example. I know apply()/sapply() is commonly used to iterate over dataframes, but again I'm not literate enough to know how to apply here.

Thanks for your help.


Solution

  • We may use group_modify here

    library(dplyr)
    out <- full_df %>% 
      group_by(team) %>% 
      group_modify(~with(.x, 
        calculate_goals_norm(shot_xg_list = unlist(cluster), 
               n_matches = tot_matches, norm_matches = 20))) %>%
      ungroup
    

    -output

    > as.data.frame(out)
       team goals prob
    1     A     0  0.6
    2     A     1  4.0
    3     A     2 12.4
    4     A     3 20.6
    5     A     4 23.8
    6     A     5 19.0
    7     A     6 11.6
    8     A     7  5.6
    9     A     8  1.8
    10    A     9  0.4
    11    A    10  0.1
    12    A    12  0.0
    13    B     0  0.0
    14    B     1  0.8
    15    B     2  3.7
    16    B     3 11.1
    17    B     4 18.6
    18    B     5 22.4
    19    B     6 19.6
    20    B     7 12.4
    21    B     8  6.9
    22    B     9  2.9
    23    B    10  1.1
    24    B    11  0.3
    25    B    12  0.1
    26    B    14  0.0
    

    Note that the function calculate_goals_norm is giving errors at the

    sim_goal_list <- as_tibble(sim_goal_list)
    

    So, it is modified to

    sim_goal_dat <-  data.frame(value = unlist(sim_goal_list));
    

    -full function

    
    calculate_goals_norm <- function(shot_xg_list, n_matches, norm_matches){
      
      #set seed only for stackoverflow question!!!!
      set.seed(123)
      
      #Start goal total at 0
      goals_total <- 0
      
      #Function to take an xG number and simulate whether it was a goal or not
      xg_to_goals_sim <- function(shot_xg){
    
        #Start goal count at 0
        Goals <- 0
    
        #For each shot, if it goes in, add a goal
        for (shot in shot_xg){
          if (runif(1)<=shot){
            Goals <- Goals + 1
          }
        }
    
        #Finally, return the number of goals
        return(Goals)
    
      }
      
      #Run xG calculator 10000 times to test winner %
      sim_goal_list <- c()
      # Create a for statement to populate the list
      for (i in 1:10000) {
        #Run the above formula for xG list
        #But first we need to normalize the number of xG by randomly picking from xG
        
        if(n_matches == norm_matches){
          new_shot_xg_list <- shot_xg_list
          goals_total <- xg_to_goals_sim(new_shot_xg_list)
          sim_goal_list[[i]] <- goals_total
        }
        
        if(n_matches < norm_matches){
          new_shot_xg_list <- c(shot_xg_list, 
                                sample(shot_xg_list, 
                                       round(length(shot_xg_list)/n_matches*norm_matches,0)-length(shot_xg_list),
                                       replace=FALSE))
          goals_total <- xg_to_goals_sim(new_shot_xg_list)
          sim_goal_list[[i]] <- goals_total
        }
        
        if(n_matches > norm_matches){
          
          new_shot_xg_list <- sample(shot_xg_list, 
                                     round(length(shot_xg_list)/n_matches*norm_matches,0),
                                     replace=FALSE)
          goals_total <- xg_to_goals_sim(new_shot_xg_list)
          sim_goal_list[[i]] <- goals_total
        }
        
      }
      
      sim_goal_dat <-  data.frame(value = unlist(sim_goal_list))
      
        goal_prob <- sim_goal_dat %>% 
          count(value) %>% 
          summarise(goals = value,
                    prob = round(n/1000*10,1)) %>% 
          arrange(goals)
      
        return(goal_prob)
      
      
      
      
      
      
    }