Search code examples
rfor-loopvectorizationchi-squared

Can this for-loop be vectorized


I implemented a chi-square test in R using a for-loop in order to calculate the test statistic for every cell. However, I was wondering whether this can be optimized. And is the chi=square in R working as my code?

eval_preds <- function(df, observations, predictions, groups) {
  # determine cells
  cells <- unique(df[,groups])
  my_sum = 0

  # compute test statistic for every cell
  for (i in 1:length(cells)) {

    # get cell means
    m_obs <- mean(df[df[,groups] == cells[i], observations])
    m_pred <- mean(df[df[,groups] == cells[i], predictions])

    # get cell variance
    var_obs <- var(df[df[,groups] == cells[i], observations])
    var_pred <- var(df[df[,groups] == cells[i], predictions])

    # get cell's number of observations
    N_obs <- length(df[df[,groups] == cells[i], observations])
    N_pred <- length(df[df[,groups] == cells[i], predictions])

    # sum up
    my_sum = my_sum + (m_obs-m_pred) / ((var_obs/N_obs) + (var_pred/N_pred))
  }
  return(my_sum)
}

And a toy example:

# save this as df_head.csv
"RT","RTmodel_s","groups"
899,975.978308710825,"pl_sgDom"
1115,1095.61538562629,"sg_sgDom"
1077,1158.19266217845,"pl_sgDom"
850,996.410033287249,"sg_plDom"
854,894.862587823602,"pl_sgDom"
1720,1046.34200941684,"sg_sgDom"

# load dat into R
df <- read.csv('./df_head.csv')

my_chi <- eval_preds(df, 'RT', 'Pred', 'Group') 

EDIT

The function eval_preds function is called in a for loop in which different predictions are computed based on a free parameter t_parsing.

p_grid = seq(0,1,0.1)

# tune p
for (i in 1:length(p_grid)) {
  
  # set t_parsing
  t_parsing = p_grid[i]
  
  # compute model-time RTs
  df$RTmodel <- ifelse(df$number == 'sing', 
                             RT_lookup(df$sg_t_act, df$epsilon), 
                             RT_decompose(df$sg_t_act, df$pl_t_act, df$epsilon))
  
  # scale into real time
  df$RTmodel_s <- scale_RTmodel(df$RT, df$RTmodel)
  
  # compare model output to measured RT
  # my_chi <- eval_preds(my_nouns, 'RT', 'RTmodel_s', 'groups')
  my_chi <- eval_preds1(df, RT, RTmodel_s, groups) #function written by DaveArmstrong
  print(paste(p_grid[i], ': ', my_chi))
}

RT and RTmodel_s are numerical variables, groups is a character variable.


Solution

  • Sure, you could do it with dplyr and rlang:

    eval_preds <- function(df, observations, predictions, groups) {
      # determine cells
      require(dplyr)
      require(rlang)
      groups <- enquo(groups)
      observations <- enquo(observations)
      predictions <- enquo(predictions)
      out <- DF %>% 
        group_by(!!groups) %>% 
        summarise(m_obs = mean(!!observations), 
                  m_pred = mean(!!predictions), 
                  var_obs = var(!!observations), 
                  var_pred = var(!!predictions),
                  N_obs = sum(!is.na(!!observations)),
                  N_pred = sum(!is.na(!!predictions))) %>% 
        mutate(my_sum = (m_obs - m_pred)/((var_obs/N_obs) + (var_pred/N_pred))) 
    sum(out$my_sum)
    }
    my_chi <- eval_preds(DF, RT, Pred, Group) 
    my_chi
    # [1] 1.6
    

    Or, even a bit more streamlined:

    eval_preds <- function(df, observations, predictions, groups) {
      # determine cells
      require(dplyr)
      require(rlang)
      groups <- enquo(groups)
      observations <- enquo(observations)
      predictions <- enquo(predictions)
      out <- DF %>% 
        group_by(!!groups) %>% 
        summarise(diff= mean(!!observations) - mean(!!predictions), 
                  sum_v = (var(!!observations)/n()) + (var(!!predictions)/n())) %>% 
        mutate(my_sum = diff/sum_v) 
      sum(out$my_sum)
    }
    my_chi <- eval_preds(DF, RT, Pred, Group) 
    my_chi
    # [1] 1.6
    

    EDIT - added benchmarks

    So, I think the question of which one is better depends on the size of the data set. I also want to throw one more function into the mix - one that uses by() from base R:

    eval_preds <- function(df, observations, predictions, groups) {
      # determine cells
      cells <- unique(df[,groups])
      my_sum = 0
      
      # compute test statistic for every cell
      for (i in 1:length(cells)) {
        
        # get cell means
        m_obs <- mean(df[df[,groups] == cells[i], observations])
        m_pred <- mean(df[df[,groups] == cells[i], predictions])
        
        # get cell variance
        var_obs <- var(df[df[,groups] == cells[i], observations])
        var_pred <- var(df[df[,groups] == cells[i], predictions])
        
        # get cell's number of observations
        N_obs <- length(df[df[,groups] == cells[i], observations])
        N_pred <- length(df[df[,groups] == cells[i], predictions])
        
        # sum up
        my_sum = my_sum + (m_obs-m_pred) / ((var_obs/N_obs) + (var_pred/N_pred))
      }
      return(my_sum)
    }
    
    eval_preds1 <- function(df, observations, predictions, groups) {
      # determine cells
      require(dplyr)
      require(rlang)
      groups <- enquo(groups)
      observations <- enquo(observations)
      predictions <- enquo(predictions)
      out <- df %>% 
        group_by(!!groups) %>% 
        summarise(m_obs = mean(!!observations), 
                  m_pred = mean(!!predictions), 
                  var_obs = var(!!observations), 
                  var_pred = var(!!predictions),
                  N_obs = sum(!is.na(!!observations)),
                  N_pred = sum(!is.na(!!predictions))) %>% 
        ungroup %>%
        mutate(my_sum = (m_obs - m_pred)/((var_obs/N_obs) + (var_pred/N_pred))) 
      sum(out$my_sum)
    }
    
    eval_preds2 <- function(df, observations, predictions, groups) {
      # determine cells
      require(dplyr)
      require(rlang)
      groups <- enquo(groups)
      observations <- enquo(observations)
      predictions <- enquo(predictions)
      out <- df %>% 
        group_by(!!groups) %>% 
        summarise(diff= mean(!!observations) - mean(!!predictions), 
                  sum_v = (var(!!observations)/n()) + (var(!!predictions)/n())) %>% 
        ungroup %>%
        mutate(my_sum = diff/sum_v) 
      sum(out$my_sum)
    }
    
    eval_preds3<- function(df, observations, predictions, groups) {
      # determine cells
      
      m <- by(df[,c(observations, predictions)], list(df[[groups]]), function(x)diff(-colMeans(x)))
      v <- by(df[,c(observations, predictions)], list(df[[groups]]), function(x)sum(apply(x, 2, var)/nrow(x)))
      sum(m/v)
    }
    

    So, eval_preds() is the original, eval_preds1() is the first set of dplyr code and eval_preds2(), eval_preds3() is the base R with by(). On the original data set, here is the output from microbenchmark().

    microbenchmark(eval_preds(DF, 'RT', 'Pred', 'Group'), 
                   eval_preds1(DF, RT, Pred, Group), 
                   eval_preds2(DF, RT, Pred, Group), 
                   eval_preds4(DF, 'RT', 'Pred', 'Group'), times=100)
    # Unit: microseconds
    #                                  expr      min        lq      mean   median        uq       max neval  cld
    #  eval_preds(DF, "RT", "Pred", "Group")  236.513  279.4920  324.9760  295.190  321.0125   774.213   100 a   
    #       eval_preds1(DF, RT, Pred, Group) 5236.850 5747.5095 6503.0251 6089.343 6937.8670 12950.677   100    d
    #       eval_preds2(DF, RT, Pred, Group) 4871.812 5372.2365 6070.7297 5697.686 6548.8935 14577.786   100   c 
    # eval_preds4(DF, "RT", "Pred", "Group")  651.013  739.9405  839.1706  773.610  923.9870  1582.218   100  b  
    

    In this case, the original code is the fastest. However, if we made a bigger dataset - here's one with 1000 groups and 10 observations per group.

    DF2 <- data.frame(
      Group = rep(1:1000, each=10), 
      RT = rpois(10000, 3), 
      Pred = rpois(10000, 4)
    )
    

    The output from microbenchmark() here tells a quite different story.

    microbenchmark(eval_preds(DF2, 'RT', 'Pred', 'Group'), 
                   eval_preds1(DF2, RT, Pred, Group), 
                   eval_preds2(DF2, RT, Pred, Group), 
                   eval_preds3(DF2, 'RT', 'Pred', 'Group'), times=25)
    
    
    # Unit: milliseconds
    #                                   expr       min        lq     mean    median        uq      max neval cld
    #  eval_preds(DF2, "RT", "Pred", "Group") 245.67280 267.96445 353.6489 324.29416 419.4193 565.4494    25   c
    #       eval_preds1(DF2, RT, Pred, Group)  74.56522  88.15003 102.6583  92.24103 106.6766 211.2368    25 a  
    #       eval_preds2(DF2, RT, Pred, Group)  79.00919  89.03754 125.8202  94.71703 114.5176 606.8830    25 a  
    # eval_preds3(DF2, "RT", "Pred", "Group") 193.94042 240.35447 272.0004 254.85557 316.5098 420.5394    25  b 
    
    

    Both dplyr() based functions are much faster than their base R competitors. So, the question of which is the "optimal" method depends on the size of the problem to be solved.