Search code examples
rfunctiongrouping

How to perform for loop to apply custom function with grouping


I'm trying to perform a forloop to apply a custom summarise function to all the numeric columns in the dataframe. The forloop output seems to ignore the grouping factor- however, if I perform the function alone on a single column (without the for loop), it provides the correct output.

#sample df
structure(list(participant = c("pt04", "pt75", "pt21", "pt73", 
"pt27", "pt39", "pt43", "pt52", "pt69", "pt49", "pt50", "pt56", 
"pt62", "pt68", "pt22", "pt64", "pt54", "pt79", "pt36", "pt26", 
"pt65", "pt38"), group = structure(c(1L, 2L, 2L, 1L, 1L, 2L, 
1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L
), .Label = c("c", "e"), class = "factor"), sex = structure(c(2L, 
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 
2L, 1L, 1L, 2L, 1L), .Label = c("m", "f"), class = "factor"), 
    fm_bdc3 = c(18.535199635968, 23.52996574649, 17.276246451976, 
    11.526088555461, 23.805048656112, 23.08597823716, 28.691020942436, 
    28.968097858499, 23.378093165331, 22.491725344661, 14.609015054932, 
    19.734914019306, 31.947412973684, 25.152298171274, 12.007356801787, 
    20.836128108938, 22.322230884349, 14.777652101515, 21.389572717608, 
    16.992853675086, 14.138189878472, 17.777235203826), fm_rec3 = c(18.545007190636, 
    23.017181869742, 17.031403417007, 11.227201061887, 23.581434653208, 
    21.571120542136, 28.919246372213, 28.138632765662, 22.990408911436, 
    22.274932676852, 14.012586350504, 19.066675709151, 30.897705534847, 
    24.491614222412, 11.670939246332, 20.306494543464, 22.052263684182, 
    14.252973638341, 21.028701096846, 17.207104923059, 13.172159777361, 
    17.610831079442), fm_chg = c(0.00980755466799721, -0.512783876747999, 
    -0.244843034968998, -0.298887493573998, -0.223614002904, 
    -1.514857695024, 0.228225429777002, -0.829465092836998, -0.387684253894999, 
    -0.216792667809003, -0.596428704428, -0.668238310155001, 
    -1.049707438837, -0.660683948862001, -0.336417555455, -0.529633565474001, 
    -0.269967200167002, -0.524678463173998, -0.360871620761998, 
    0.214251247972999, -0.966030101111, -0.166404124383998), 
    fm_percchg = c(0.00052913132097943, -0.0217928016671462, 
    -0.0141722355981437, -0.0259313896588437, -0.00939355370091154, 
    -0.0656180855522784, 0.00795459423472242, -0.0286337438132355, 
    -0.0165832282022865, -0.00963877445980213, -0.0408260722701251, 
    -0.0338607155572751, -0.0328573534170568, -0.0262673392452288, 
    -0.028017619615079, -0.025419001203338, -0.0120940958619099, 
    -0.0355048596062299, -0.0168713805332318, 0.0126083147698213, 
    -0.0683277073949869, -0.00936051767758492)), row.names = c(NA, 
-22L), class = "data.frame")

#my function:
 summbygrp <- function(x) {
  group_by(dexadf, group) %>%
        summarise(
          count = n(),
          mean = mean({{x}}, na.rm = TRUE),
          sd = sd({{x}}, na.rm = TRUE)
        ) %>%
        mutate(se = sd / sqrt(11),
               lower.ci = mean - qt(1 - (0.05 / 2), 11 - 1) * se,
               upper.ci = mean + qt(1 - (0.05 / 2), 11 - 1) * se
              ) 
} 

#apply function to all numeric columns and print column names before output
coln = 1

for (col in dexadf) {
  print(colnames(dexadf)[coln])
  coln = coln + 1
  if(is.numeric(col)) {
    print(summbygrp(col))
  } else {next}
}

#output: 
[1] "fm_bdc3"
# A tibble: 2 × 7
  group count  mean    sd    se lower.ci upper.ci
  <fct> <int> <dbl> <dbl> <dbl>    <dbl>    <dbl>
1 c        11  20.6  5.48  1.65     16.9     24.3
2 e        11  20.6  5.48  1.65     16.9     24.3
[1] "fm_rec3"
# A tibble: 2 × 7
  group count  mean    sd    se lower.ci upper.ci
  <fct> <int> <dbl> <dbl> <dbl>    <dbl>    <dbl>
1 c        11  20.1  5.41  1.63     16.5     23.8
2 e        11  20.1  5.41  1.63     16.5     23.8
[1] "fm_chg"
# A tibble: 2 × 7
  group count   mean    sd    se lower.ci upper.ci
  <fct> <int>  <dbl> <dbl> <dbl>    <dbl>    <dbl>
1 c        11 -0.450 0.406 0.122   -0.723   -0.178
2 e        11 -0.450 0.406 0.122   -0.723   -0.178
[1] "fm_percchg"
# A tibble: 2 × 7
  group count    mean     sd      se lower.ci upper.ci
  <fct> <int>   <dbl>  <dbl>   <dbl>    <dbl>    <dbl>
1 c        11 -0.0227 0.0198 0.00598  -0.0360 -0.00942
2 e        11 -0.0227 0.0198 0.00598  -0.0360 -0.00942

As you can see, all the means for both groups are the same, and I know this shouldn't be true. Could someone identify the error in the code? Thank you!


Solution

  • So instead of using for-loops you can do better,

    library(dplyr)
    library(rlang)
    library(purrr)
    library(tibble)
    
    
    dexadf <- data.frame(
      stringsAsFactors = FALSE,
      participant = c("pt04","pt75","pt21","pt73",
                      "pt27","pt39","pt43","pt52","pt69","pt49","pt50",
                      "pt56","pt62","pt68","pt22","pt64","pt54","pt79",
                      "pt36","pt26","pt65","pt38"),
      fm_bdc3 = c(18.535199635968,23.52996574649,
                  17.276246451976,11.526088555461,23.805048656112,
                  23.08597823716,28.691020942436,28.968097858499,
                  23.378093165331,22.491725344661,14.609015054932,19.734914019306,
                  31.947412973684,25.152298171274,12.007356801787,
                  20.836128108938,22.322230884349,14.777652101515,
                  21.389572717608,16.992853675086,14.138189878472,17.777235203826),
      fm_rec3 = c(18.545007190636,
                  23.017181869742,17.031403417007,11.227201061887,23.581434653208,
                  21.571120542136,28.919246372213,28.138632765662,
                  22.990408911436,22.274932676852,14.012586350504,19.066675709151,
                  30.897705534847,24.491614222412,11.670939246332,
                  20.306494543464,22.052263684182,14.252973638341,
                  21.028701096846,17.207104923059,13.172159777361,17.610831079442),
      fm_chg = c(0.00980755466799721,
                 -0.512783876747999,-0.244843034968998,-0.298887493573998,
                 -0.223614002904,-1.514857695024,0.228225429777002,
                 -0.829465092836998,-0.387684253894999,-0.216792667809003,
                 -0.596428704428,-0.668238310155001,-1.049707438837,
                 -0.660683948862001,-0.336417555455,-0.529633565474001,
                 -0.269967200167002,-0.524678463173998,-0.360871620761998,
                 0.214251247972999,-0.966030101111,-0.166404124383998),
      fm_percchg = c(0.00052913132097943,
                     -0.0217928016671462,-0.0141722355981437,-0.0259313896588437,
                     -0.00939355370091154,-0.0656180855522784,
                     0.00795459423472242,-0.0286337438132355,-0.0165832282022865,
                     -0.00963877445980213,-0.0408260722701251,-0.0338607155572751,
                     -0.0328573534170568,-0.0262673392452288,-0.028017619615079,
                     -0.025419001203338,-0.0120940958619099,
                     -0.0355048596062299,-0.0168713805332318,0.0126083147698213,
                     -0.0683277073949869,-0.00936051767758492),
      group = as.factor(c("c","e",
                          "e","c","c","e","c","e","c","e","e","c",
                          "e","c","c","e","e","c","e","c","e",
                          "c")),
      sex = as.factor(c("f","m",
                        "m","m","m","m","m","f","m","f","f","f",
                        "f","f","f","f","m","f","m","m","f",
                        "m"))
    )
    
    dexadf <- as_tibble(dexadf)
    
    # Note the use of .data pronoun, since columns will passed to this function as characters
    
    summbygrp <- function(df, x) {
      df %>% 
        group_by(group) %>%
        summarise(
          count = n(),
          mean = mean(.data[[x]], na.rm = TRUE), # use of .data
          sd = sd(.data[[x]], na.rm = TRUE) # use of .data
        ) %>%
        mutate(se = sd / sqrt(11),
               lower.ci = mean - qt(1 - (0.05 / 2), 11 - 1) * se,
               upper.ci = mean + qt(1 - (0.05 / 2), 11 - 1) * se
        )
    }
    
    # Here we extract the numerical columns of the dataset
    cols <- dexadf %>% 
      select(where(is.numeric)) %>% colnames(.)
    
    cols 
    #> [1] "fm_bdc3"    "fm_rec3"    "fm_chg"     "fm_percchg"
    
    # Then instead of for loops we can simply use this map function
    map(.x = cols, ~ summbygrp(dexadf, .x))
    
    #> [[1]]
    #> # A tibble: 2 × 7
    #>   group count  mean    sd    se lower.ci upper.ci
    #>   <fct> <int> <dbl> <dbl> <dbl>    <dbl>    <dbl>
    #> 1 c        11  19.3  5.49  1.66     15.6     23.0
    #> 2 e        11  21.9  5.40  1.63     18.2     25.5
    #> 
    #> [[2]]
    #> # A tibble: 2 × 7
    #>   group count  mean    sd    se lower.ci upper.ci
    #>   <fct> <int> <dbl> <dbl> <dbl>    <dbl>    <dbl>
    #> 1 c        11  19.1  5.54  1.67     15.3     22.8
    #> 2 e        11  21.2  5.31  1.60     17.7     24.8
    #> 
    #> [[3]]
    #> # A tibble: 2 × 7
    #>   group count   mean    sd     se lower.ci upper.ci
    #>   <fct> <int>  <dbl> <dbl>  <dbl>    <dbl>    <dbl>
    #> 1 c        11 -0.256 0.311 0.0938   -0.465  -0.0470
    #> 2 e        11 -0.645 0.407 0.123    -0.918  -0.371 
    #> 
    #> [[4]]
    #> # A tibble: 2 × 7
    #>   group count    mean     sd      se lower.ci upper.ci
    #>   <fct> <int>   <dbl>  <dbl>   <dbl>    <dbl>    <dbl>
    #> 1 c        11 -0.0149 0.0167 0.00503  -0.0261 -0.00368
    #> 2 e        11 -0.0306 0.0203 0.00611  -0.0442 -0.0170
    
    # -------------------------------------------------------------------
    # we can also bind all the output results (dataframes) in a single dataframe
    map_dfr(.x = cols, ~ summbygrp(dexadf, .x), .id = "vars")
    
    #> # A tibble: 8 × 8
    #>   vars  group count    mean     sd      se lower.ci upper.ci
    #>   <chr> <fct> <int>   <dbl>  <dbl>   <dbl>    <dbl>    <dbl>
    #> 1 1     c        11 19.3    5.49   1.66     15.6    23.0    
    #> 2 1     e        11 21.9    5.40   1.63     18.2    25.5    
    #> 3 2     c        11 19.1    5.54   1.67     15.3    22.8    
    #> 4 2     e        11 21.2    5.31   1.60     17.7    24.8    
    #> 5 3     c        11 -0.256  0.311  0.0938   -0.465  -0.0470 
    #> 6 3     e        11 -0.645  0.407  0.123    -0.918  -0.371  
    #> 7 4     c        11 -0.0149 0.0167 0.00503  -0.0261 -0.00368
    #> 8 4     e        11 -0.0306 0.0203 0.00611  -0.0442 -0.0170
    

    Created on 2022-07-09 by the reprex package (v2.0.1)