Search code examples
rloopsaggregateplyr

R - Loop multiple variables group_by


I want to be able to loop through the group_by variables in order to aggregate by every combination of variables and the total and rbind them into one.

I saw something similar here:
dplyr- group by in a for loop r

I have tried to modify the code above a few different ways but I cannot seem to get it working with crossing.

df <- data.frame(
  location = c(rep("UK", 5), rep("USA", 5)),
  industry = c(rep("RETAIL", 3), rep("TECH", 7)),
  department = c(rep("SALES", 4), rep("MANUFACTURING", 6)),
  pay = rnorm(10),
  tax = rnorm(10)
)

temp <- crossing(varA = c("location",""), varB = c("industry",""), varC = c("department",""))

data <- data.frame()
for(i in 1:nrow(temp)){
test <- df %>%
  group_by(!!temp[i,]) %>%
  summarise_at(c("pay", "tax"), sum, na.rm = TRUE)

data <- rbind.fill(test, data)
}

Solution

  • Here is what I think you are looking for. This is a dplyr solution.

    set.seed(10)
    df <- data.frame(
      location = c(rep("UK", 5), rep("USA", 5)),
      industry = c(rep("RETAIL", 3), rep("TECH", 7)),
      department = c(rep("SALES", 4), rep("MANUFACTURING", 6)),
      pay = rnorm(10),
      tax = rnorm(10)
    )
    
    temp <- crossing(varA = c("location",""), varB = c("industry",""), varC = c("department",""))
    
    data <- data.frame()
    for(i in 1:nrow(temp)){
    # extracts only non "" values from temp[i,] and unnames them (else group_by will use names)
      vars <- unname(unlist(temp[i,which(temp[i,] != "")]))
      test <- df %>%
        # tells tidyselect to use all columns that match the contents of vars
        group_by(across(all_of(vars))) %>% 
        summarise_at(c("pay", "tax"), sum, na.rm = TRUE)
      # union_all does what you want rbind.fill to do
      data <- union_all(test, data)
    }
    print(data, n = 20)
    # A tibble: 20 x 5
    # Groups:   location, industry [8]
       location industry department       pay   tax
       <chr>    <chr>    <chr>          <dbl> <dbl>
     1 UK       RETAIL   SALES         -1.54  1.62 
     2 UK       TECH     MANUFACTURING  0.295 0.741
     3 UK       TECH     SALES         -0.599 0.987
     4 USA      TECH     MANUFACTURING -3.07  0.348
     5 UK       RETAIL   NA            -1.54  1.62 
     6 UK       TECH     NA            -0.305 1.73 
     7 USA      TECH     NA            -3.07  0.348
     8 UK       NA       MANUFACTURING  0.295 0.741
     9 UK       NA       SALES         -2.14  2.61 
    10 USA      NA       MANUFACTURING -3.07  0.348
    11 UK       NA       NA            -1.84  3.35 
    12 USA      NA       NA            -3.07  0.348
    13 NA       RETAIL   SALES         -1.54  1.62 
    14 NA       TECH     MANUFACTURING -2.77  1.09 
    15 NA       TECH     SALES         -0.599 0.987
    16 NA       RETAIL   NA            -1.54  1.62 
    17 NA       TECH     NA            -3.37  2.08 
    18 NA       NA       MANUFACTURING -2.77  1.09 
    19 NA       NA       SALES         -2.14  2.61 
    20 NA       NA       NA            -4.91  3.70
    

    If you wanted to replace your NA values with something, say "ALL", you can do that simply:

    data %>% replace(is.na(.), "ALL")
    # A tibble: 20 x 5
    # Groups:   location, industry [8]
       location industry department       pay   tax
       <chr>    <chr>    <chr>          <dbl> <dbl>
     1 UK       RETAIL   SALES         -1.54  1.62 
     2 UK       TECH     MANUFACTURING  0.295 0.741
     3 UK       TECH     SALES         -0.599 0.987
     4 USA      TECH     MANUFACTURING -3.07  0.348
     5 UK       RETAIL   ALL           -1.54  1.62 
     6 UK       TECH     ALL           -0.305 1.73 
     7 USA      TECH     ALL           -3.07  0.348
     8 UK       ALL      MANUFACTURING  0.295 0.741
     9 UK       ALL      SALES         -2.14  2.61 
    10 USA      ALL      MANUFACTURING -3.07  0.348
    11 UK       ALL      ALL           -1.84  3.35 
    12 USA      ALL      ALL           -3.07  0.348
    13 ALL      RETAIL   SALES         -1.54  1.62 
    14 ALL      TECH     MANUFACTURING -2.77  1.09 
    15 ALL      TECH     SALES         -0.599 0.987
    16 ALL      RETAIL   ALL           -1.54  1.62 
    17 ALL      TECH     ALL           -3.37  2.08 
    18 ALL      ALL      MANUFACTURING -2.77  1.09 
    19 ALL      ALL      SALES         -2.14  2.61 
    20 ALL      ALL      ALL           -4.91  3.7