Search code examples
rdplyrtidyversecrosstab

Dplyr computation when cross tables have more than two groups


Dplyr is really great to work with datasets and I think I can replace all data manipulation that I always did in Excel by using dplyr commands. However, it turns difficult for computing subtotals and percentage when dealing with several subgroups and I didn't find any related code in my recent search on this subject here.

Please check the following table. Everything works fine, but after spending more than a day, I'm still confused about how to perform the last calculation: totals for each country, cH, EC, GT, MX and the USA.

Subtotals

I'll be glad if some of the experts we just find here could help me with this process.

That's the code and the data structure if you want to reproduce the entire environment.

Thank you.

library(tidyverse)
data %>% group_by(Q7_MoEd, add=TRUE) %>% 
  mutate(educ_levels_count = n()) %>% 

  #how many countries we are dealing with
  group_by(Country, add=TRUE) %>% 
  mutate(country_count = n()) %>% 

  #lets add how many education levels we have
  group_by(Q7_MoEd, Country, educ_levels_count, country_count, add=TRUE) %>% 
  summarise(
    educ_count = n(), 
    educ_percentage = round(educ_count/first(educ_levels_count),2), 
    #country_percentage = educ_count/first(country_count), 
  ) %>% 
  mutate(n_percentage = paste0(educ_count," (",educ_percentage,"%)")) %>% #merge variables
  select(Q7_MoEd, Country, n_percentage, educ_levels_count) %>% 
  spread(Country, n_percentage, fill="-") %>% 
  janitor::adorn_totals("row")



data <- structure(list(Q7_MoEd = structure(c(6L, 3L, 3L, 5L, 5L, 5L, 
                                             5L, 5L, 5L, 5L, 5L, 3L, 5L, 6L, 5L, 5L, 5L, 6L, 5L, 5L, 5L, 2L, 
                                             6L, 6L, 6L, 5L, 5L, 5L, 5L, 6L, 5L, 6L, 2L, 6L, 6L, 6L, 6L, 5L, 
                                             4L, 5L, 6L, 6L, 5L, 5L, 5L, 5L, 5L, 6L, 1L, 5L, 5L, 6L, 5L, 6L, 
                                             5L, 6L, 6L, 6L, 5L, 6L, 6L, 5L, 3L, 5L, 5L, 5L, 6L, 5L, 5L, 6L, 
                                             5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 5L, 5L, 4L, 5L, 3L, 5L, 2L, 6L, 
                                             5L, 5L, 3L, 3L, 6L, 5L, 2L, 5L, 5L, 6L, 5L, 6L, 4L, 6L, 5L, 3L, 
                                             5L, 6L, 5L, 6L, 6L, 5L, 3L, 5L, 5L, 4L, 5L, 6L, 5L, 2L, 5L, 6L, 
                                             5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 3L, 5L, 6L, 5L, 5L, 1L, 5L, 6L, 
                                             5L, 5L, 5L, 6L, 5L, 5L, 3L, 5L, 5L, 5L, 5L, 4L, 6L, 5L, 6L, 5L, 
                                             5L, 5L, 6L, 2L, 5L, 3L, 6L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
                                             5L, 5L, 5L, 5L, 5L, 5L, 3L, 5L, 4L, 5L, 5L, 5L, 5L, 5L, 6L, 5L, 
                                             5L, 3L, 5L, 5L, 6L, 5L, 5L, 5L, 6L, 5L, 6L, 5L, 3L, 5L, 4L, 5L, 
                                             5L, 3L, 6L, 6L, 3L, 4L, 4L, 2L, 3L, 5L, 5L, 3L, 5L, 3L, 5L, 6L, 
                                             5L, 5L, 5L, 5L, 6L, 6L, 4L, 5L, 5L, 5L, 3L, 5L, 5L, 5L, 3L, 5L, 
                                             3L, 6L, 5L, 3L, 5L, 3L, 5L, 5L, 5L, 5L, 3L, 5L, 2L, 5L, 5L, 3L, 
                                             3L, 2L, 3L, 5L, 5L, 3L, 3L, 5L, 1L, 5L, 4L, 5L, 3L, 3L, 5L, 5L, 
                                             3L, 5L, 5L, 3L, 5L, 5L, 6L, 3L, 5L, 3L, 5L, 5L, 3L, 3L, 5L, 4L, 
                                             3L, 5L, 6L, 3L, 5L, 6L, 3L, 5L, 1L, 5L, 5L, 5L, 6L, 5L, 3L, 2L, 
                                             2L, 2L, 2L, 4L, 2L, 2L, 2L, 3L, 5L, 5L, 5L, 5L, 5L, 5L, 3L, 5L, 
                                             2L, 5L, 5L, 5L, 5L, 5L, 5L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 3L, 2L, 
                                             5L, 2L, 4L, 2L, 3L, 4L, 3L, 3L, 4L, 4L, 3L, 2L, 1L, 1L, 3L, 3L, 
                                             3L, 3L, 2L, 2L, 3L, 2L, 3L, 5L, 3L, 5L, 4L, 1L, 5L, 2L, 2L, 1L, 
                                             1L, 1L, 5L, 5L, 3L, 2L, 1L, 5L, 3L, 5L, 6L, 5L, 4L, 5L, 6L, 5L, 
                                             3L, 5L, 3L, 5L, 3L, 1L, 3L, 4L, 2L, 6L, 5L, 5L, 6L, 6L, 5L, 2L, 
                                             4L, 4L, 6L, 5L, 6L, 4L, 6L, 3L, 3L, 4L, 1L, 6L, 6L, 1L, 2L, 2L, 
                                             1L, 1L, 3L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 6L, 1L, 2L, 1L, 6L, 
                                             1L, 1L, 5L, 1L, 3L, 3L, 1L, 3L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 
                                             2L, 1L, 1L, 2L, 6L, 4L, 2L, 4L, 4L, 2L, 4L, 2L, 4L, 4L, 6L, 5L, 
                                             2L, 5L, 4L, 6L, 4L, 6L, 4L, 4L, 4L, 4L, 3L, 2L, 5L, 4L, 2L, 6L, 
                                             6L, 2L, 4L, 2L, 6L, 2L, 5L, 4L, 3L, 2L, 4L, 4L, 4L, 5L, 4L, 4L, 
                                             4L, 4L, 6L, 5L, 2L, 5L, 6L, 2L, 5L, 5L, 5L, 2L, 2L, 5L, 4L, 6L, 
                                             4L, 2L, 2L, 4L, 2L, 4L, 2L, 2L, 1L, 4L, 4L, 4L, 5L, 5L, 2L, 4L, 
                                             2L, 4L, 2L, 1L, 4L, 5L, 5L, 2L, 4L, 4L, 4L, 3L, 4L, 2L, 4L, 2L, 
                                             2L, 6L, 1L, 2L, 2L, 4L, 4L, 4L, 2L, 4L, 5L, 2L, 4L, 4L, 4L, 2L, 
                                             5L, 6L, 4L, 5L, 5L, 4L, 5L, 2L, 4L, 4L, 4L, 5L, 5L, 4L, 4L, 4L, 
                                             4L, 2L, 5L, 5L, 4L, 5L, 4L, 4L, 5L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
                                             1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 2L, 3L, 5L, 
                                             4L, 2L, 2L, 2L, 2L, 3L, 1L, 4L, 2L, 3L, 4L, 2L, 3L, 2L, 5L, 3L, 
                                             3L, 5L, 2L, 1L), .Label = c("Primaria", "Secundaria", "Bachillerato", 
                                                                         "Tecnico", "Universitario", "Otro", "No sé", "No aplica", "T?o", 
                                                                         "No se"), class = "factor"), Country = c("MX", "MX", "MX", "MX", 
                                                                                                                  "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", 
                                                                                                                  "MX", "MX", "MX", "MX", "MX", "CH", "CH", "EC", "EC", "EC", "EC", 
                                                                                                                  "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", 
                                                                                                                  "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", 
                                                                                                                  "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", 
                                                                                                                  "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", 
                                                                                                                  "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", 
                                                                                                                  "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", 
                                                                                                                  "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", 
                                                                                                                  "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", 
                                                                                                                  "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", 
                                                                                                                  "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", 
                                                                                                                  "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", 
                                                                                                                  "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", 
                                                                                                                  "EC", "EC", "EC", "EC", "EC", "EC", "EC", "GT", "GT", "GT", "GT", 
                                                                                                                  "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", 
                                                                                                                  "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", 
                                                                                                                  "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", 
                                                                                                                  "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", 
                                                                                                                  "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", 
                                                                                                                  "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", 
                                                                                                                  "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", 
                                                                                                                  "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", 
                                                                                                                  "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", 
                                                                                                                  "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", 
                                                                                                                  "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", 
                                                                                                                  "GT", "GT", "GT", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", 
                                                                                                                  "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", 
                                                                                                                  "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", 
                                                                                                                  "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", 
                                                                                                                  "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", 
                                                                                                                  "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", 
                                                                                                                  "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", 
                                                                                                                  "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", 
                                                                                                                  "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", 
                                                                                                                  "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "GT", 
                                                                                                                  "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", 
                                                                                                                  "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", 
                                                                                                                  "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", 
                                                                                                                  "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "CH", "CH", "CH", 
                                                                                                                  "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", 
                                                                                                                  "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", 
                                                                                                                  "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", 
                                                                                                                  "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", 
                                                                                                                  "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", 
                                                                                                                  "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", 
                                                                                                                  "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", 
                                                                                                                  "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", 
                                                                                                                  "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", 
                                                                                                                  "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", 
                                                                                                                  "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", 
                                                                                                                  "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", 
                                                                                                                  "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", 
                                                                                                                  "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", 
                                                                                                                  "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", 
                                                                                                                  "US", "US", "US", "US", "US", "US", "US", "US", "US")), class = c("tbl_df", 
                                                                                                                                                                                    "tbl", "data.frame"), row.names = c(NA, -618L))

Solution

  • The trouble appears to be that you are trying to sum after you have converted the variables to characters (and added the percentages). You could convert back, but it is probably easier to just step back and calculate the totals first.

    Here, I am just separating out the first part of what you did and calling it step_one. Importantly, this stops just before converting the column to a character.

    step_one <-
      data %>%
      group_by(Q7_MoEd, add=TRUE) %>% 
      mutate(educ_levels_count = n()) %>% 
    
      #how many countries we are dealing with
      group_by(Country, add=TRUE) %>% 
      mutate(country_count = n()) %>% 
    
      #lets add how many education levels we have
      group_by(Q7_MoEd, Country, educ_levels_count, country_count, add=TRUE) %>% 
      summarise(
        educ_count = n(), 
        educ_percentage = round(educ_count/first(educ_levels_count),2)
        #country_percentage = educ_count/first(country_count), 
      ) %>%
      ungroup()
    

    Then, I am using that to calculate the totals myself:

    totals <-
      step_one %>%
      mutate(Q7_MoEd = "Total") %>%
      group_by(Country, Q7_MoEd) %>%
      summarise(
        educ_count = sum(educ_count)
      ) %>%
      ungroup() %>%
      mutate(
        educ_percentage = round(educ_count/sum(educ_count),2)
        , educ_levels_count = sum(educ_count)
      )
    

    I am planning to manually set the sort order for the result (to keep "Total" at the bottom, and better sort the other entries), using a variable:

    sort_order <-
      c("Primaria"
        , "Secundaria"
        , "Tecnico"
        , "Bachillerato"
        , "Universitario"
        , "Otro"
        , "Total")
    

    Finally, I am binding the step_one and totals data together and modifying it using the same steps you used before -- though I no longer need adorn_totals because the totals are already calculated:

    bind_rows(
      step_one
      , totals
    ) %>% 
      mutate(
        n_percentage = paste0(educ_count," (",educ_percentage,"%)")
        , Q7_MoEd = factor(Q7_MoEd, levels = sort_order)
             ) %>%
      select(Q7_MoEd, Country, n_percentage, educ_levels_count) %>% 
      spread(Country, n_percentage, fill="0 (0.00%)")
    

    returns:

    # A tibble: 7 x 7
      Q7_MoEd       educ_levels_count CH          EC          GT          MX         US        
      <fct>                     <int> <chr>       <chr>       <chr>       <chr>      <chr>     
    1 Primaria                     50 3 (0.06%)   2 (0.04%)   26 (0.52%)  10 (0.2%)  9 (0.18%) 
    2 Secundaria                   97 36 (0.37%)  5 (0.05%)   11 (0.11%)  25 (0.26%) 20 (0.21%)
    3 Tecnico                      83 56 (0.67%)  5 (0.06%)   8 (0.1%)    11 (0.13%) 3 (0.04%) 
    4 Bachillerato                 79 3 (0.04%)   9 (0.11%)   35 (0.44%)  25 (0.32%) 7 (0.09%) 
    5 Universitario               228 27 (0.12%)  82 (0.36%)  72 (0.32%)  44 (0.19%) 3 (0.01%) 
    6 Otro                         81 12 (0.15%)  40 (0.49%)  18 (0.22%)  11 (0.14%) 0 (0.00%) 
    7 Total                       618 137 (0.22%) 143 (0.23%) 170 (0.28%) 126 (0.2%) 42 (0.07%)