Dplyr is really great to work with datasets and I think I can replace all data manipulation that I always did in Excel by using dplyr commands. However, it turns difficult for computing subtotals and percentage when dealing with several subgroups and I didn't find any related code in my recent search on this subject here.
Please check the following table. Everything works fine, but after spending more than a day, I'm still confused about how to perform the last calculation: totals for each country, cH, EC, GT, MX and the USA.
I'll be glad if some of the experts we just find here could help me with this process.
That's the code and the data structure if you want to reproduce the entire environment.
Thank you.
library(tidyverse)
data %>% group_by(Q7_MoEd, add=TRUE) %>%
mutate(educ_levels_count = n()) %>%
#how many countries we are dealing with
group_by(Country, add=TRUE) %>%
mutate(country_count = n()) %>%
#lets add how many education levels we have
group_by(Q7_MoEd, Country, educ_levels_count, country_count, add=TRUE) %>%
summarise(
educ_count = n(),
educ_percentage = round(educ_count/first(educ_levels_count),2),
#country_percentage = educ_count/first(country_count),
) %>%
mutate(n_percentage = paste0(educ_count," (",educ_percentage,"%)")) %>% #merge variables
select(Q7_MoEd, Country, n_percentage, educ_levels_count) %>%
spread(Country, n_percentage, fill="-") %>%
janitor::adorn_totals("row")
data <- structure(list(Q7_MoEd = structure(c(6L, 3L, 3L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 3L, 5L, 6L, 5L, 5L, 5L, 6L, 5L, 5L, 5L, 2L,
6L, 6L, 6L, 5L, 5L, 5L, 5L, 6L, 5L, 6L, 2L, 6L, 6L, 6L, 6L, 5L,
4L, 5L, 6L, 6L, 5L, 5L, 5L, 5L, 5L, 6L, 1L, 5L, 5L, 6L, 5L, 6L,
5L, 6L, 6L, 6L, 5L, 6L, 6L, 5L, 3L, 5L, 5L, 5L, 6L, 5L, 5L, 6L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 5L, 5L, 4L, 5L, 3L, 5L, 2L, 6L,
5L, 5L, 3L, 3L, 6L, 5L, 2L, 5L, 5L, 6L, 5L, 6L, 4L, 6L, 5L, 3L,
5L, 6L, 5L, 6L, 6L, 5L, 3L, 5L, 5L, 4L, 5L, 6L, 5L, 2L, 5L, 6L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 3L, 5L, 6L, 5L, 5L, 1L, 5L, 6L,
5L, 5L, 5L, 6L, 5L, 5L, 3L, 5L, 5L, 5L, 5L, 4L, 6L, 5L, 6L, 5L,
5L, 5L, 6L, 2L, 5L, 3L, 6L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 3L, 5L, 4L, 5L, 5L, 5L, 5L, 5L, 6L, 5L,
5L, 3L, 5L, 5L, 6L, 5L, 5L, 5L, 6L, 5L, 6L, 5L, 3L, 5L, 4L, 5L,
5L, 3L, 6L, 6L, 3L, 4L, 4L, 2L, 3L, 5L, 5L, 3L, 5L, 3L, 5L, 6L,
5L, 5L, 5L, 5L, 6L, 6L, 4L, 5L, 5L, 5L, 3L, 5L, 5L, 5L, 3L, 5L,
3L, 6L, 5L, 3L, 5L, 3L, 5L, 5L, 5L, 5L, 3L, 5L, 2L, 5L, 5L, 3L,
3L, 2L, 3L, 5L, 5L, 3L, 3L, 5L, 1L, 5L, 4L, 5L, 3L, 3L, 5L, 5L,
3L, 5L, 5L, 3L, 5L, 5L, 6L, 3L, 5L, 3L, 5L, 5L, 3L, 3L, 5L, 4L,
3L, 5L, 6L, 3L, 5L, 6L, 3L, 5L, 1L, 5L, 5L, 5L, 6L, 5L, 3L, 2L,
2L, 2L, 2L, 4L, 2L, 2L, 2L, 3L, 5L, 5L, 5L, 5L, 5L, 5L, 3L, 5L,
2L, 5L, 5L, 5L, 5L, 5L, 5L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 3L, 2L,
5L, 2L, 4L, 2L, 3L, 4L, 3L, 3L, 4L, 4L, 3L, 2L, 1L, 1L, 3L, 3L,
3L, 3L, 2L, 2L, 3L, 2L, 3L, 5L, 3L, 5L, 4L, 1L, 5L, 2L, 2L, 1L,
1L, 1L, 5L, 5L, 3L, 2L, 1L, 5L, 3L, 5L, 6L, 5L, 4L, 5L, 6L, 5L,
3L, 5L, 3L, 5L, 3L, 1L, 3L, 4L, 2L, 6L, 5L, 5L, 6L, 6L, 5L, 2L,
4L, 4L, 6L, 5L, 6L, 4L, 6L, 3L, 3L, 4L, 1L, 6L, 6L, 1L, 2L, 2L,
1L, 1L, 3L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 6L, 1L, 2L, 1L, 6L,
1L, 1L, 5L, 1L, 3L, 3L, 1L, 3L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L,
2L, 1L, 1L, 2L, 6L, 4L, 2L, 4L, 4L, 2L, 4L, 2L, 4L, 4L, 6L, 5L,
2L, 5L, 4L, 6L, 4L, 6L, 4L, 4L, 4L, 4L, 3L, 2L, 5L, 4L, 2L, 6L,
6L, 2L, 4L, 2L, 6L, 2L, 5L, 4L, 3L, 2L, 4L, 4L, 4L, 5L, 4L, 4L,
4L, 4L, 6L, 5L, 2L, 5L, 6L, 2L, 5L, 5L, 5L, 2L, 2L, 5L, 4L, 6L,
4L, 2L, 2L, 4L, 2L, 4L, 2L, 2L, 1L, 4L, 4L, 4L, 5L, 5L, 2L, 4L,
2L, 4L, 2L, 1L, 4L, 5L, 5L, 2L, 4L, 4L, 4L, 3L, 4L, 2L, 4L, 2L,
2L, 6L, 1L, 2L, 2L, 4L, 4L, 4L, 2L, 4L, 5L, 2L, 4L, 4L, 4L, 2L,
5L, 6L, 4L, 5L, 5L, 4L, 5L, 2L, 4L, 4L, 4L, 5L, 5L, 4L, 4L, 4L,
4L, 2L, 5L, 5L, 4L, 5L, 4L, 4L, 5L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 2L, 3L, 5L,
4L, 2L, 2L, 2L, 2L, 3L, 1L, 4L, 2L, 3L, 4L, 2L, 3L, 2L, 5L, 3L,
3L, 5L, 2L, 1L), .Label = c("Primaria", "Secundaria", "Bachillerato",
"Tecnico", "Universitario", "Otro", "No sé", "No aplica", "T?o",
"No se"), class = "factor"), Country = c("MX", "MX", "MX", "MX",
"MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX",
"MX", "MX", "MX", "MX", "MX", "CH", "CH", "EC", "EC", "EC", "EC",
"EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC",
"EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC",
"EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC",
"EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC",
"EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC",
"EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC",
"EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC",
"EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC",
"EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC",
"EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC",
"EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC",
"EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC", "EC",
"EC", "EC", "EC", "EC", "EC", "EC", "EC", "GT", "GT", "GT", "GT",
"GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT",
"GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT",
"GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT",
"GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT",
"GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT",
"GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT",
"GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT",
"GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT",
"GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT",
"GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT",
"GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT",
"GT", "GT", "GT", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX",
"MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX",
"MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX",
"MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX",
"MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX",
"MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX",
"MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX",
"MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX",
"MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX",
"MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "MX", "GT",
"GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT",
"GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT",
"GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT",
"GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "CH", "CH", "CH",
"CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH",
"CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH",
"CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH",
"CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH",
"CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH",
"CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH",
"CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH",
"CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH",
"CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH",
"CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH",
"CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH",
"CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH", "CH",
"US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US",
"US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US",
"US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US",
"US", "US", "US", "US", "US", "US", "US", "US", "US")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -618L))
The trouble appears to be that you are trying to sum after you have converted the variables to characters (and added the percentages). You could convert back, but it is probably easier to just step back and calculate the totals first.
Here, I am just separating out the first part of what you did and calling it step_one
. Importantly, this stops just before converting the column to a character.
step_one <-
data %>%
group_by(Q7_MoEd, add=TRUE) %>%
mutate(educ_levels_count = n()) %>%
#how many countries we are dealing with
group_by(Country, add=TRUE) %>%
mutate(country_count = n()) %>%
#lets add how many education levels we have
group_by(Q7_MoEd, Country, educ_levels_count, country_count, add=TRUE) %>%
summarise(
educ_count = n(),
educ_percentage = round(educ_count/first(educ_levels_count),2)
#country_percentage = educ_count/first(country_count),
) %>%
ungroup()
Then, I am using that to calculate the totals myself:
totals <-
step_one %>%
mutate(Q7_MoEd = "Total") %>%
group_by(Country, Q7_MoEd) %>%
summarise(
educ_count = sum(educ_count)
) %>%
ungroup() %>%
mutate(
educ_percentage = round(educ_count/sum(educ_count),2)
, educ_levels_count = sum(educ_count)
)
I am planning to manually set the sort order for the result (to keep "Total" at the bottom, and better sort the other entries), using a variable:
sort_order <-
c("Primaria"
, "Secundaria"
, "Tecnico"
, "Bachillerato"
, "Universitario"
, "Otro"
, "Total")
Finally, I am binding the step_one
and totals
data together and modifying it using the same steps you used before -- though I no longer need adorn_totals
because the totals are already calculated:
bind_rows(
step_one
, totals
) %>%
mutate(
n_percentage = paste0(educ_count," (",educ_percentage,"%)")
, Q7_MoEd = factor(Q7_MoEd, levels = sort_order)
) %>%
select(Q7_MoEd, Country, n_percentage, educ_levels_count) %>%
spread(Country, n_percentage, fill="0 (0.00%)")
returns:
# A tibble: 7 x 7
Q7_MoEd educ_levels_count CH EC GT MX US
<fct> <int> <chr> <chr> <chr> <chr> <chr>
1 Primaria 50 3 (0.06%) 2 (0.04%) 26 (0.52%) 10 (0.2%) 9 (0.18%)
2 Secundaria 97 36 (0.37%) 5 (0.05%) 11 (0.11%) 25 (0.26%) 20 (0.21%)
3 Tecnico 83 56 (0.67%) 5 (0.06%) 8 (0.1%) 11 (0.13%) 3 (0.04%)
4 Bachillerato 79 3 (0.04%) 9 (0.11%) 35 (0.44%) 25 (0.32%) 7 (0.09%)
5 Universitario 228 27 (0.12%) 82 (0.36%) 72 (0.32%) 44 (0.19%) 3 (0.01%)
6 Otro 81 12 (0.15%) 40 (0.49%) 18 (0.22%) 11 (0.14%) 0 (0.00%)
7 Total 618 137 (0.22%) 143 (0.23%) 170 (0.28%) 126 (0.2%) 42 (0.07%)