I am using ggplot to create a scatterplot of survey responses for two groups. Currently, the size of each point is based on the count of answers per group. Since the group sizes (ns) are so different, one group's points are consistently larger than the other's. I would like the size of the points to be based on the proportion of individuals within each group who chose a given answer. My current code and graph output are below:
# Current attempt: geom_count() sizes each point by the number of
# observations at that position, so the larger group always looks bigger.
ggplot(
  data,
  mapping = aes(x = gain, y = value, colour = demographic, group = demographic)
) +
  geom_count(position = position_dodge(width = 0.9))
My data currently looks like this:
structure(list(hispanic = structure(c(2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L), levels = c("hispanic", "non_hispanic"), class = "factor"),
gain = c("SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16",
"SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15",
"SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16",
"SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15",
"SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16",
"SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15",
"SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16",
"SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15",
"SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16",
"SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15",
"SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16",
"SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15",
"SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16",
"SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15",
"SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16",
"SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15",
"SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16",
"SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15",
"SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16",
"SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15",
"SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16",
"SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15",
"SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16",
"SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15",
"SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16",
"SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15",
"SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16",
"SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16"), value = c(5,
5, 5, 5, 4, 4, 5, NA, 5, 4, NA, NA, 4, 2, 5, 5, 5, 5, 5,
5, 4, 3, 4, 4, 5, 5, 3, 3, 3, 2, 4, 4, 5, 5, NA, NA, 4, 3,
3, NA, 4, 3, 2, 4, 3, 1, 5, 3, NA, NA, 4, 2, 5, 4, 3, 3,
2, 2, 5, 3, 5, 5, 5, NA, 3, 1, 3, 3, 5, 5, 3, 1, 5, 3, 5,
5, 4, 3, 4, 4, 5, 3, 5, 5, 5, 5, 5, 3, 4, 3, 4, 3, 5, 4,
4, 2, 4, 3, 5, 3, 5, 4, 4, 3, 4, 2, 5, 5, 5, 5, 4, 4, 4,
3, 1, 1, 5, 5, 5, 3, 5, 3, 3, 3, 5, 3, 3, 2, 5, 5, 5, 3,
5, 3, 4, 4, 5, 3)), row.names = c(NA, -138L), class = c("tbl_df",
"tbl", "data.frame"))
For each group (Hispanic and non-Hispanic), I would like to obtain the proportion of individuals who gave each "value" answer. The values range from 1 to 5. I would like to calculate this for each gain; there are 21 of them.
I have tried, unsuccessfully, to calculate the proportions directly in my data frame, as I have only two columns but need to find proportions for 21 questions. I have also played around with various ggplot tools, like scale_size_area and scale_size_continuous, but they only allow me to manually input a range, not change the point area to reflect the proportion of answers.
Try this:
library(dplyr)
library(ggplot2)

# `df` is the data frame from the question (it is called `data` there).
#
# Goal: for every question (`gain`) and demographic group, compute the share
# of that group's respondents who picked each answer (`value`). The
# denominator must therefore be the group's own total within the question,
# i.e. group by (gain, hispanic). Grouping by (gain, value) instead yields
# each group's share of an answer, which still tracks the raw group sizes.
df_summary <- df |>
  # Missing answers cannot be placed on the y axis, so drop them up front;
  # the proportions are then computed among respondents only.
  filter(!is.na(value)) |>
  count(gain, value, hispanic) |>
  group_by(gain, hispanic) |>
  mutate(
    n_all = sum(n), # answers given by this group to this question
    proportion = n / n_all
  ) |>
  ungroup() |>
  rename(demographic = hispanic)

df_summary
# Expected output for the sample data in the question:
# # A tibble: 17 × 6
#    gain     value demographic      n n_all proportion
#    <chr>    <dbl> <fct>        <int> <int>      <dbl>
#  1 SU_11_15     1 hispanic         1    20     0.05
#  2 SU_11_15     2 non_hispanic     2    46     0.0435
#  3 SU_11_15     3 non_hispanic    10    46     0.217
#  4 SU_11_15     4 hispanic         7    20     0.35
#  5 SU_11_15     4 non_hispanic    12    46     0.261
#  6 SU_11_15     5 hispanic        12    20     0.6
#  7 SU_11_15     5 non_hispanic    22    46     0.478
#  8 SU_11_16     1 hispanic         1    20     0.05
#  9 SU_11_16     1 non_hispanic     3    43     0.0698
# 10 SU_11_16     2 hispanic         2    20     0.1
# # … with 7 more rows
# # ℹ Use `print(n = ...)` to see more rows

ggplot(
  df_summary,
  aes(x = gain, y = value, color = demographic, size = proportion)
) +
  geom_point(position = position_dodge(width = 0.9)) +
  # Make point area proportional to the proportion (0 maps to size 0), so
  # sizes are directly comparable between the two groups.
  scale_size_area() +
  labs(size = "Proportion")