Mapping within-group proportion (instead of count) to geom_point size

I am using ggplot2 to create a scatterplot of survey responses for two groups. Currently the size of each point is based on the count of answers per group. Since the groups' ns are so different, one group's points are consistently larger than the other's. I would like the size of each point to be based on the proportion of individuals within each group who chose that answer. My current code and graph output are below:

# Current attempt: one dodged point per answer within each question, coloured
# by demographic; geom_count() sizes each point by the raw number of responses.
ggplot(
  data,
  aes(x = gain, y = value, colour = demographic, group = demographic)
) +
  geom_count(position = position_dodge(width = 0.9))

current graph

My data currently looks like this:

# Example data as dput() output: a 138-row tibble with three columns --
#   hispanic: factor with levels "hispanic" / "non_hispanic"
#   gain:     character question ID ("SU_11_15" or "SU_11_16" in this sample)
#   value:    numeric response (1-5 in this sample), with some NA entries
# NOTE(review): the plotting code above refers to a `demographic` column;
# presumably that is `hispanic` under another name -- confirm.
structure(list(hispanic = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 
2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
1L, 1L, 1L, 1L), levels = c("hispanic", "non_hispanic"), class = "factor"), 
    gain = c("SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16"), value = c(5, 
    5, 5, 5, 4, 4, 5, NA, 5, 4, NA, NA, 4, 2, 5, 5, 5, 5, 5, 
    5, 4, 3, 4, 4, 5, 5, 3, 3, 3, 2, 4, 4, 5, 5, NA, NA, 4, 3, 
    3, NA, 4, 3, 2, 4, 3, 1, 5, 3, NA, NA, 4, 2, 5, 4, 3, 3, 
    2, 2, 5, 3, 5, 5, 5, NA, 3, 1, 3, 3, 5, 5, 3, 1, 5, 3, 5, 
    5, 4, 3, 4, 4, 5, 3, 5, 5, 5, 5, 5, 3, 4, 3, 4, 3, 5, 4, 
    4, 2, 4, 3, 5, 3, 5, 4, 4, 3, 4, 2, 5, 5, 5, 5, 4, 4, 4, 
    3, 1, 1, 5, 5, 5, 3, 5, 3, 3, 3, 5, 3, 3, 2, 5, 5, 5, 3, 
    5, 3, 4, 4, 5, 3)), row.names = c(NA, -138L), class = c("tbl_df", 
"tbl", "data.frame"))

I would like to obtain, separately for Hispanic and non-Hispanic individuals, the proportion who gave each "value" answer. The values range from 1 to 5. I would like to calculate this for each gain (question); there are 21 of them.

I have unsuccessfully tried to calculate the proportions directly in my data frame, as I have only two columns but need to find proportions for 21 questions. I have played around with various ggplot tools, like scale_size_area and scale_size_continuous, but they only allow me to manually input a range, not change the area to the proportion of answers.

Solution

Try this:

library(dplyr)
library(ggplot2)

# `df` is the data frame from the question (columns: hispanic, gain, value).
#
# Goal: size each point by the share of a demographic group that chose a given
# answer, so that groups of very different size are comparable. The
# denominator is therefore the number of rows for the same question (gain) AND
# the same demographic group -- not the number of rows with that answer.
df_summary <- df |>
  # n = number of rows (responses) per question x answer x demographic
  count(gain, value, hispanic) |>
  # Group by question and demographic so proportions sum to 1 within each
  # group. Grouping by (gain, value) would instead give each demographic's
  # share of an answer, which still tracks group size.
  group_by(gain, hispanic) |>
  mutate(
    # Group size for this question; rows with value = NA are included.
    n_all = sum(n),
    proportion = n / n_all
  ) |>
  ungroup() |>
  rename(demographic = hispanic)
df_summary
# Expected output for the sample data in the question:
# # A tibble: 21 × 6
#    gain     value demographic      n n_all proportion
#    <chr>    <dbl> <fct>        <int> <int>      <dbl>
#  1 SU_11_15     1 hispanic         1    22     0.0455
#  2 SU_11_15     2 non_hispanic     2    47     0.0426
#  3 SU_11_15     3 non_hispanic    10    47     0.213 
#  4 SU_11_15     4 hispanic         7    22     0.318 
#  5 SU_11_15     4 non_hispanic    12    47     0.255 
#  6 SU_11_15     5 hispanic        12    22     0.545 
#  7 SU_11_15     5 non_hispanic    22    47     0.468 
#  8 SU_11_15    NA hispanic         2    22     0.0909
#  9 SU_11_15    NA non_hispanic     1    47     0.0213
# 10 SU_11_16     1 hispanic         1    22     0.0455
# # … with 11 more rows
# # ℹ Use `print(n = ...)` to see more rows

ggplot(
  df_summary,
  aes(y = value, x = gain, color = demographic, size = proportion)
) +
  # na.rm = TRUE: rows with value = NA have no y position, so drop them
  # silently rather than with a "removed rows" warning.
  geom_point(position = position_dodge(width = 0.9), na.rm = TRUE) +
  labs(size = "Proportion")