Search code examples
r ggplot2 scatter-plot proportions

Plotting proportion of count to geom_point size


I am using ggplot to create a scatterplot of survey responses between two groups. Currently, the size of each point is based on the count of answers per group. Since the groups' ns are so different, one group's points are consistently larger than the other's. I would like the size of the points to be based on the proportion of individuals within each group who chose a given answer. My current code and graph output are below:

# Dodged count plot: geom_count() sizes each point by the raw number of
# observations at that (gain, value) position within each demographic group.
ggplot(
  data,
  aes(x = gain, y = value, colour = demographic, group = demographic)
) +
  geom_count(position = position_dodge(width = 0.9))

current graph

My data currently looks like this:

structure(list(hispanic = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 
2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
1L, 1L, 1L, 1L), levels = c("hispanic", "non_hispanic"), class = "factor"), 
    gain = c("SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", 
    "SU_11_16", "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16", 
    "SU_11_15", "SU_11_16", "SU_11_15", "SU_11_16"), value = c(5, 
    5, 5, 5, 4, 4, 5, NA, 5, 4, NA, NA, 4, 2, 5, 5, 5, 5, 5, 
    5, 4, 3, 4, 4, 5, 5, 3, 3, 3, 2, 4, 4, 5, 5, NA, NA, 4, 3, 
    3, NA, 4, 3, 2, 4, 3, 1, 5, 3, NA, NA, 4, 2, 5, 4, 3, 3, 
    2, 2, 5, 3, 5, 5, 5, NA, 3, 1, 3, 3, 5, 5, 3, 1, 5, 3, 5, 
    5, 4, 3, 4, 4, 5, 3, 5, 5, 5, 5, 5, 3, 4, 3, 4, 3, 5, 4, 
    4, 2, 4, 3, 5, 3, 5, 4, 4, 3, 4, 2, 5, 5, 5, 5, 4, 4, 4, 
    3, 1, 1, 5, 5, 5, 3, 5, 3, 3, 3, 5, 3, 3, 2, 5, 5, 5, 3, 
    5, 3, 4, 4, 5, 3)), row.names = c(NA, -138L), class = c("tbl_df", 
"tbl", "data.frame"))

I would like to obtain, separately for hispanic and non-hispanic individuals, the proportion of respondents who gave each "value" answer. The values range from 1 to 5. I would like to calculate this for each gain; there are 21 of them.

I have unsuccessfully tried to calculate proportions directly in my dataframe, as I have only two columns but need to find proportions for 21 questions. I have played around with various ggplot tools, like scale_size_area and scale_size_continuous, but they only allow me to manually input a range, not change the area to the proportion of answers.


Solution

  • Try this:

    library(dplyr)
    library(ggplot2)

    # For every question (`gain`) and demographic group, compute the share of
    # that group's respondents who chose each `value`.
    #
    # The denominator is taken per (gain, demographic) -- NOT per (gain, value).
    # Grouping by (gain, value) would instead give each demographic's share of
    # everyone who picked that answer, so a single respondent could end up with
    # proportion 1 and the larger group would still dominate the point sizes.
    #
    # Note: rows with a missing `value` are counted as their own category, so
    # non-responses are part of each group's denominator.
    df_summary <- df |>
      count(gain, value, hispanic) |>
      # n_all = number of responses from this demographic for this question.
      group_by(gain, hispanic) |>
      mutate(
        n_all = sum(n),
        proportion = n / n_all
      ) |>
      ungroup() |>
      rename(demographic = hispanic)
    df_summary
    # # A tibble: 21 × 6
    #    gain     value demographic      n n_all proportion
    #    <chr>    <dbl> <fct>        <int> <int>      <dbl>
    #  1 SU_11_15     1 hispanic         1    22     0.0455
    #  2 SU_11_15     2 non_hispanic     2    47     0.0426
    #  3 SU_11_15     3 non_hispanic    10    47     0.213 
    #  4 SU_11_15     4 hispanic         7    22     0.318 
    #  5 SU_11_15     4 non_hispanic    12    47     0.255 
    #  6 SU_11_15     5 hispanic        12    22     0.545 
    #  7 SU_11_15     5 non_hispanic    22    47     0.468 
    #  8 SU_11_15    NA hispanic         2    22     0.0909
    #  9 SU_11_15    NA non_hispanic     1    47     0.0213
    # 10 SU_11_16     1 hispanic         1    22     0.0455
    # # … with 11 more rows
    # # ℹ Use `print(n = ...)` to see more rows
    
    # Point size now encodes the within-group proportion, so the two
    # demographics are directly comparable despite their different sample sizes.
    ggplot(df_summary, 
           aes(y = value, x = gain, color = demographic, size = proportion)) +
      geom_point(position = position_dodge(width = 0.9)) +
      labs(size = "Proportion")
    

    enter image description here