Search code examples
r ggplot2 plotly distribution density-plot

Distribution plot for categorical variable


I have the following data table dt and want to construct/create a density plot of nrOrders for each unique delivYear:

structure(list(delivYear = c("2018", "2018", "2018", "2018", 
"2018", "2018", "2018", "2018", "2018", "2018", "2018", "2018", 
"2018", "2018", "2018", "2018", "2018", "2018", "2018", "2018", 
"2018", "2018", "2018", "2018", "2018", "2018", "2018", "2018", 
"2018", "2018", "2018", "2018", "2018", "2018", "2018", "2018", 
"2018", "2018", "2018", "2018", "2018", "2018", "2018", "2018", 
"2018", "2018", "2018", "2018", "2018", "2018", "2018", "2018", 
"2018", "2018", "2018", "2018", "2018", "2018", "2018", "2018", 
"2019", "2019", "2019", "2019", "2019", "2019", "2019", "2019", 
"2019", "2019", "2019", "2019", "2019", "2019", "2019", "2019", 
"2019", "2019", "2019", "2019", "2019", "2019", "2019", "2019", 
"2019", "2019", "2019", "2019", "2019", "2019", "2019", "2019", 
"2019", "2019", "2019", "2019", "2019", "2019", "2019", "2019", 
"2019", "2019", "2019", "2019", "2019", "2019", "2019", "2019", 
"2020", "2020", "2020", "2020", "2020", "2020", "2020", "2020", 
"2020", "2020", "2020", "2020", "2020", "2020", "2020", "2020", 
"2020", "2020", "2020", "2020", "2020", "2020", "2020", "2020", 
"2020", "2020", "2020", "2020", "2020", "2020", "2020", "2020", 
"2020", "2020", "2020", "2020", "2020", "2020", "2020", "2020", 
"2020", "2020", "2020", "2020", "2020", "2020", "2020", "2020", 
"2020", "2020", "2020", "2020", "2020", "2020", "2020", "2020", 
"2020", "2020", "2020", "2020"), acquiYear = c("2014", "2014", 
"2014", "2014", "2014", "2014", "2014", "2014", "2014", "2014", 
"2014", "2014", "2015", "2015", "2015", "2015", "2015", "2015", 
"2015", "2015", "2015", "2015", "2015", "2015", "2016", "2016", 
"2016", "2016", "2016", "2016", "2016", "2016", "2016", "2016", 
"2016", "2016", "2017", "2017", "2017", "2017", "2017", "2017", 
"2017", "2017", "2017", "2017", "2017", "2017", "2018", "2018", 
"2018", "2018", "2018", "2018", "2018", "2018", "2018", "2018", 
"2018", "2018", "2016", "2016", "2016", "2016", "2016", "2016", 
"2016", "2016", "2016", "2016", "2016", "2016", "2017", "2017", 
"2017", "2017", "2017", "2017", "2017", "2017", "2017", "2017", 
"2017", "2017", "2018", "2018", "2018", "2018", "2018", "2018", 
"2018", "2018", "2018", "2018", "2018", "2018", "2019", "2019", 
"2019", "2019", "2019", "2019", "2019", "2019", "2019", "2019", 
"2019", "2019", "2016", "2016", "2016", "2016", "2016", "2016", 
"2016", "2016", "2016", "2016", "2016", "2016", "2017", "2017", 
"2017", "2017", "2017", "2017", "2017", "2017", "2017", "2017", 
"2017", "2017", "2018", "2018", "2018", "2018", "2018", "2018", 
"2018", "2018", "2018", "2018", "2018", "2018", "2019", "2019", 
"2019", "2019", "2019", "2019", "2019", "2019", "2019", "2019", 
"2019", "2019", "2020", "2020", "2020", "2020", "2020", "2020", 
"2020", "2020", "2020", "2020", "2020", "2020"), month = structure(c(1L, 
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 
11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 
4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 
10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 
12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 
6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 
9L, 10L, 11L, 12L), .Label = c("Jan", "Feb", "Mar", "Apr", "May", 
"Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"), class = "factor"), 
    nrOrders = c(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 0, 2, 4, 5, 
    3, 7, 3, 5, 4, 3, 7, 8, 7, 2, 24, 16, 33, 9, 27, 16, 10, 
    27, 9, 31, 35, 11, 11, 25, 15, 18, 19, 19, 8, 27, 34, 43, 
    51, 0, 11, 2, 0, 0, 0, 0, 0, 4, 5, 1, 0, 8, 1, 18, 19, 10, 
    31, 7, 5, 19, 3, 18, 12, 2, 9, 24, 11, 12, 13, 10, 14, 17, 
    24, 20, 14, 13, 4, 0, 27, 6, 5, 13, 14, 13, 20, 17, 64, 3, 
    6, 4, 8, 1, 5, 3, 2, 2, 3, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 
    1, 0, 1, 0, 7, 1, 15, 8, 1, 16, 7, 3, 5, 14, 9, 5, 12, 16, 
    0, 13, 5, 0, 11, 7, 12, 12, 5, 35, 4, 6, 11, 11, 6, 19, 6, 
    22, 19, 52, 61, 44, 4, 6, 9, 1, 6, 2, 2, 1, 1, 0, 0, 0)), row.names = c(NA, 
-168L), class = c("data.table", "data.frame"))

The question the distribution/density plot should answer is: how is the number of orders nrOrders for each delivery year delivYear distributed over the months? I am not sure how to do that because there are no continuous variables.

How can I plot a distribution/density plot for this problem?


Solution

  • As your month variable is categorical, you may want to consider an area chart with geom_area() (areas are stacked by default; position = "identity" overlays them instead). Also, if I understood correctly, you should first summarise the number of orders for each month and delivery year before plotting:

    # Requires dplyr and ggplot2 to be attached, and `dt` as given in the
    # question. Totals the orders per month within each delivery year, then
    # draws one overlaid (non-stacked) area per delivery year.
    dt %>%
      group_by(month, delivYear) %>%
      # Drop the grouping after summarising so the plot data is ungrouped and
      # dplyr does not emit its "regrouping output" message.
      summarise(sumOrders = sum(nrOrders), .groups = "drop") %>%
      ggplot() +
      geom_area(
        aes(
          x = month,
          y = sumOrders,
          # month is a factor, so an explicit group is needed to connect the
          # monthly values of one year into a single area.
          group = delivYear,
          fill = delivYear
        ),
        # A constant transparency is set outside aes(); inside aes() it would
        # be treated as a mapped variable, rescaled by the alpha scale, and
        # shown as an extra "alpha" legend.
        alpha = 0.7,
        position = "identity"
      ) +
      theme_classic()
    

    Output:

    enter image description here