Search code examples
r histogram gplots

How to add a density curve and mean line to geom_histogram?


I am currently writing my thesis and there is one thing I could not solve by searching the web. I have some datasets and I have to compare the results before and after, and I would like to visualise this by comparing two sets of histograms (both sets contain 5 plots). I think it is a really easy problem for you guys, but a little help is still needed.

I tried a few things but I keep ending up messing up the ggplot. I know that I probably have to add two small lines of code, but I am really struggling to find them.

I have got the following code that works for the time being.

# Keep the first five columns of clust1_mat as a data frame.
df <- as.data.frame(clust1_mat[,1:5])

# Histograms of the gathered (key, value) pairs, one facet per key, laid out
# in a single row with an independent x axis per facet.
p1 <- ggplot(gather(df), aes(value)) + 
        geom_histogram(bins = 10) +
        facet_wrap(~key, scales = 'free_x', nrow= 1) +
        xlab("Average results students in CLuster 1")

# NOTE: the result of this expression is not assigned, so the object p1 keeps
# only the histogram layer; the density layer exists in this printed plot only.
p1 + geom_density(fill="lightblue")

# df is overwritten here with the first five columns of cijfers_list.
df <- as.data.frame(cijfers_list[,1:5])


# Same faceted histogram as p1, built from the new df.
p2 <- ggplot(gather(df), aes(value)) +  
        geom_histogram(bins = 10) + 
        facet_wrap(~key, scales = 'free_x', nrow=1) +
        xlab("Average results students before clustering")

# NOTE: as with p1, this density layer is not stored back into p2.
p2 + geom_density(fill="lightblue")



# Stack the two stored plots in two rows; since p1 and p2 were never
# reassigned, neither contains the geom_density layer at this point.
grid.arrange(p1, p2, nrow=2)

I would like to add a shaded density curve and a red vertical line on the mean of each histogram.

clust1_mat data:


structure(list(`BSTAT-TH` = c(6.9, 7, 8.1, 7.1, 6.2, 7, 6.2, 
7.7, 9.3, 6.3, 6.7, 6.9, 6.6, 5.3, 6.5, 6.3, 6.8, 7.3, 7.1, 6.9, 
7, 7, 6.5, 5.8, 6.2, 6.4, 7, 6.6, 9.5, 8, 6.5, 9, 7.3, 6.5, 7.4, 
6.9, 7.3, 6.2, 7.6, 7.1, 7.7, 5.2, 7, 6.5, 7.5, 6.9, 6.8, 7.4, 
9.2, 6.2, 9.2, 7.4, 9, 7.1, 5.7, 7.1, 8.4, 7.2, 8.8, 8.9, 5.7, 
7.1), `GRAAF-TH` = c(9.1, 6.5, 5.9, 7.3, 6.9, 7, 8.6, 8.4, 7.7, 
7, 7.2, 7.7, 8.3, 6.5, 7.7, 8.6, 8.5, 7.5, 7, 7.1, 5.9, 6.3, 
7.8, 8.3, 7.9, 8.1, 7.7, 7.5, 7.2, 9.2, 7.5, 9.4, 8.4, 5.8, 7.9, 
7.2, 7.6, 7.8, 8.7, 7.9, 7, 8.1, 7.3, 7.8, 7.7, 6.3, 6.2, 7.6, 
9.1, 7, 9.4, 9.2, 9.3, 7.4, 8.3, 7.2, 5.7, 8.7, 5.4, 7.7, 6.7, 
6.6), `BWISK-TH` = c(5.5, 6.1, 7.7, 5.2, 5.4, 6.3, 6.3, 3.8, 
5.4, 5.7, 4.7, 6.6, 6.9, 5.8, 4.8, 6.3, 6, 6.1, 7.1, 6.2, 6.3, 
6.1, 4.7, 5.9, 6.2, 4.9, 3.4, 5.5, 5.3, 4.2, 5.3, 5.2, 6, 5.9, 
5.9, 5.4, 6.2, 6.2, 5.7, 3.3, 6.5, 5.3, 6.3, 6.2, 6.5, 6.1, 5.8, 
4, 5.2, 6.4, 5.8, 3.8, 5.1, 5.8, 6, 6.1, 4.2, 5.4, 4.3, 5.4, 
4.7, 6.4), `CALEID-TH` = c(7.1, 6, 5.1, 6.6, 6.3, 4.9, 6.9, 4.7, 
6.4, 5.8, 5.7, 7.2, 5.8, 5.8, 5.5, 6.4, 5.8, 4.7, 5.7, 4.9, 5.1, 
5.8, 6, 6.9, 6.2, 5, 4.3, 5.5, 5.9, 4.4, 6.2, 6.2, 5.6, 6, 6.5, 
7.5, 4.3, 6.2, 6, 4.7, 6.3, 6.6, 4.4, 6.6, 6.1, 6.2, 5.3, 5.8, 
6.5, 6.1, 6.1, 4.8, 6, 5, 6.3, 7.4, 6.2, 6.2, 5.9, 6.2, 4.3, 
7.1), `COVA1-PR` = c(7.5, 8, 7.5, 7.5, 6, 7, 6.5, 7.5, 6.5, 6, 
7.5, 6, 7.5, 6.5, 7.5, 7, 8.5, 8, 7, 8, 6.5, 7, 7.5, 7.5, 8, 
7.7, 7.5, 6, 6, 6.5, 5.5, 6, 8, 8.5, 8, 7, 7.5, 8.5, 8.5, 7.5, 
6, 7, 8, 7, 8, 8, 6.5, 7.5, 6, 6.5, 6.5, 6.5, 6, 6.5, 7, 6, 6.5, 
8, 6, 6, 6.5, 6), cluster = c(`4` = 1L, `8` = 1L, `9` = 1L, `10` = 1L, 
`11` = 1L, `13` = 1L, `16` = 1L, `20` = 1L, `25` = 1L, `28` = 1L, 
`31` = 1L, `32` = 1L, `34` = 1L, `35` = 1L, `36` = 1L, `39` = 1L, 
`40` = 1L, `41` = 1L, `43` = 1L, `44` = 1L, `45` = 1L, `47` = 1L, 
`49` = 1L, `51` = 1L, `52` = 1L, `53` = 1L, `57` = 1L, `63` = 1L, 
`66` = 1L, `68` = 1L, `70` = 1L, `71` = 1L, `73` = 1L, `74` = 1L, 
`76` = 1L, `77` = 1L, `78` = 1L, `79` = 1L, `81` = 1L, `82` = 1L, 
`86` = 1L, `89` = 1L, `90` = 1L, `92` = 1L, `93` = 1L, `96` = 1L, 
`97` = 1L, `99` = 1L, `101` = 1L, `106` = 1L, `107` = 1L, `108` = 1L, 
`109` = 1L, `111` = 1L, `115` = 1L, `116` = 1L, `118` = 1L, `120` = 1L, 
`124` = 1L, `125` = 1L, `126` = 1L, `127` = 1L)), row.names = c(4L, 
8L, 9L, 10L, 11L, 13L, 16L, 20L, 25L, 28L, 31L, 32L, 34L, 35L, 
36L, 39L, 40L, 41L, 43L, 44L, 45L, 47L, 49L, 51L, 52L, 53L, 57L, 
63L, 66L, 68L, 70L, 71L, 73L, 74L, 76L, 77L, 78L, 79L, 81L, 82L, 
86L, 89L, 90L, 92L, 93L, 96L, 97L, 99L, 101L, 106L, 107L, 108L, 
109L, 111L, 115L, 116L, 118L, 120L, 124L, 125L, 126L, 127L), class = "data.frame")

Thanks!


Solution

  • Edited to add provided data.

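    Adding a density curve to fit a histogram can be tricky - the key is mapping the density layer's y to the computed count (rather than the default density) and multiplying it by the bin width (the `binwidth` you pass to `geom_histogram()`), so that the curve ends up on the same scale as the bars.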

    Here's some dummy data and a couple examples:

    library(tidyverse)
    
    # Fix the RNG seed so the simulated data (and therefore the plots) are
    # reproducible from run to run.
    set.seed(42)

    # Four log-normal samples reshaped to long format: `key` holds the original
    # column name, `value` the draw, and `mean` the per-key mean used for the
    # vertical line in each facet.
    df <-
      tibble(
        a = rlnorm(1000, meanlog = 2, sdlog = .4),
        b = rlnorm(1000, meanlog = 2.2, sdlog = .4),
        c = rlnorm(1000, meanlog = 1.9, sdlog = .4),
        d = rlnorm(1000, meanlog = 2.1, sdlog = .4)
      ) %>%
      pivot_longer(everything(), names_to = "key", values_to = "value") %>%
      group_by(key) %>%
      mutate(mean = mean(value)) %>% # per-key mean, plotted as the red line
      ungroup()

    bin_width <- 1 # histogram bin WIDTH in data units (not a number of bins)

    df %>%
      ggplot(aes(value)) +
      # after_stat(count) is density * n; a histogram bar holds roughly
      # density * n * bin width observations, so scaling by the bin width puts
      # the curve on the same y scale as the bars.
      geom_density(aes(y = after_stat(count) * bin_width),
                   fill = "blue", alpha = .3, col = NA) +
      geom_histogram(binwidth = bin_width, alpha = .5) + # same width as above
      geom_vline(aes(xintercept = mean), col = "red") +
      theme_minimal() +
      labs(y = "count") +
      facet_wrap(~ key, ncol = 2)
    

    Let's try a different bin width:

    bin_width <- 2.5 # histogram bin WIDTH in data units (not a number of bins)

    # Reuses the long-format `df` (columns key, value, mean) built earlier.
    df %>%
      ggplot(aes(value)) +
      # Scale the density layer's count by the bin width so the curve matches
      # the height of the histogram bars.
      geom_density(aes(y = after_stat(count) * bin_width),
                   fill = "blue", alpha = .3, col = NA) +
      geom_histogram(binwidth = bin_width, alpha = .5) +
      geom_vline(aes(xintercept = mean), col = "red") +
      theme_minimal() +
      labs(y = "count") +
      facet_wrap(~ key, ncol = 2)
    

    Hope this is what you were looking for!

    Probably a bit more finessing needed to get the plot perfect but here's a first whack at the data you provided:

    library(tidyverse)
    
    # `your_data` is a placeholder for the data frame from the question; only
    # its first five columns are kept and reshaped to long format
    # (key = column name, value = observation), with the per-key mean attached.
    df <- your_data %>%
      select(1:5) %>%
      pivot_longer(everything(), names_to = "key", values_to = "value") %>%
      group_by(key) %>%
      mutate(mean = mean(value)) %>%
      ungroup()

    bin_width <- 1 # histogram bin WIDTH in data units (not a number of bins)

    df %>%
      ggplot(aes(value)) +
      # Scale the density layer's count by the bin width so the curve matches
      # the height of the histogram bars.
      geom_density(aes(y = after_stat(count) * bin_width),
                   fill = "blue", alpha = .3, col = NA) +
      geom_histogram(binwidth = bin_width, alpha = .5) +
      geom_vline(aes(xintercept = mean), col = "red") +
      theme_minimal() +
      labs(y = "count") +
      facet_wrap(~ key, ncol = 1) + # one facet per row
      # Fixed y/x unit ratio; a small value keeps each stacked panel flat.
      coord_fixed(ratio = .04) +
      # Same 1-10 x axis in every facet, with a break at each integer.
      scale_x_continuous(limits = c(1, 10), breaks = 1:10, minor_breaks = NULL)
    

    Created on 2019-10-25 by the reprex package (v0.3.0)