Search code examples
r, ggplot2, visualization

I need help in smoothing out normal curves, is geom_smooth() useful in this regard or something else?


Current Output:
Current Output

I've plotted two normal curves overlaid on a histogram. I fitted each curve by manually selecting a range of values, and drew everything with ggplot2.

However, I can't figure out how to smooth the curves, as I'm relatively new to R and RStudio. Can anyone help?

My current code:

# Restrict the data to the masses_kDa interval used to fit each normal curve.
# Change the bounds below to move an interval.
data_m1 <- filter(data, masses_kDa >= 0, masses_kDa <= 150)
data_m2 <- filter(data, masses_kDa >= 300, masses_kDa <= 900)

# Mean and standard deviation of each interval (parameters of the curves)
mean_m1 <- mean(data_m1$masses_kDa)
mean_m2 <- mean(data_m2$masses_kDa)

sd_m1 <- sd(data_m1$masses_kDa)
sd_m2 <- sd(data_m2$masses_kDa)

library(ggplot2)

# Factors that rescale a density to histogram counts:
# the bin width and the number of observations in each interval
bin_width <- 50
total_points_m1 <- nrow(data_m1)
total_points_m2 <- nrow(data_m2)

# Plot the histogram of masses_kDa with the two fitted normal curves on top.
# NOTE(review): the curves are fitted on subsets of `data`, while the
# histogram is drawn from `cleaned_data` -- confirm both hold the same values.
ggplot(cleaned_data, aes(x = masses_kDa)) +
  # Use binwidth (not bins): the curves below are scaled by bin_width, so the
  # histogram has to be drawn with exactly that bin width for heights to match.
  geom_histogram(
    color = "transparent",
    fill = "red",
    alpha = 0.5,
    binwidth = bin_width # Adjust the bin width through bin_width
  ) +
  labs(x = " ", y = " ", title = " ") +
  # Add the normal curves to the histogram
  # (density * bin width * number of points = expected count per bin)
  stat_function(
    fun = function(x) {
      dnorm(x, mean = mean_m1, sd = sd_m1) * bin_width * total_points_m1
    },
    col = "#8B0000"
  ) +
  stat_function(
    fun = function(x) {
      dnorm(x, mean = mean_m2, sd = sd_m2) * bin_width * total_points_m2
    },
    col = "#8B0000"
  ) +
  # Add the mean lines to the histogram. xintercept is passed as a constant
  # rather than through aes(), so each line is drawn once instead of once per
  # row of the data.
  geom_vline(xintercept = mean_m1, linetype = "dashed", color = "black") +
  geom_vline(xintercept = mean_m2, linetype = "dashed", color = "black") +
  # Ensure there are no gaps between the graph and axis
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  coord_cartesian(xlim = c(0, 2000)) + # Adjust the cutoff point here
  theme(
    panel.background = element_blank(), # Make the background clear
    axis.line = element_line(color = "black")
  )



Solution

  • A simple fix is to have each stat_function() call evaluate the function at more points along the x axis by setting n = 1001 (the default is 101):

    library(ggplot2)

    # Example data: 1000 draws around 750 plus 30 draws around 75
    df <- tibble::tibble(
      x = c(
        rnorm(1000, mean = 750, sd = 180),
        rnorm(30, mean = 75, sd = 25)
      )
    )

    # Bin width shared by the histogram and the curve scaling
    bin_width <- 50

    # Normal density rescaled to counts: density * bin width * group size
    scaled_normal <- function(mu, sigma, size) {
      function(x) dnorm(x, mean = mu, sd = sigma) * bin_width * size
    }

    # One curve layer, evaluated at 1001 points so that it is drawn smoothly
    curve_layer <- function(mu, sigma, size) {
      stat_function(
        fun = scaled_normal(mu, sigma, size),
        n = 1001,
        colour = "#8B0000"
      )
    }

    ggplot(df, aes(x)) +
      geom_histogram(
        binwidth = bin_width,
        fill = "red",
        colour = "transparent",
        alpha = 0.5
      ) +
      curve_layer(mu = 750, sigma = 180, size = 1000) +
      curve_layer(mu = 75, sigma = 25, size = 30) +
      # No padding between the plotted data and the axes
      scale_x_continuous(expand = c(0, 0)) +
      scale_y_continuous(expand = c(0, 0)) +
      coord_cartesian(xlim = c(0, 2000)) +
      theme(
        axis.line = element_line(color = "black"),
        panel.background = element_blank()
      )