Search code examples
rggplot2plot

Adding a mean or median line to a density plot that ends at the within the graphed area


I am trying to add a line that extends only within the borders of the plotted area on a density graph. This is similar to the question here: Adding summary information to a density plot created with ggplot but they do not go so far as to add a single vertical line within the plotted area (it extends the for entire plot). I would like each of the vertical mean lines to stop at the top of the density geom, but I can't figure out a way to do this.

In short: I want to capture the value at the top of the density graph where the mean is for each group, and have the line end at that point.

Here is some code to demonstrate:

iris <- as.data.table(iris)

iris_summary <- iris[, .(sepal_mean = mean(Sepal.Length),
                                   sepal_se_low = mean(Sepal.Length) - sd(Sepal.Length) / sqrt(length(Sepal.Length)),
                                   sepal_se_high = mean(Sepal.Length) + sd(Sepal.Length) / sqrt(length(Sepal.Length))),
                                    Species]
unique(iris$Species)
x.dens.s <- density(iris[Species == "setosa", Sepal.Length])
x.dens.ve <- density(iris[Species == "versicolor", Sepal.Length])
x.dens.vi <- density(iris[Species == "virginica", Sepal.Length])
df.dens <- data.table(x = c(x.dens.s$x, x.dens.ve$x, x.dens.vi$x), y = c(x.dens.s$y, x.dens.ve$y, x.dens.vi$y))
df.dens$Species <- c(rep("setosa", length(x.dens.s$y)), rep("versicolor", length(x.dens.ve$y)),
                 rep("virginica", length(x.dens.vi$y)))

iris_density <- 
  ggplot() +
  geom_density(data=iris, aes(x=Sepal.Length,fill=Species),alpha=0.5) +
  geom_area(data = df.dens[Species == "setosa" & 
                             x %between% c(iris_summary[Species == "setosa", sepal_se_low], 
                                           iris_summary[Species == "setosa", sepal_se_high]),],  
            aes(x=x,y=y), fill = "white", alpha = 0.5) +
  geom_area(data = df.dens[Species == "versicolor" & 
                             x %between% c(iris_summary[Species == "versicolor", sepal_se_low], 
                                           iris_summary[Species == "versicolor", sepal_se_high]),],  
            aes(x=x,y=y), fill = "white", alpha = 0.5) +
  geom_area(data = df.dens[Species == "virginica" & 
                             x %between% c(iris_summary[Species == "virginica", sepal_se_low], 
                                           iris_summary[Species == "virginica", sepal_se_high]),],  
            aes(x=x,y=y), fill = "white", alpha = 0.5)  + 
  geom_vline(data = iris_summary, 
             aes(xintercept = sepal_mean, color = Species), linetype = 2, 
             linewidth = 0.7, color = "black")
  
iris_density

enter image description here


Solution

  • You need to find the y-value corresponding to the means, that requires some rounding. Check and see if that's what you need.

    library(data.table)
    library(ggplot2)
    library(dplyr)
    #> 
    #> Attaching package: 'dplyr'
    #> The following objects are masked from 'package:data.table':
    #> 
    #>     between, first, last
    #> The following objects are masked from 'package:stats':
    #> 
    #>     filter, lag
    #> The following objects are masked from 'package:base':
    #> 
    #>     intersect, setdiff, setequal, union
    
    
    iris <- as.data.table(iris)
    
    iris_summary <- iris[, .(sepal_mean = mean(Sepal.Length),
                                       sepal_se_low = mean(Sepal.Length) - sd(Sepal.Length) / sqrt(length(Sepal.Length)),
                                       sepal_se_high = mean(Sepal.Length) + sd(Sepal.Length) / sqrt(length(Sepal.Length))),
                                        Species]
    
    unique(iris$Species)
    #> [1] setosa     versicolor virginica 
    #> Levels: setosa versicolor virginica
    
    x.dens.s <- density(iris[Species == "setosa", Sepal.Length])
    x.dens.ve <- density(iris[Species == "versicolor", Sepal.Length])
    x.dens.vi <- density(iris[Species == "virginica", Sepal.Length])
    df.dens <- data.table(x = c(x.dens.s$x, x.dens.ve$x, x.dens.vi$x), y = c(x.dens.s$y, x.dens.ve$y, x.dens.vi$y))
    df.dens$Species <- c(rep("setosa", length(x.dens.s$y)), rep("versicolor", length(x.dens.ve$y)),
                     rep("virginica", length(x.dens.vi$y)))
    
    ## Add mean value to density dataset
    df.dens <- left_join(df.dens, iris_summary)
    #> Joining with `by = join_by(Species)`
    
    
    ## Find y-value at mean value
    yend <- df.dens |> 
      group_by(Species) |> 
      filter(round(x, 2) == round(sepal_mean, 2)) |> 
      summarise(yend = mean(y))
    
    ## Plot
    iris_density <- 
      ggplot() +
      geom_density(data=iris, aes(x=Sepal.Length,fill=Species),alpha=0.5) +
      geom_area(data = df.dens[Species == "setosa" & 
                                 x %between% c(iris_summary[Species == "setosa", sepal_se_low], 
                                               iris_summary[Species == "setosa", sepal_se_high]),],  
                aes(x=x,y=y), fill = "white", alpha = 0.5) +
      geom_area(data = df.dens[Species == "versicolor" & 
                                 x %between% c(iris_summary[Species == "versicolor", sepal_se_low], 
                                               iris_summary[Species == "versicolor", sepal_se_high]),],  
                aes(x=x,y=y), fill = "white", alpha = 0.5) +
      geom_area(data = df.dens[Species == "virginica" & 
                                 x %between% c(iris_summary[Species == "virginica", sepal_se_low], 
                                               iris_summary[Species == "virginica", sepal_se_high]),],  
                aes(x=x,y=y), fill = "white", alpha = 0.5)  + 
      geom_segment(aes(x = iris_summary$sepal_mean, y = 0, yend = yend$yend), linetype = 2)
    
    
      
    iris_density
    

    Created on 2024-08-29 with reprex v2.1.0