Search code examples
rggplot2mixed-modelsconfidence-interval

How to display the confidence interval for only one of the regression lines within a mixed-effects model in ggplot2?


The following code creates a data frame with 3 columns.

set.seed(142222)
num_lots <- 5

# Create an empty data frame to store the simulated data
data <- data.frame(Lot = rep(1:num_lots, each = 9),
                   Time = rep(3 * 0:8, times = num_lots),
                   Measurement = numeric(num_lots * 9))

# Simulate purity data for each lot and time point
for (lot in 1:num_lots) {
  # Generate random intercept and slope for each lot
  intercept <- rnorm(1, mean = 95, sd = 2)
  slope <- runif(1, min = -.7, max = 0)
  
  for (month in 0:8) {
    # Simulate purity data with noise
    data[data$Lot == lot & data$Time == month * 3, "Purity"] <- intercept + slope * month * 3 + rnorm(1, mean = 0, sd = .35)
  }
}

And then I fit a mixed-effect model to the simulated data. As follow:

ggplot(data, aes(x = Time, y = Purity, color = as.factor(Lot), shape = as.factor(Lot))) +
  geom_point() +
  geom_smooth(method = "lm", se=FALSE, type = 1) +
  labs(
    title = "Test",
    x = "month",
    y = "Purity",
    color = "Lot",    # Set legend title for color
    shape = "Lot"     # Set legend title for shape
  ) +
  theme_minimal() +
  scale_x_continuous(breaks = c(0, 3, 6, 9, 12, 15, 18, 21, 24))

The result looks like the following:

enter image description here

Question: I want to show only the 95% lower confidence bound on the worst regression line. How can I do that?

Worst regression line is the line which will intersect the horizontal line == 80 earlier than other lines. I know if I set se == TRUE then all confidence bound for all of the lines will be shown. But I only want to have the lower confidence bound for only the worst line.

Bonus question: How can I fix the legend in such a way that only the symbols are shown (and not the line over them)?


Solution

  • You can plot two geom_smooth()s - one for the 4 'good' lines and one for the 1 'worst' line, e.g.

    library(tidyverse)
    
    set.seed(142222)
    num_lots <- 5
    
    # Create an empty data frame to store the simulated data
    data <- data.frame(Lot = rep(1:num_lots, each = 9),
                       Time = rep(3 * 0:8, times = num_lots),
                       Measurement = numeric(num_lots * 9))
    
    # Simulate purity data for each lot and time point
    for (lot in 1:num_lots) {
      # Generate random intercept and slope for each lot
      intercept <- rnorm(1, mean = 95, sd = 2)
      slope <- runif(1, min = -.7, max = 0)
      
      for (month in 0:8) {
        # Simulate purity data with noise
        data[data$Lot == lot & data$Time == month * 3, "Purity"] <- intercept + slope * month * 3 + rnorm(1, mean = 0, sd = .35)
      }
    }
    
    ggplot(data = data,
           aes(x = Time, y = Purity, 
               color = as.factor(Lot), 
               shape = as.factor(Lot))) +
      geom_point(key_glyph = "point") +
      geom_smooth(data = data %>% filter(Lot == 2),
                  method = "lm", se=TRUE, type = 1,
                  key_glyph = "point") +
      geom_smooth(data = data %>% filter(Lot != 2),
                  method = "lm", se=FALSE, type = 1,
                  key_glyph = "point") +
      labs(
        title = "Test",
        x = "month",
        y = "Purity",
        color = "Lot",    # Set legend title for color
        shape = "Lot"     # Set legend title for shape
      ) +
      theme_minimal() +
      scale_x_continuous(breaks = c(0, 3, 6, 9, 12, 15, 18, 21, 24))
    #> Warning in geom_smooth(data = data %>% filter(Lot == 2), method = "lm", :
    #> Ignoring unknown parameters: `type`
    #> Warning in geom_smooth(data = data %>% filter(Lot != 2), method = "lm", :
    #> Ignoring unknown parameters: `type`
    #> `geom_smooth()` using formula = 'y ~ x'
    #> `geom_smooth()` using formula = 'y ~ x'
    

    Created on 2023-10-12 with reprex v2.0.2

    Edit

    Instead of using key_glyph = "point", it's better to use show_legend = FALSE per @stefan's comment:

    library(tidyverse)
    
    set.seed(142222)
    num_lots <- 5
    
    # Create an empty data frame to store the simulated data
    data <- data.frame(Lot = rep(1:num_lots, each = 9),
                       Time = rep(3 * 0:8, times = num_lots),
                       Measurement = numeric(num_lots * 9))
    
    # Simulate purity data for each lot and time point
    for (lot in 1:num_lots) {
      # Generate random intercept and slope for each lot
      intercept <- rnorm(1, mean = 95, sd = 2)
      slope <- runif(1, min = -.7, max = 0)
      
      for (month in 0:8) {
        # Simulate purity data with noise
        data[data$Lot == lot & data$Time == month * 3, "Purity"] <- intercept + slope * month * 3 + rnorm(1, mean = 0, sd = .35)
      }
    }
    
    ggplot(data = data,
           aes(x = Time, y = Purity, 
               color = as.factor(Lot), 
               shape = as.factor(Lot))) +
      geom_point() +
      geom_smooth(data = data %>% filter(Lot == 2),
                  method = "lm", formula = "y ~ x",
                  se=TRUE,
                  show.legend = FALSE) +
      geom_smooth(data = data %>% filter(Lot != 2),
                  method = "lm", formula = "y ~ x",
                  se=FALSE,
                  show.legend = FALSE) +
      labs(
        title = "Test",
        x = "month",
        y = "Purity",
        color = "Lot",    # Set legend title for color
        shape = "Lot"     # Set legend title for shape
      ) +
      theme_minimal() +
      scale_x_continuous(breaks = c(0, 3, 6, 9, 12, 15, 18, 21, 24))
    

    Created on 2023-10-12 with reprex v2.0.2

    Edit 2

    You can automate the selection of the 'worst' line in different ways; the easiest way is to select the Lot with the lowest Purity at Time = 0, but this may change depending on your data (i.e. maybe you want to select the Lot with the lowest Purity at Time = 24?). You can draw the upper bounds only, but you have to calculate the coordinates yourself, e.g.

    library(tidyverse)
    
    set.seed(142222)
    num_lots <- 5
    
    # Create an empty data frame to store the simulated data
    data <- data.frame(Lot = rep(1:num_lots, each = 9),
                       Time = rep(3 * 0:8, times = num_lots),
                       Measurement = numeric(num_lots * 9))
    
    # Simulate purity data for each lot and time point
    for (lot in 1:num_lots) {
      # Generate random intercept and slope for each lot
      intercept <- rnorm(1, mean = 95, sd = 2)
      slope <- runif(1, min = -.7, max = 0)
      
      for (month in 0:8) {
        # Simulate purity data with noise
        data[data$Lot == lot & data$Time == month * 3, "Purity"] <- intercept + slope * month * 3 + rnorm(1, mean = 0, sd = .35)
      }
    }
    
    # Select the worst regression line
    worst <- data %>% filter(Purity == min(Purity)) %>% pull(Lot)
    
    # Build the 5 linear models
    output <- data %>%
      nest_by(Lot) %>%
      reframe(model = list(lm(data = data, formula = Purity ~ Time)))
    
    # Apply the models and extract the coordinates
    preds <- predict(output$model[[worst]], newdata = data, se.fit = TRUE)
    input_df <- data.frame(fit = preds$fit, se.fit = preds$se.fit) %>%
      bind_cols(data)
      
    # Plot data and input_df
    ggplot(data = data,
           aes(x = Time, y = Purity, 
               color = as.factor(Lot), 
               shape = as.factor(Lot))) +
      geom_point() +
      geom_smooth(method = "lm", formula = "y ~ x",
                  se=FALSE,
                  show.legend = FALSE) +
      geom_ribbon(data = input_df, aes(x = Time, y = Purity,
                                       ymin = fit, ymax = fit + se.fit * 2),
                  inherit.aes = FALSE, lty = 2, fill = "blue", alpha = 0.25) +
      labs(
        title = "Test",
        x = "month",
        y = "Purity",
        color = "Lot",    # Set legend title for color
        shape = "Lot"     # Set legend title for shape
      ) +
      theme_minimal() +
      scale_x_continuous(breaks = c(0, 3, 6, 9, 12, 15, 18, 21, 24))
    

    Created on 2023-10-12 with reprex v2.0.2