r ggplot2 mixed-models confidence-interval

How to display the confidence interval for only one of the regression lines within a mixed-effects model in ggplot2?

The following code creates a data frame with 3 columns.

set.seed(142222)
num_lots <- 5

# Create an empty data frame to store the simulated data
data <- data.frame(Lot = rep(1:num_lots, each = 9),
                   Time = rep(3 * 0:8, times = num_lots),
                   Measurement = numeric(num_lots * 9))

# Simulate purity data for each lot and time point
for (lot in 1:num_lots) {
  # Generate random intercept and slope for each lot
  intercept <- rnorm(1, mean = 95, sd = 2)
  slope <- runif(1, min = -.7, max = 0)
  
  for (month in 0:8) {
    # Simulate purity data with noise
    data[data$Lot == lot & data$Time == month * 3, "Purity"] <- intercept + slope * month * 3 + rnorm(1, mean = 0, sd = .35)
  }
}

And then I fit a mixed-effect model to the simulated data. As follow:

ggplot(data, aes(x = Time, y = Purity, color = as.factor(Lot), shape = as.factor(Lot))) +
  geom_point() +
  geom_smooth(method = "lm", se=FALSE, type = 1) +
  labs(
    title = "Test",
    x = "month",
    y = "Purity",
    color = "Lot",    # Set legend title for color
    shape = "Lot"     # Set legend title for shape
  ) +
  theme_minimal() +
  scale_x_continuous(breaks = c(0, 3, 6, 9, 12, 15, 18, 21, 24))

The result looks like the following:

Question: I want to show only the 95% lower confidence bound on the worst regression line. How can I do that?

Worst regression line is the line which will intersect the horizontal line == 80 earlier than other lines. I know if I set se == TRUE then all confidence bound for all of the lines will be shown. But I only want to have the lower confidence bound for only the worst line.

Bonus question: How can I fix the legend in such a way that only the symbols are shown (and not the line over them)?

Solution

You can plot two geom_smooth()s - one for the 4 'good' lines and one for the 1 'worst' line, e.g.

library(tidyverse)

set.seed(142222)
num_lots <- 5

# Create an empty data frame to store the simulated data
data <- data.frame(Lot = rep(1:num_lots, each = 9),
                   Time = rep(3 * 0:8, times = num_lots),
                   Measurement = numeric(num_lots * 9))

# Simulate purity data for each lot and time point
for (lot in 1:num_lots) {
  # Generate random intercept and slope for each lot
  intercept <- rnorm(1, mean = 95, sd = 2)
  slope <- runif(1, min = -.7, max = 0)
  
  for (month in 0:8) {
    # Simulate purity data with noise
    data[data$Lot == lot & data$Time == month * 3, "Purity"] <- intercept + slope * month * 3 + rnorm(1, mean = 0, sd = .35)
  }
}

ggplot(data = data,
       aes(x = Time, y = Purity, 
           color = as.factor(Lot), 
           shape = as.factor(Lot))) +
  geom_point(key_glyph = "point") +
  geom_smooth(data = data %>% filter(Lot == 2),
              method = "lm", se=TRUE, type = 1,
              key_glyph = "point") +
  geom_smooth(data = data %>% filter(Lot != 2),
              method = "lm", se=FALSE, type = 1,
              key_glyph = "point") +
  labs(
    title = "Test",
    x = "month",
    y = "Purity",
    color = "Lot",    # Set legend title for color
    shape = "Lot"     # Set legend title for shape
  ) +
  theme_minimal() +
  scale_x_continuous(breaks = c(0, 3, 6, 9, 12, 15, 18, 21, 24))
#> Warning in geom_smooth(data = data %>% filter(Lot == 2), method = "lm", :
#> Ignoring unknown parameters: `type`
#> Warning in geom_smooth(data = data %>% filter(Lot != 2), method = "lm", :
#> Ignoring unknown parameters: `type`
#> `geom_smooth()` using formula = 'y ~ x'
#> `geom_smooth()` using formula = 'y ~ x'

^{Created on 2023-10-12 with reprex v2.0.2}

Edit

Instead of using key_glyph = "point", it's better to use show_legend = FALSE per @stefan's comment:

library(tidyverse)

set.seed(142222)
num_lots <- 5

# Create an empty data frame to store the simulated data
data <- data.frame(Lot = rep(1:num_lots, each = 9),
                   Time = rep(3 * 0:8, times = num_lots),
                   Measurement = numeric(num_lots * 9))

# Simulate purity data for each lot and time point
for (lot in 1:num_lots) {
  # Generate random intercept and slope for each lot
  intercept <- rnorm(1, mean = 95, sd = 2)
  slope <- runif(1, min = -.7, max = 0)
  
  for (month in 0:8) {
    # Simulate purity data with noise
    data[data$Lot == lot & data$Time == month * 3, "Purity"] <- intercept + slope * month * 3 + rnorm(1, mean = 0, sd = .35)
  }
}

ggplot(data = data,
       aes(x = Time, y = Purity, 
           color = as.factor(Lot), 
           shape = as.factor(Lot))) +
  geom_point() +
  geom_smooth(data = data %>% filter(Lot == 2),
              method = "lm", formula = "y ~ x",
              se=TRUE,
              show.legend = FALSE) +
  geom_smooth(data = data %>% filter(Lot != 2),
              method = "lm", formula = "y ~ x",
              se=FALSE,
              show.legend = FALSE) +
  labs(
    title = "Test",
    x = "month",
    y = "Purity",
    color = "Lot",    # Set legend title for color
    shape = "Lot"     # Set legend title for shape
  ) +
  theme_minimal() +
  scale_x_continuous(breaks = c(0, 3, 6, 9, 12, 15, 18, 21, 24))

^{Created on 2023-10-12 with reprex v2.0.2}

Edit 2

You can automate the selection of the 'worst' line in different ways; the easiest way is to select the Lot with the lowest Purity at Time = 0, but this may change depending on your data (i.e. maybe you want to select the Lot with the lowest Purity at Time = 24?). You can draw the upper bounds only, but you have to calculate the coordinates yourself, e.g.

library(tidyverse)

set.seed(142222)
num_lots <- 5

# Create an empty data frame to store the simulated data
data <- data.frame(Lot = rep(1:num_lots, each = 9),
                   Time = rep(3 * 0:8, times = num_lots),
                   Measurement = numeric(num_lots * 9))

# Simulate purity data for each lot and time point
for (lot in 1:num_lots) {
  # Generate random intercept and slope for each lot
  intercept <- rnorm(1, mean = 95, sd = 2)
  slope <- runif(1, min = -.7, max = 0)
  
  for (month in 0:8) {
    # Simulate purity data with noise
    data[data$Lot == lot & data$Time == month * 3, "Purity"] <- intercept + slope * month * 3 + rnorm(1, mean = 0, sd = .35)
  }
}

# Select the worst regression line
worst <- data %>% filter(Purity == min(Purity)) %>% pull(Lot)

# Build the 5 linear models
output <- data %>%
  nest_by(Lot) %>%
  reframe(model = list(lm(data = data, formula = Purity ~ Time)))

# Apply the models and extract the coordinates
preds <- predict(output$model[[worst]], newdata = data, se.fit = TRUE)
input_df <- data.frame(fit = preds$fit, se.fit = preds$se.fit) %>%
  bind_cols(data)
  
# Plot data and input_df
ggplot(data = data,
       aes(x = Time, y = Purity, 
           color = as.factor(Lot), 
           shape = as.factor(Lot))) +
  geom_point() +
  geom_smooth(method = "lm", formula = "y ~ x",
              se=FALSE,
              show.legend = FALSE) +
  geom_ribbon(data = input_df, aes(x = Time, y = Purity,
                                   ymin = fit, ymax = fit + se.fit * 2),
              inherit.aes = FALSE, lty = 2, fill = "blue", alpha = 0.25) +
  labs(
    title = "Test",
    x = "month",
    y = "Purity",
    color = "Lot",    # Set legend title for color
    shape = "Lot"     # Set legend title for shape
  ) +
  theme_minimal() +
  scale_x_continuous(breaks = c(0, 3, 6, 9, 12, 15, 18, 21, 24))

^{Created on 2023-10-12 with reprex v2.0.2}