Search code examples
rggplot2boxplotggpubr

Adding significance bars in a ggplot2() boxplot between two groups in the same x value


I am using the built-in ToothGrowth dataset as toy data to make boxplots with the package ggplot2, and am also trying to add significance bars. I got it working using the package ggpubr, where I compare the x-values. Below is a screenshot of the error bars that only compare dose, but I'd like to calculate the significance between each supp value. For example, when only looking at dose = 0.5, are the len values in OJ and VC statistically significantly different. I have the p-values that compare OJ and VC in each dose, but don't know how to plot them

I have included all the code to fully reproduce this data, and I am now trying to add pval_0.5, pval_1, and pval_2 to this plot. I can't figure out how to specify geom_signif() to compare the samples.

https://i.sstatic.net/oE7g79A4.png

Here below is making the original plot with significance bars (very manually formatted):


ToothGrowth$dose <- as.factor(ToothGrowth$dose)
head(ToothGrowth)

# Filter based on len
len_0.5 <- ToothGrowth |>
            dplyr::filter(dose == "0.5") |>
            dplyr::select(len)

len_1 <- ToothGrowth |>
            dplyr::filter(dose == "1") |>
            dplyr::select(len)

len_2 <- ToothGrowth |>
            dplyr::filter(dose == "2") |>
            dplyr::select(len)

# Get p-values
stat_0.5v1 <- t.test(len_0.5, len_1,
                     var = F)

stat_0.5v2 <- t.test(len_0.5, len_2,
                     var = F)

stat_1v2 <- t.test(len_0.5, len_2,
                     var = F)

pval_0.5v1 <- stat_0.5v1$p.value
pval_0.5v2 <- stat_0.5v2$p.value
pval_1v2 <- stat_1v2$p.value

Manually setting parameters for ggplot2


t.test_compare <- list(c("0.5", "1"), 
                       c("0.5", "2"), 
                       c("1", "2"))

t.test_annot <- c(signif(pval_0.5v1, digits = 3), 
                  signif(pval_0.5v2, digits = 3), 
                  signif(pval_1v2, digits = 3))

t.test_ypos <- c(max(ToothGrowth$len)*1.05, 
                 max(ToothGrowth$len)*1.15, 
                 max(ToothGrowth$len)*1.25)

Plotting


library(ggpubr)

p <- ggplot(data = ToothGrowth,
            mapping = aes(x = dose,
                          y = len,
                          fill = supp)) +
      geom_boxplot(position = position_dodge(1)) +
      theme_classic() +
      geom_signif(data = ToothGrowth, 
                  comparisons = t.test_compare,
                  map_signif_level = T,
                  annotations = t.test_annot,
                  y_position = t.test_ypos)
    
p

Get more p-values to compare OJ and VC


len_0.5.oj <- ToothGrowth |>
            dplyr::filter(dose == "0.5", supp == "OJ") |>
            dplyr::select(len)

len_0.5.vc <- ToothGrowth |>
            dplyr::filter(dose == "0.5", supp == "VC") |>
            dplyr::select(len)

len_1.oj <- ToothGrowth |>
            dplyr::filter(dose == "1", supp == "OJ") |>
            dplyr::select(len)

len_1.vc <- ToothGrowth |>
            dplyr::filter(dose == "1", supp == "VC") |>
            dplyr::select(len)

len_2.oj <- ToothGrowth |>
            dplyr::filter(dose == "2", supp == "OJ") |>
            dplyr::select(len)

len_2.vc <- ToothGrowth |>
            dplyr::filter(dose == "2", supp == "VC") |>
            dplyr::select(len)

stat_0.5 <- t.test(len_0.5.oj, len_0.5.vc,
                   var = F)

stat_1 <- t.test(len_1.oj, len_1.vc,
                   var = F)

stat_2 <- t.test(len_2.oj, len_2.vc,
                   var = F)

pval_0.5 <- stat_0.5$p.value

pval_1 <- stat_1$p.value

pval_2 <- stat_2$p.value


Solution

  • First things first from a statistical point of view. I would not calculate a bunch a 2 sample t.tests and rely instead on an ANOVA model which accounts for the overall variability. Having said that, of course you can calculate the p-values the way you like, just adjust the p-values in the code below.

    library(dplyr)
    library(emmeans)
    library(ggplot2)
    library(ggsignif)
    
    ToothGrowth$dose <- as.factor(ToothGrowth$dose)
    
    ## lm model which accounts for `dose` and `supp`
    tg_mod <- lm(len ~ dose * supp, data = ToothGrowth)
    
    ## use emmeans to get the contrasts
    emm <- emmeans(tg_mod, ~ supp | dose)
    
    ## depending on your use case you want to adjust for multiplicity 
    ## (remove the `adjust` argument)
    (con <- contrast(emm, method = "pairwise", adjust = "none") %>% 
        as_tibble() %>% 
        mutate(x_pos = as.integer(dose)) %>% 
        inner_join(ToothGrowth %>% 
                     summarize(y_pos = max(len), .by = dose) %>% 
                     mutate(y_pos = y_pos + 5),
                   "dose") %>% 
        mutate(p.value = signif(p.value, digits = 3L)))
    
    # # A tibble: 3 × 9
    #   contrast dose  estimate    SE    df t.ratio p.value x_pos y_pos
    #   <fct>    <fct>    <dbl> <dbl> <dbl>   <dbl>   <dbl> <int> <dbl>
    # 1 OJ - VC  0.5     5.25    1.62    54  3.23   0.00209     1  26.5
    # 2 OJ - VC  1       5.93    1.62    54  3.65   0.00059     2  32.3
    # 3 OJ - VC  2      -0.0800  1.62    54 -0.0493 0.961       3  38.9
    

    Now that we have the p-values calculated we can add them to the plot. the argument comparisons won't do though, because we used a dodged boxplot, so we need ot use xmin and xmax alternatively:

    ggplot(data = ToothGrowth,
           mapping = aes(x = dose,
                         y = len,
                         fill = supp)) +
      geom_boxplot(position = position_dodge(1)) +
      theme_classic() +
      geom_signif(xmin = con %>% pull(x_pos) - .25,
                  xmax = con %>% pull(x_pos) + .25,
                  annotations = con %>% pull(p.value),
                  y_position = con %>% pull(y_pos))
    

    A boxplot with dose on the x-Axis and dodges boxplots (according to supp). A significance bracket is added on top of each doe group with the p-value as annotation