Search code examples
rggplot2

How to add significance lines to a grouped boxplot in ggplot2 with ggsignif?


My data looks like this

dput(rbind.data.frame(head(combined_iresidues), tail(combined_iresidues)))
structure(list(atomnum = c(889, 889, 889, 889, 238, 238, 121, 
121, 914, 914, 914, 914), atominfo = c("C115-SG", "C115-SG", 
"C115-SG", "C115-SG", "C30-SG", "C30-SG", "G16-C", "G16-C", "D119-CB", 
"D119-CB", "D119-CB", "D119-CB"), dx = c("d1", "d2", "d3", "d4", 
"d1", "d2", "d3", "d4", "d1", "d2", "d3", "d4"), value = c(0.0736, 
0.0912, 0.1027, 0.1046, 0.0734, 0.11, 0.0263, 0.0255, 0.0048, 
0.0185, 0.0231, 0.0184), facet_title = c("F[o]^2 - F[o]^1", "F[o]^3 - F[o]^1", 
"F[o]^4 - F[o]^1", "F[o]^5 - F[o]^1", "F[o]^2 - F[o]^1", "F[o]^3 - F[o]^1", 
"F[o]^4 - F[o]^1", "F[o]^5 - F[o]^1", "F[o]^2 - F[o]^1", "F[o]^3 - F[o]^1", 
"F[o]^4 - F[o]^1", "F[o]^5 - F[o]^1"), xstl = c("XS1-4.5", "XS1-4.5", 
"XS1-4.5", "XS1-4.5", "XS1-4.5", "XS1-4.5", "XS1-9.5", "XS1-9.5", 
"XS1-9.5", "XS1-9.5", "XS1-9.5", "XS1-9.5"), pH = c("4.5", "4.5", 
"4.5", "4.5", "4.5", "4.5", "9.5", "9.5", "9.5", "9.5", "9.5", 
"9.5"), dataset = c("b3x11", "b3x11", "b3x11", "b3x11", "b3x11", 
"b3x11", "b3x14", "b3x14", "b3x14", "b3x14", "b3x14", "b3x14"
), mean = c(0.0204145631067961, 0.0239122683142101, 0.0292734333627538, 
0.0319611650485437, 0.0204145631067961, 0.0239122683142101, 0.0273805996472663, 
0.0315843915343915, 0.0181168430335097, 0.0231408289241623, 0.0273805996472663, 
0.0315843915343915), sd = c(0.00984739879326438, 0.0112094908468026, 
0.0117753129471673, 0.0134786092559798, 0.00984739879326438, 
0.0112094908468026, 0.0110648522827486, 0.013277545556386, 0.00666584795415998, 
0.00959848563099842, 0.0110648522827486, 0.013277545556386), 
    residue_type = c("CYS", "CYS", "CYS", "CYS", "CYS", "CYS", 
    "GLY", "GLY", "ASP|GLU", "ASP|GLU", "ASP|GLU", "ASP|GLU")), row.names = c(NA, 
-12L), groups = structure(list(dx = c("d1", "d2", "d3", "d4"), 
    .rows = structure(list(c(1L, 5L), c(2L, 6L), 3L, 4L), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -4L), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

My code to generate the plot is like this

# Boxplot for interesting residues 
ggplot(data = combined_iresidues, aes(x = factor(residue_type, levels = c("CYS", "ASP|GLU", "MET", "GLY")), y = value, fill = factor(xstl))) + geom_hline(aes(yintercept = mean + 3 * sd, colour = factor(xstl)), linetype = "dashed", linewidth = 1.0) +
  geom_boxplot(outlier.alpha=0.5) +
  scale_y_continuous(breaks = c(0, 0.05, 0.10, 0.15)) +
  facet_wrap(~facet_title, labeller = label_parsed) +
  theme_bw(base_size = 16) +
  labs(
    x = "Residue type", 
    y = "y_label",
    fill = "Crystal",  # Change legend title for color
    colour = "Crystal"
  ) 

If I add to the code above + geom_signif(comparisons = list(c(1,2))) I get the lines between two different residue_types, not within the same residue_types.

From this answer (geom_signif between boxplots of different x and same group) It seems that I need to get a new x (x2) ...

combined_iresidues %>%
  mutate(x2 = interaction(xstl, residue_type)) %>%
  ggplot(aes(x = factor(x2), y = value, fill = factor(xstl))) +
  geom_hline(aes(yintercept = mean + 3 * sd, colour = factor(xstl)), linetype = "dashed", linewidth = 1.0) +
  geom_boxplot(outlier.alpha=0.5) +
  scale_y_continuous(breaks = c(0, 0.05, 0.10, 0.15))+
  facet_wrap(~facet_title, labeller = label_parsed) +
  theme_bw(base_size = 16) +
  labs(
    x = "Residue type", 
    y = "y_label",
    fill = "Crystal",  # Change legend title for color
    colour = "Crystal"
  ) +
  geom_signif(comparisons = list(c(1, 2), c(3, 4), c(5, 6), c(7, 8)), map_signif_level = TRUE, textsize = 5, y_position = 0.13) 

This generates the significance lines but it would be good to have the x labels as in the first plot.

I tried changing the labels without success with scale_x_discrete(breaks = c(1.5, 3.5, 5.5, 7.5), labels = c("CYS", "ASP|GLU", "MET", "GLY")) but I get no labels.

Is there an easier way to get the ggsignif without having a new x2? Maybe ggsignif is not the best option here? Rotating the x-axis text helps a little bit but it is an ugly solution.


Solution

  • As we can pass a function to the labels= argument of scale_x_discrete we can use e.g. gsub() to get keep only the residue_type of the interaction variable. To this end I use an underscore as the sep=arator in interaction(). Additionally I converted residue_type to a factor first to get the desired order.

    library(ggplot2)
    library(ggsignif)
    library(dplyr, warn = FALSE)
    
    combined_iresidues |>
      ungroup() |>
      mutate(
        residue_type = factor(residue_type,
          levels = c("CYS", "ASP|GLU", "MET", "GLY")
        ),
        x2 = interaction(xstl, residue_type, sep = "_")
      ) |>
      ggplot(aes(x = x2, y = value, fill = xstl)) +
      geom_hline(aes(
        yintercept = mean + 3 * sd,
        colour = factor(xstl)
      ), linetype = "dashed", linewidth = 1.0) +
      geom_boxplot(outlier.alpha = 0.5) +
      scale_y_continuous(breaks = c(0, 0.05, 0.10, 0.15)) +
      scale_x_discrete(
        labels = \(x) gsub("^.*_", "", x)
      ) +
      facet_wrap(~facet_title, labeller = label_parsed) +
      theme_bw(base_size = 16) +
      labs(
        x = "Residue type",
        y = "y_label",
        fill = "Crystal", # Change legend title for color
        colour = "Crystal"
      ) +
      ggsignif::geom_signif(
        comparisons = list(c(1, 2), c(3, 4), c(5, 6), c(7, 8)),
        map_signif_level = TRUE, textsize = 5, y_position = 0.13
      )
    #> Warning: Computation failed in `stat_signif()`.
    #> Caused by error in `wilcox.test.default()`:
    #> ! not enough 'y' observations
    #> Warning: Computation failed in `stat_signif()`.
    #> Caused by error in `wilcox.test.default()`:
    #> ! not enough 'y' observations