Search code examples
rggplot2histogramfacet-grid

Automatically order the x axis of a ggplot2 histogram in a nice way


I have a dataset like this (but with hundreds of samples):

data <- structure(list(sample = c("C001", "C001", "C001", "C001", "C001", 
                          "C001", "C001", "C001", "C001", "C001", "C001", "C001", "C001", 
                          "C002", "C002", "C002", "C002", "C002", "C002", "C002", "C002", 
                          "C002", "C002", "C002", "C002", "C002", "C003", "C003", "C003", 
                          "C003", "C003", "C003", "C003", "C003", "C003", "C003", "C003", 
                          "C003", "C003", "C004", "C004", "C004", "C004", "C004", "C004", 
                          "C004", "C004", "C004", "C004", "C004", "C004", "C004", "C007", 
                          "C007", "C007", "C007", "C007", "C007", "C007", "C007", "C007", 
                          "C007", "C007", "C007", "C007", "C009", "C009", "C009", "C009", 
                          "C009", "C009", "C009", "C009", "C009", "C009", "C009", "C009", 
                          "C009", "C011", "C011", "C011", "C011", "C011", "C011", "C011", 
                          "C011", "C011", "C011", "C011", "C011", "C011", "C012", "C012", 
                          "C012", "C012", "C012", "C012", "C012", "C012", "C012", "C012", 
                          "C012", "C012", "C012", "C014", "C014", "C014", "C014", "C014", 
                          "C014", "C014", "C014", "C014", "C014", "C014", "C014", "C014", 
                          "C015", "C015", "C015", "C015", "C015", "C015", "C015", "C015", 
                          "C015", "C015", "C015", "C015", "C015", "C016", "C016", "C016", 
                          "C016", "C016", "C016", "C016", "C016", "C016", "C016", "C016", 
                          "C016", "C016", "C018", "C018", "C018", "C018", "C018", "C018", 
                          "C018", "C018", "C018", "C018", "C018", "C018", "C018"), count = c(0L, 
                                                                                             130L, 0L, 10L, 0L, 20L, 568L, 23L, 6L, 77L, 616L, 230734L, 177L, 
                                                                                             10L, 6396L, 0L, 5747L, 0L, 208L, 115189L, 13130L, 1L, 38L, 200L, 
                                                                                             2604L, 3104L, 0L, 95476L, 0L, 3591L, 0L, 7L, 26359L, 83L, 5L, 
                                                                                             1L, 1521L, 36004L, 9779L, 12L, 852L, 0L, 13L, 5L, 329L, 152053L, 
                                                                                             288L, 2L, 0L, 0L, 530L, 1023L, 57L, 84L, 98060L, 122L, 0L, 8552L, 
                                                                                             668L, 209L, 7L, 0L, 155L, 10159L, 4934L, 15L, 47L, 83L, 1L, 0L, 
                                                                                             54L, 462L, 89L, 43L, 0L, 127476L, 2614L, 3659L, 12L, 1L, 1L, 
                                                                                             1061L, 0L, 84199L, 845L, 898L, 0L, 29L, 10L, 63L, 1834L, 87L, 
                                                                                             36L, 7L, 407L, 20167L, 39969L, 1429L, 51072L, 0L, 0L, 27L, 9560L, 
                                                                                             3643L, 2899L, 10L, 0L, 380L, 0L, 82L, 1543L, 55L, 765L, 25172L, 
                                                                                             29791L, 39805L, 922L, 6L, 843L, 5L, 110L, 0L, 174L, 134582L, 
                                                                                             575L, 15L, 65L, 37L, 19240L, 830L, 1L, 1L, 0L, 0L, 0L, 63L, 156446L, 
                                                                                             22L, 1L, 15L, 76L, 9710L, 793L, 128L, 4L, 1L, 2L, 0L, 1904L, 
                                                                                             199L, 98779L, 0L, 0L, 11436L, 91L, 1813L), class = structure(c(1L, 
                                                                                                                                                            2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 
                                                                                                                                                            4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 
                                                                                                                                                            6L, 7L, 8L, 9L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
                                                                                                                                                            8L, 9L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 
                                                                                                                                                            11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 
                                                                                                                                                            12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L, 
                                                                                                                                                            13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L, 13L, 
                                                                                                                                                            14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L, 13L, 14L, 
                                                                                                                                                            1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L, 13L, 14L, 1L, 2L, 
                                                                                                                                                            3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 
                                                                                                                                                            5L, 6L, 7L, 8L, 9L, 11L, 12L, 13L, 14L), .Label = c("a", "b", 
                                                                                                                                                                                                                "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n"), class = "factor")), .Names = c("sample", 
                                                                                                                                                                                                                                                                                                            "count", "class"), row.names = c(NA, -156L), class = c("tbl_df", 
                                                                                                                                                                                                                                                                                                                                                                   "tbl", "data.frame"))

And I want to plot a histogram of this data:

# Attach the tidyverse, which provides ggplot2
library(tidyverse)

# One stacked bar per sample; position = "fill" rescales every bar to
# proportions so that class composition can be compared across samples.
ggplot(data, aes(x = sample, y = count, fill = class)) +
  geom_col(color = "black", position = "fill")

enter image description here

But as you can see, the bars are not well ordered, and it's not easy to compare different samples.

So I reordered them by hand to make the plot more "beautiful" (in some ways):

# Hand-picked sample order: the factor levels determine the x-axis order
data[["sample"]] <- factor(
  data[["sample"]],
  levels = c(
    "C001", "C014", "C009", "C018", "C012", "C004",
    "C016", "C002", "C015", "C011", "C003", "C007"
  )
)

# Same proportional stacked bar chart, now drawn in the order set above
ggplot(data, aes(x = sample, y = count, fill = class)) +
  geom_col(color = "black", position = "fill")

enter image description here

It's probably not the best order but it's easier to compare proportions between similar samples.

At the end, I want to make plots like these (with facet_grid) but let's start from the beginning.

enter image description here Source


Solution

  • There is no clear best way to do this. The first thing you have to do is define some sort of dissimilarity measure between the samples. One minus the correlation seems like one (of many) possible candidates. Then you can look at how to order the results based on the similarity measure. Hierarchical clustering gives you a possible order.

    In the following code I used that your sample data was ordered and complete. Otherwise you may have to adjust.

    # Unique samples in order of first appearance. as.character() keeps this
    # working when `sample` has already been converted to a factor.
    samples <- unique(as.character(data$sample))
    # Class x sample matrix of counts. Cells are matched by class label rather
    # than by row position, so the rows of `data` need not be sorted; repeated
    # (class, sample) rows are summed.
    count_matrix <- tapply(
      data$count,
      list(
        class = as.character(data$class),
        sample = as.character(data$sample)
      ),
      sum
    )
    # Assumption: a class that is absent for a sample means a count of zero
    count_matrix[is.na(count_matrix)] <- 0
    # Keep the columns in first-appearance order so hc$order indexes `samples`
    count_matrix <- count_matrix[, samples, drop = FALSE]
    # Dissimilarity measure: one minus the correlation between two samples'
    # class profiles. cor() on a matrix correlates every pair of columns.
    dm <- 1 - cor(count_matrix)
    # hclust() cannot handle NA distances; fail early with a readable message
    if (anyNA(dm)) {
      stop(
        "Correlation is undefined for at least one sample (constant counts).",
        call. = FALSE
      )
    }
    # Single linkage clustering
    hc <- hclust(as.dist(dm), method = "single")
    # Reorder the samples to follow the leaf order of the dendrogram
    data$sample <- factor(data$sample, levels = samples[hc$order])
    # Plot
    ggplot(data = data, aes(x = sample)) +
      geom_col(aes(y = count, fill = class), color = "black",
               position = "fill")
    

    ordered plot