I have a dataset like this (but with hundreds of samples):
# Example data in long format: one row per (sample, class) pair.
#   sample - character sample ID ("C001", "C002", ...); 12 distinct samples
#   count  - integer count observed for that class in that sample
#   class  - factor with levels "a".."n"; level "j" (code 10) is declared
#            but never used, so each sample has 13 rows (12 x 13 = 156 rows)
# Within each sample the rows appear in the same class order.
# The object carries the tibble classes ("tbl_df", "tbl", "data.frame").
data <- structure(list(sample = c("C001", "C001", "C001", "C001", "C001",
"C001", "C001", "C001", "C001", "C001", "C001", "C001", "C001",
"C002", "C002", "C002", "C002", "C002", "C002", "C002", "C002",
"C002", "C002", "C002", "C002", "C002", "C003", "C003", "C003",
"C003", "C003", "C003", "C003", "C003", "C003", "C003", "C003",
"C003", "C003", "C004", "C004", "C004", "C004", "C004", "C004",
"C004", "C004", "C004", "C004", "C004", "C004", "C004", "C007",
"C007", "C007", "C007", "C007", "C007", "C007", "C007", "C007",
"C007", "C007", "C007", "C007", "C009", "C009", "C009", "C009",
"C009", "C009", "C009", "C009", "C009", "C009", "C009", "C009",
"C009", "C011", "C011", "C011", "C011", "C011", "C011", "C011",
"C011", "C011", "C011", "C011", "C011", "C011", "C012", "C012",
"C012", "C012", "C012", "C012", "C012", "C012", "C012", "C012",
"C012", "C012", "C012", "C014", "C014", "C014", "C014", "C014",
"C014", "C014", "C014", "C014", "C014", "C014", "C014", "C014",
"C015", "C015", "C015", "C015", "C015", "C015", "C015", "C015",
"C015", "C015", "C015", "C015", "C015", "C016", "C016", "C016",
"C016", "C016", "C016", "C016", "C016", "C016", "C016", "C016",
"C016", "C016", "C018", "C018", "C018", "C018", "C018", "C018",
"C018", "C018", "C018", "C018", "C018", "C018", "C018"), count = c(0L,
130L, 0L, 10L, 0L, 20L, 568L, 23L, 6L, 77L, 616L, 230734L, 177L,
10L, 6396L, 0L, 5747L, 0L, 208L, 115189L, 13130L, 1L, 38L, 200L,
2604L, 3104L, 0L, 95476L, 0L, 3591L, 0L, 7L, 26359L, 83L, 5L,
1L, 1521L, 36004L, 9779L, 12L, 852L, 0L, 13L, 5L, 329L, 152053L,
288L, 2L, 0L, 0L, 530L, 1023L, 57L, 84L, 98060L, 122L, 0L, 8552L,
668L, 209L, 7L, 0L, 155L, 10159L, 4934L, 15L, 47L, 83L, 1L, 0L,
54L, 462L, 89L, 43L, 0L, 127476L, 2614L, 3659L, 12L, 1L, 1L,
1061L, 0L, 84199L, 845L, 898L, 0L, 29L, 10L, 63L, 1834L, 87L,
36L, 7L, 407L, 20167L, 39969L, 1429L, 51072L, 0L, 0L, 27L, 9560L,
3643L, 2899L, 10L, 0L, 380L, 0L, 82L, 1543L, 55L, 765L, 25172L,
29791L, 39805L, 922L, 6L, 843L, 5L, 110L, 0L, 174L, 134582L,
575L, 15L, 65L, 37L, 19240L, 830L, 1L, 1L, 0L, 0L, 0L, 63L, 156446L,
22L, 1L, 15L, 76L, 9710L, 793L, 128L, 4L, 1L, 2L, 0L, 1904L,
199L, 98779L, 0L, 0L, 11436L, 91L, 1813L), class = structure(c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L, 13L, 14L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L,
12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L,
13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L, 13L,
14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L, 13L, 14L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L, 13L, 14L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 11L, 12L, 13L, 14L), .Label = c("a", "b",
"c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n"), class = "factor")), .Names = c("sample",
"count", "class"), row.names = c(NA, -156L), class = c("tbl_df",
"tbl", "data.frame"))
And I want to plot a stacked bar chart of this data:
library(tidyverse)
# Stacked bar chart: one bar per sample, split by class.
# position = "fill" rescales every bar to a total height of 1, so the
# segments show the share of each class within the sample.
ggplot(data, aes(x = sample, y = count, fill = class)) +
  geom_col(color = "black", position = "fill")
But as you can see, the bars are not well ordered and it's not easy to compare different samples.
So I reordered the samples by hand to make it more "beautiful" (in some ways):
# Hand-picked left-to-right order of the samples on the x axis.
sample_order <- c(
  "C001", "C014", "C009", "C018", "C012", "C004",
  "C016", "C002", "C015", "C011", "C003", "C007"
)
# The discrete x scale follows the factor levels, so setting the levels
# sets the order of the bars.
data$sample <- factor(data$sample, levels = sample_order)

# Same stacked, normalised bar chart as before, now in the manual order.
ggplot(data, aes(x = sample, y = count, fill = class)) +
  geom_col(color = "black", position = "fill")
It's probably not the best order but it's easier to compare proportions between similar samples.
In the end, I want to make plots like these (with facet_grid), but let's start from the beginning.
There is no clear best way to do this. The first thing you have to do is define some sort of dissimilarity measure between the samples. One minus the correlation seems like one (of many) possible candidates. Then you can look at how to order the samples based on that dissimilarity measure. Hierarchical clustering gives you one possible order.
In the following code I used that your sample data was ordered and complete. Otherwise you may have to adjust.
# Order the samples so that samples with similar class profiles end up
# next to each other on the x axis.

# unique samples, in order of first appearance (as character, so this also
# works when data$sample has already been converted to a factor)
samples <- as.character(unique(data$sample))

# class x sample matrix of counts. Cells are matched by name rather than by
# row position, so the rows of `data` need not be sorted, and a class that
# is missing for a sample is counted as 0. Unused factor levels are dropped
# so they do not add all-zero rows, which would change the correlations.
counts <- unclass(xtabs(count ~ class + sample, data = data,
                        drop.unused.levels = TRUE))
counts <- counts[, samples, drop = FALSE]

## dissimilarity measure: one minus the Pearson correlation between the
## count profiles of every pair of samples (one cor() call on the matrix
## instead of one data-frame subset per pair)
dm <- 1 - cor(counts)
if (anyNA(dm)) {
  # cor() returns NA for a sample whose counts are all identical; hclust()
  # cannot handle NA dissimilarities, so fail early with a clear message.
  stop("correlation is undefined for at least one sample (constant counts?)",
       call. = FALSE)
}

# single linkage clustering
hc <- hclust(as.dist(dm), method = "single")

# reorder: use the leaf order of the dendrogram as the factor level order
data$sample <- factor(data$sample, levels = samples[hc$order])
# plot: same stacked, normalised bar chart; the bars follow the level order
# of data$sample
ggplot(data, aes(x = sample, y = count, fill = class)) +
  geom_col(color = "black", position = "fill")