I use geom_tile()
together with geom_text
from ggplot2
to generate basically a table:
Two factors (X and Y) are plotted on the x-axis (resulting in two columns of tiles). Levels of factor Y are nested within factor X. Therefore, multiple identical tiles for factor X are plotted (one per level of factor Y). Is there a way to "merge" the tiles of factor X to larger tiles with the text occurring only once per tile? I am also open for approaches using other functions than geom_tile()
to achieve this.
The result should look like this:
Here is my code:
library(ggplot2)
dat <- data.frame(id = c(1:4, 1:4),
factor = c(rep("X", times = 4), rep("Y", times = 4)),
value = c("A", "A", "B", "B", "C", "D", "E", "F"))
ggplot(dat, aes(y = id, x = factor)) +
geom_tile(color = "black", fill = NA) +
geom_text(aes(label = value))
y = id
must be preserved.
In my actual data id is a factor:
dat <- data.frame(id = factor(c("n1", "n2", "n3", "n4", "n1", "n2", "n3", "n4")),
factor = c(rep("X", times = 4), rep("Y", times = 4)),
value = c("A", "A", "B", "B", "C", "D", "E", "F"))
A reprex that produces the problem of multiple "merged" cells per level of factor with code provided by Allan Cameron:
dat <- data.frame(id = factor(c("n1", "n2", "n3", "n4", "n1", "n2", "n3", "n4")),
factor = rep(c('X', 'Y'), each = 4),
value = c('A', 'C', 'B', 'C', 'D', 'E', 'F', 'G'))
dat %>%
mutate(id = as.numeric(factor(id))) %>%
group_by(factor) %>%
mutate(chunk = data.table::rleid(value)) %>%
group_by(factor, chunk, value) %>%
summarise(y = n()) %>%
group_by(factor) %>%
mutate(height = y) %>%
mutate(y = (cumsum(y) + cumsum(lag(y, 1, 0)))/2 + 0.5) %>%
ggplot(aes(y = y, x = factor)) +
geom_tile(aes(height = height), color = "black", fill = NA) +
geom_text(aes(label = value)) +
scale_y_continuous('id', breaks = seq_along(unique(dat$id)),
labels = levels(factor(dat$id)))
Output:
The rows containing "C" should be merged for each factor.
A subset of my real data:
dat <- structure(list(id = structure(c(3L, 4L, 5L, 6L, 1L, 7L, 2L, 8L,
3L, 4L, 5L, 6L, 1L, 7L, 2L, 8L,
3L, 4L, 5L, 6L, 1L, 7L, 2L, 8L),
levels = c("n374", "n673", "n139", "n2015",
"n344", "n36", "n467", "n76"),
class = "factor"),
factor = structure(c(1L,1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L,3L, 3L, 3L, 3L, 3L, 3L),
levels = c("phylum", "class", "genus"),
class = "factor"),
value = c("Proteobacteria", "Proteobacteria",
"Proteobacteria", "Proteobacteria",
"Bacteroidetes", "Proteobacteria",
"Bacteroidetes", "Proteobacteria",
"Alphaproteobacteria", "Betaproteobacteria",
"Alphaproteobacteria", "Alphaproteobacteria",
"Cytophagia", "Betaproteobacteria",
"Chitinophagia", "Betaproteobacteria",
"Sphingomonas", "Aquabacterium",
"Dongia", "Sphingomonas", "Chryseolinea",
"unidentified", "unidentified","Sphaerotilus")),
row.names = c(NA, -24L),
class = c("tbl_df", "tbl", "data.frame"))
This produces the following output with the code from the EDIT by Allan Cameron:
[![enter image description here][2]][2]
What I want:
The latest code by Allan Cameron still causes "lower level" tiles to merge when they were not nested in the same "higher level" tile. This affects the value "unidentified":
This can be reproduced with the following data:
dat <- structure(list(id = structure(c(3L, 4L, 5L, 1L, 6L, 2L,
3L, 4L, 5L, 1L, 6L, 2L,
3L, 4L, 5L, 1L, 6L, 2L),
levels = c("OTU_374", "OTU_673", "OTU_139",
"OTU_344", "OTU_36", "OTU_467"),
class = "factor"),
factor = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L),
levels = c("phylum", "class", "genus"),
class = "factor"),
value = c("Proteobacteria", "Proteobacteria",
"Proteobacteria", "Bacteroidetes",
"Proteobacteria", "Bacteroidetes",
"Alphaproteobacteria", "Alphaproteobacteria",
"Alphaproteobacteria", "Cytophagia",
"Betaproteobacteria", "Chitinophagia",
"Sphingomonas", "unidentified", "Sphingomonas",
"Chryseolinea", "unidentified", "unidentified")),
row.names = c(NA, -18L),
class = c("tbl_df", "tbl", "data.frame"))
"Lower level" tiles should not merge, if they were already separated by a "higher level" tile.
You could use geom_col
:
ggplot(dplyr::count(dat, value, factor), aes(y = n, x = factor)) +
geom_col(color = "black", fill = NA, position = 'stack', width = 1) +
geom_text(aes(label = value), position = position_stack(vjust = 0.5))
But a more general solution using geom_tile
would be to calculate the central point and height of each tile, mapping the latter to the height
aesthetic.
library(tidyverse)
dat %>%
mutate(id = as.numeric(factor(id))) %>%
group_by(factor) %>%
mutate(chunk = data.table::rleid(value)) %>%
group_by(factor, chunk, value) %>%
summarise(y = n()) %>%
group_by(factor) %>%
mutate(height = y) %>%
mutate(y = (cumsum(y) + cumsum(lag(y, 1, 0)))/2 + 0.5) %>%
ggplot(aes(y = y, x = factor)) +
geom_tile(aes(height = height), color = "black", fill = NA) +
geom_text(aes(label = value)) +
scale_y_continuous('id', breaks = seq_along(unique(dat$id)),
labels = levels(factor(dat$id)))
This also allows non-consecutive blocks to be merged, for example, if your data is
dat <- data.frame(id = c(1:7, 1:7),
factor = rep(c('X', 'y'), each = 7),
value = c('A', 'A', 'B', 'B', 'A', 'B', 'B',
'C', 'D', 'E', 'F', 'B', 'B', 'B'))
Then you would get
So that the IDs are always matched to the correct value and ID ordering takes precedence over cell merging.
EDIT
With some of the actual data now available, and the new information from the OP we can do:
dat2 <- dat %>%
mutate(factor = paste0(factor, '_value')) %>%
pivot_wider(names_from = factor, values_from = value) %>%
arrange(phylum_value, class_value, genus_value) %>%
mutate(id = factor(id, id)) %>%
group_by(phylum_value) %>%
mutate(phylum_chunk = cur_group_id()) %>%
group_by(phylum_value, class_value) %>%
mutate(class_chunk = cur_group_id()) %>%
group_by(phylum_value, class_value, genus_value) %>%
mutate(genus_chunk = cur_group_id()) %>%
pivot_longer(phylum_value:genus_chunk, names_sep = '_',
names_to = c('factor', '.value'))
dat2 %>%
group_by(factor, chunk, value) %>%
summarise(y = n()) %>%
mutate(factor = factor(factor, c('phylum', 'class', 'genus'))) %>%
group_by(factor) %>%
mutate(height = y) %>%
mutate(y = (cumsum(y) + cumsum(lag(y, 1, 0)))/2 + 0.5) %>%
ggplot(aes(y = y, x = factor)) +
geom_tile(aes(height = height), color = "black", fill = NA) +
geom_text(aes(label = value)) +
scale_y_continuous('id', breaks = seq_along(levels(dat2$id)),
labels = levels(dat2$id))