I have a data table of patient clusters before treatment (consensus) and after treatment (single drug), and I want to show how patients flow between clusters before and after treatment. In this case the actual cluster number doesn't mean much; the important point is that most patients who cluster together before treatment also end up together after treatment. Some move around.
Here is a screenshot of the data
dummy dataset
# dput() output of the example data: a tibble with 69 rows, one per Patient.ID.
# Columns:
#   Stimulation                    - character, "3S" for every row
#   Patient.ID                     - character patient identifier
#   `Cluster assigned consensus`   - numeric cluster label (values 1-8)
#   `Cluster assigned single drug` - character cluster label ("1"-"8")
#   count                          - numeric, 1 for every row
# NOTE(review): the expression is not assigned to a name here; presumably it is
# the object the plotting code refers to as CLL3S - confirm.
structure(list(Stimulation = c("3S", "3S", "3S", "3S", "3S",
"3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S",
"3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S",
"3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S",
"3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S",
"3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S",
"3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S"), Patient.ID = c("S3077497",
"S1041120", "S162465", "S563275", "S2911623", "S3117192", "S2859024",
"S2088278", "S3306185", "S190789", "S12146451", "S2170842", "S115594",
"S2024203", "S1063872", "S2914138", "S303984", "S570813", "S2176683",
"S820460", "S1235729", "S3009401", "S2590229", "S629309", "S1208256",
"S2572773", "S3180483", "S3032079", "S3217608", "S5566943", "S5473728",
"S104259", "S2795346", "S2848989", "S2889801", "S2813983", "S2528246",
"S3151923", "S2592908", "S2603793", "S5565867", "S3127064", "S675629",
"S834679", "S3011944", "S5011583", "S2687896", "S2998620", "S651963",
"S2104595", "S2433454", "S2565220", "S3307762", "S294778", "S995510",
"S2476822", "S140868", "S1018263", "S2990223", "S5524130", "S1042529",
"S999706", "S363003", "S2303087", "S868213", "S5568359", "S3174542",
"S521782", "S3294727"), `Cluster assigned consensus` = c(2, 2,
2, 2, 2, 5, 5, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 4, 3, 7, 4, 4, 4,
4, 4, 4, 8, 8, 4, 7, 4, 1, 1, 1, 1, 1, 1, 1, 8, 8, 8, 8, 7, 7,
7, 7, 7, 3, 7, 6, 6, 6, 6, 6, 8, 7, 7, 5, 7, 5, 7, 7, 7, 8, 8,
4, 7, 4, 7), `Cluster assigned single drug` = c("1", "1", "1",
"1", "1", "1", "1", "2", "2", "2", "2", "2", "2", "2", "2", "2",
"2", "2", "2", "2", "3", "3", "3", "3", "3", "3", "3", "4", "4",
"4", "4", "5", "5", "5", "5", "5", "5", "5", "6", "6", "6", "6",
"6", "6", "6", "6", "6", "6", "6", "7", "7", "7", "7", "7", "7",
"7", "7", "8", "8", "8", "8", "8", "8", "8", "8", "8", "8", "8",
"8"), count = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), row.names = c(NA, -69L), class = c("tbl_df",
"tbl", "data.frame"))
This is my first time making a Sankey plot, so I'm no expert. I added the count column so that each patient has a count of 1; the flow thickness is then determined by the count.
I modified from R tutorial and the code to visualise is here
library(ggplot2)
library(ggalluvial)
# Alluvial (Sankey-style) plot of per-patient cluster membership.
# Each row of CLL3S contributes `count` to the flow running from its
# consensus cluster (left axis) to its single-drug cluster (right axis).
ggplot(
  CLL3S,
  aes(
    y = count,
    axis1 = `Cluster assigned consensus`,
    axis2 = `Cluster assigned single drug`
  )
) +
  # Flows, filled by the consensus cluster each patient started in.
  geom_alluvium(aes(fill = `Cluster assigned consensus`)) +
  # Boxes for every cluster on both axes, labelled with the cluster value.
  geom_stratum() +
  geom_text(aes(label = after_stat(stratum)), stat = "stratum") +
  # Name the two axes and pad the x range slightly on both sides.
  scale_x_discrete(
    limits = c("Consensus cluster", "Single-drug cluster"),
    expand = c(0.1, 0.1)
  ) +
  labs(
    x = "Clusters",
    title = "Patient flow between the Consensus clusters and Single-drug treated clusters",
    subtitle = "3S stimulated patients"
  ) +
  theme_minimal()
This kind of works but the figure isn't pretty:
You can see that the cluster numbers are surrounded by huge, empty white boxes. How can I make the boxes smaller? And how do I color the boxes differently, and make sure that when I change the fill in geom_alluvium, the color of each flow matches the color of its box (the consensus boxes)?
You control that in geom_stratum. Try this
library(ggplot2)
library(ggalluvial)
library(RColorBrewer)
# Build one colour per cluster label found in either column, so the palette
# adapts to the data instead of assuming exactly 8 + 8 strata.
cluster_levels <- sort(unique(c(
  as.character(CLL3S$`Cluster assigned consensus`),
  as.character(CLL3S$`Cluster assigned single drug`)
)))
# brewer.pal() needs n >= 3; interpolate the 8 "Set2" colours to the number
# of clusters and name them so the fill scale matches colours by label.
cluster_colors <- setNames(
  colorRampPalette(brewer.pal(8, "Set2"))(length(cluster_levels)),
  cluster_levels
)

# Alluvial plot in which the boxes (strata) and the flows share one discrete
# fill scale: each flow takes the colour of the consensus box it leaves.
ggplot(
  data = CLL3S,
  aes(
    y = count,
    axis1 = `Cluster assigned consensus`,
    axis2 = `Cluster assigned single drug`
  )
) +
  scale_x_discrete(
    limits = c("Consensus cluster", "Single-drug cluster"),
    expand = c(.1, .1)
  ) +
  labs(x = "Clusters", fill = "Cluster") +
  # The consensus column is numeric; convert it to a factor so the flows get
  # discrete colours rather than a continuous gradient.
  geom_alluvium(
    aes(fill = factor(`Cluster assigned consensus`, levels = cluster_levels))
  ) +
  # `width` controls the box size; the fill is mapped to the stratum label
  # rather than passed as a fixed-length colour vector.
  geom_stratum(aes(fill = after_stat(stratum)), width = 1 / 4, color = "red") +
  geom_text(stat = "stratum", aes(label = after_stat(stratum))) +
  scale_fill_manual(values = cluster_colors) +
  theme_minimal() +
  ggtitle(
    "Patient flow between the Consensus clusters and Single-drug treated clusters",
    "3S stimulated patients"
  )