Search code examples
r ggplot2 visualization sankey-diagram

Using Sankey plot to see data flow in R ggalluvial and cosmetics by ggplot


I have a data table of patient clusters before treatment (consensus) and after treatment (single drug), and I want to show how patients flow between clusters before and after treatment. The actual cluster numbers don't mean much here; the important point is that most patients who cluster together before treatment also end up together after treatment, while some move around.

Here is a screenshot of the data: [image]

dummy dataset 

# Example data, apparently dput() output: a tibble with 69 rows (one per patient)
# and columns Stimulation, Patient.ID, `Cluster assigned consensus` (numeric),
# `Cluster assigned single drug` (character) and count (always 1).
# NOTE: the object is not assigned to a name here; the plotting code in this post
# refers to the data as `CLL3S`, so assign it (CLL3S <- structure(...)) before running.
structure(list(Stimulation = c("3S", "3S", "3S", "3S", "3S", 
"3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", 
"3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", 
"3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", 
"3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", 
"3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", 
"3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S", "3S"), Patient.ID =       c("S3077497", 
"S1041120", "S162465", "S563275", "S2911623", "S3117192", "S2859024", 
"S2088278", "S3306185", "S190789", "S12146451", "S2170842", "S115594", 
"S2024203", "S1063872", "S2914138", "S303984", "S570813", "S2176683", 
"S820460", "S1235729", "S3009401", "S2590229", "S629309", "S1208256", 
"S2572773", "S3180483", "S3032079", "S3217608", "S5566943",     "S5473728", 
"S104259", "S2795346", "S2848989", "S2889801", "S2813983", "S2528246", 
"S3151923", "S2592908", "S2603793", "S5565867", "S3127064", "S675629", 
"S834679", "S3011944", "S5011583", "S2687896", "S2998620", "S651963", 
"S2104595", "S2433454", "S2565220", "S3307762", "S294778", "S995510", 
"S2476822", "S140868", "S1018263", "S2990223", "S5524130", "S1042529", 
"S999706", "S363003", "S2303087", "S868213", "S5568359", "S3174542", 
"S521782", "S3294727"), `Cluster assigned consensus` = c(2, 2, 
2, 2, 2, 5, 5, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 4, 3, 7, 4, 4, 4, 
4, 4, 4, 8, 8, 4, 7, 4, 1, 1, 1, 1, 1, 1, 1, 8, 8, 8, 8, 7, 7, 
7, 7, 7, 3, 7, 6, 6, 6, 6, 6, 8, 7, 7, 5, 7, 5, 7, 7, 7, 8, 8, 
4, 7, 4, 7), `Cluster assigned single drug` = c("1", "1", "1", 
"1", "1", "1", "1", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "3", "3", "3", "3", "3", "3", "3", "4", "4", 
"4", "4", "5", "5", "5", "5", "5", "5", "5", "6", "6", "6", "6", 
"6", "6", "6", "6", "6", "6", "6", "7", "7", "7", "7", "7", "7", 
"7", "7", "8", "8", "8", "8", "8", "8", "8", "8", "8", "8", "8", 
"8"), count = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), row.names = c(NA, -69L), class =     c("tbl_df", 
"tbl", "data.frame"))

This is my first time making a Sankey plot, so I'm no expert. I added the count column so that each patient has a count of 1; the thickness of each flow is then the sum of the counts.

I adapted the code from an R tutorial; the code for the visualisation is here:

library(ggplot2)
library(ggalluvial)

# Alluvial (Sankey-style) plot of patient flow: the first axis is the consensus
# cluster, the second axis is the single-drug cluster, and every row adds its
# `count` to the width of the flow it belongs to.
ggplot(
  data = CLL3S,
  mapping = aes(
    y = count,
    axis1 = `Cluster assigned consensus`,
    axis2 = `Cluster assigned single drug`
  )
) +
  # Layers, drawn in this order: flows, then strata boxes, then stratum labels.
  geom_alluvium(mapping = aes(fill = `Cluster assigned consensus`)) +
  geom_stratum() +
  geom_text(mapping = aes(label = after_stat(stratum)), stat = "stratum") +
  # Name the two axes on the x scale.
  scale_x_discrete(
    limits = c("Consensus cluster", "Single-drug cluster"),
    expand = c(.1, .1)
  ) +
  theme_minimal() +
  labs(
    x = "Clusters",
    title = "Patient flow between the Consensus clusters and Single-drug treated clusters",
    subtitle = "3S stimulated patients"
  )

This kind of works but the figure isn't pretty:

enter image description here

As you can see, the cluster numbers are surrounded by huge, empty white boxes. How can I make those boxes smaller? And how do I colour the boxes differently, so that when I change the fill in geom_alluvium the colour of each flow matches the colour of its (consensus) box?


Solution

  • You control that in geom_stratum(). Try this:

    library(ggplot2)
    library(ggalluvial)
    library(RColorBrewer)
    
    # Build ONE discrete palette that is shared by the flows and the strata boxes,
    # so a flow always has the same colour as the consensus box it leaves.
    # The labels are collected from both cluster columns instead of hard-coding
    # "8 clusters per axis", so the code keeps working if the clusters change.
    cluster_levels <- sort(unique(c(
      as.character(CLL3S[["Cluster assigned consensus"]]),
      as.character(CLL3S[["Cluster assigned single drug"]])
    )))

    # One colour per cluster label, interpolated from the 8-colour "Set2" palette
    # and named by label so scale_fill_manual() matches colours by name.
    cluster_palette <- setNames(
      colorRampPalette(brewer.pal(8, "Set2"))(length(cluster_levels)),
      cluster_levels
    )

    ggplot(
      data = CLL3S,
      aes(
        y = count,
        axis1 = `Cluster assigned consensus`,
        axis2 = `Cluster assigned single drug`
      )
    ) +
      scale_x_discrete(
        limits = c("Consensus cluster", "Single-drug cluster"),
        expand = c(.1, .1)
      ) +
      labs(x = "Clusters", fill = "Cluster") +
      # The consensus column is numeric in the example data; wrapping it in
      # factor() makes the fill discrete instead of a continuous gradient.
      geom_alluvium(
        aes(fill = factor(`Cluster assigned consensus`, levels = cluster_levels))
      ) +
      # `width` sets the box width (narrower than the ggalluvial default);
      # after_stat(stratum) fills each box by its own cluster label.
      geom_stratum(aes(fill = after_stat(stratum)), width = 1/4, color = "red") +
      geom_text(stat = "stratum", aes(label = after_stat(stratum))) +
      scale_fill_manual(values = cluster_palette) +
      theme_minimal() +
      ggtitle("Patient flow between the Consensus clusters and Single-drug treated clusters",
              "3S stimulated patients")
    

    output