Search code examples
rdata.tableplotlyr-plotlysankey-diagram

How to create a sankey diagram when certain values ​are omitted


I need to create a sankey diagram in R with plotly over 3 years. My group column should be the nodes (1 == worst, 2 == bad, 3 == good and 4 == best), but however in year 2019 and 2020 I have/need an additional node 5 == not available.

My data is very large, so I'll show you just a short snippet of it:

dt.2018 <- structure(list(Year = c(2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L), GPNRPlan = c(100236L, 101554L, 111328L, 
124213L, 127434L, 128509L, 130058L, 130192L, 130224L, 130309L
), TB.Info = c("Below TB", "Over TB", "In TB", "In TB", "In TB", 
"Below TB", "Over TB", "Below TB", "Below TB", "Below TB"), Qeff = c(-0.01, 
0, 0, 0, 0, 0, 0, 0, -0.01, -0.01), group = c(1, 1, 3, 4, 2, 
2, 1, 4, 2, 3)), class = c("data.table", "data.frame"), row.names = c(NA, 
-10L))

dt.2019 <- structure(list(Year = c(2019L, 2019L, 2019L, 2019L, 2019L, 2019L, 
2019L, 2019L, 2019L, 2019L), GPNRPlan = c(100236L, 101554L, 111328L, 
124213L, 127434L, 128003L, 128509L, 130058L, 130192L, 130351L
), TB.Info = c("Below TB", "Over TB", "In TB", "In TB", "In TB", 
"Over TB", "In TB", "Over TB", "Below TB", "Over TB"), Qeff = c(-0.01, 
0.04, -0.01, 0, 0, 0, 0, 0, 0, 0), group = c(1, 2, 3, 1, 2, 4, 
1, 1, 3, 2)), class = c("data.table", "data.frame"), row.names = c(NA, 
-10L))

dt.2020 <- structure(list(Year = c(2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 
2020L, 2020L, 2020L, 2020L), GPNRPlan = c(100236L, 111328L, 128003L, 
130058L, 130192L, 133874L, 135886L, 137792L, 138153L, 142309L
), TB.Info = c("Below TB", "In TB", "Over TB", "Below TB", "Below TB", 
"Over TB", "Below TB", "Over TB", "Over TB", "In TB"), Qeff = c(0, 
-0.01, 0, 0, -0.01, 0.02, -0.01, -0.01, 0.01, 0), group = c(2, 
3, 1, 4, 2, 3, 1, 1, 2, 4)), class = c("data.table", "data.frame"
))

Now I want to see which customers (customer ID == GPNRPlan) from 2018 are still in the same group in 2019 or have changed groups and if they are no longer in 2019, then they should refer to group 5, also called not available. The same should then happen from 2019 to 2020. How could this work?

Is it possible to refer from 2018 to 2020 in the same Sankey diagram?

So my sankey diagram for this sample here looks like this (hand-made):

enter image description here


Solution

  • This mainly is a question of formatting the data correctly.

    I joined the different data.tables to get the NA values.

    Furthermore please check the different arrangement options. I don't think your req. output can be achived 100% - either nodes are overlapping, or using "snap" the order of the nodes is changed.

    library(data.table)
    library(plotly)
    library(scales)
    
    dt.2018 <- structure(list(Year = c(2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L),
                              GPNRPlan = c(100236L, 101554L, 111328L, 124213L, 127434L, 128509L, 130058L, 130192L, 130224L, 130309L),
                              TB.Info = c("Below TB", "Over TB", "In TB", "In TB", "In TB", "Below TB", "Over TB", "Below TB", "Below TB", "Below TB"),
                              Qeff = c(-0.01, 0, 0, 0, 0, 0, 0, 0, -0.01, -0.01), 
                              group = c(1, 1, 3, 4, 2, 2, 1, 4, 2, 3)),
                         class = c("data.table", "data.frame"), row.names = c(NA, -10L))
    
    dt.2019 <- structure(list(Year = c(2019L, 2019L, 2019L, 2019L, 2019L, 2019L, 2019L, 2019L, 2019L, 2019L), 
                              GPNRPlan = c(100236L, 101554L, 111328L, 124213L, 127434L, 128003L, 128509L, 130058L, 130192L, 130351L), 
                              TB.Info = c("Below TB", "Over TB", "In TB", "In TB", "In TB", "Over TB", "In TB", "Over TB", "Below TB", "Over TB"), 
                              Qeff = c(-0.01, 0.04, -0.01, 0, 0, 0, 0, 0, 0, 0),
                              group = c(1, 2, 3, 1, 2, 4, 1, 1, 3, 2)),
                         class = c("data.table", "data.frame"), row.names = c(NA, -10L))
    
    dt.2020 <- structure(list(Year = c(2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L), 
                              GPNRPlan = c(100236L, 111328L, 128003L, 130058L, 130192L, 133874L, 135886L, 137792L, 138153L, 142309L), 
                              TB.Info = c("Below TB", "In TB", "Over TB", "Below TB", "Below TB", "Over TB", "Below TB", "Over TB", "Over TB", "In TB"), 
                              Qeff = c(0, -0.01, 0, 0, -0.01, 0.02, -0.01, -0.01, 0.01, 0), group = c(2, 3, 1, 4, 2, 3, 1, 1, 2, 4)),
                         class = c("data.table", "data.frame"))
    
    lookUpDT <- data.table(group = c(as.character(1:4), "NA"), group_name = c("worst", "bad", "good", "best", "not available"), color = c("red", "orange", "yellow", "green", "darkgrey"))
    
    sankeyDT <- rbindlist(list(merge.data.table(dt.2018, dt.2019, by = "GPNRPlan", all.x = TRUE, suffixes = c(".source", ".target"))[, Year.target := 2019],
    merge.data.table(dt.2019, dt.2020, by = "GPNRPlan", all.x = TRUE, suffixes = c(".source", ".target"))[, Year.target := 2020]
    ))
    
    sankeyDT[, node_id.source := paste0(Year.source, "_", group.source)]
    sankeyDT[, node_id.target := paste0(Year.target, "_", group.target)]
    
    charCols <- c("group.source", "group.target")
    sankeyDT[,(charCols):= lapply(.SD, as.character), .SDcols = charCols]
    
    sankeyDT <- merge.data.table(sankeyDT, lookUpDT, by.x = "group.source", by.y = "group")
    
    sankeyLabelsDT <- data.table(node_id = sort(unique(c(sankeyDT$node_id.source, sankeyDT$node_id.target)), na.last = TRUE))
    sankeyLabelsDT[, c("year", "group") := tstrsplit(node_id, "_", fixed=TRUE)]
    sankeyLabelsDT[, x_scale := .GRP, by = year][, y_scale := .GRP, by = group]
    sankeyLabelsDT[, x_scale := rescale(x_scale, to=c(0, 0.9))][, y_scale := rescale(y_scale, to=c(0.2, 0.75))]
    sankeyLabelsDT <- merge.data.table(sankeyLabelsDT, lookUpDT, by = "group")
    sankeyLabelsDT[, label := paste(year, "-", group_name)]
    setorder(sankeyLabelsDT, year, group, na.last = TRUE)
    
    
    fig <- plot_ly(
      data = sankeyDT,
      type = "sankey",
      arrangement = "perpendicular", #  snap - perpendicular - freeform - fixed
      orientation = "h",
      
      node = list(
        label = sankeyLabelsDT$label,
        color = sankeyLabelsDT$color,
        x = sankeyLabelsDT$x_scale,
        y = sankeyLabelsDT$y_scale,
        pad = 10 # 10 Pixel
      ),
      
      link = list(
        source = match(sankeyDT$node_id.source, sankeyLabelsDT$node_id)-1,
        target = match(sankeyDT$node_id.target, sankeyLabelsDT$node_id)-1,
        value =  rep(1, nrow(sankeyDT)),
        label = paste("customer:", sankeyDT$GPNRPlan),
        color = sankeyDT$color # default: grey
      )
    )
    
    fig <- fig %>% layout(
      title = "Sankey Diagram",
      font = list(
        size = 10
      )
    )
    
    fig
    

    result