I have successfully created a Sankey image using R
's ggplot
+ geom_sankey
following the ggsankey tutorial. However, I followed this post (How to skip nodes with NA value in ggsankey?) to work around the 'NA's in the data.
However, I would like to:
Reproducible Example
devtools::install_github("davidsjoberg/ggsankey")
library(ggsankey); library(ggplot2)
#Making a df
Years <- data.frame(Earlier = c(rep(2012, 2), paste(2013), paste(2014), rep(2015, 2), rep(2018, 2), rep(2022, 2), rep(NA, 31)),
Latest = c(rep(2023, 4), rep(2022, 6), rep(2021, 10), rep(2020, 3), rep(2019, 6), rep(2018, 3), rep(2017, 3), rep(2013, 4), rep(NA, 2)),
Current = c(rep(2023, 10), rep(2022, 12), rep(2021, 11), rep(2020, 1), rep(NA, 7)))
#Shuffling
set.seed(123)
Years[sample(1:nrow(Years)), ]
df_stack <- Years %>% make_long(Earlier, Latest, Current)
head(df_stack)
#graphing
ggplot(df_stack, aes(x = x,
next_x = next_x,
node = node,
next_node = next_node,
fill = factor(node),
label = node,
color = factor(node))) +
geom_sankey(flow.alpha = 0.5, node.color = 1,
smooth = 6, width = 0.2,) + #width = width of nodes
geom_sankey_label(size = 3.5, color = 1, fill = "white") +
scale_fill_viridis_d(direction = -1, option = "turbo") +
scale_colour_viridis_d(direction = -1, option = "turbo") +
theme_sankey(base_size = 15) +
theme(legend.position = "none") + xlab('')
Which produces the following graph. I have indicated points 2 & 3 (above) on this image as well.
For point 1 (above) - I would like to chronologically align the years for easier interpretation. Here is a very rough sketch of where the nodes should be with respect with each other. It should look like the image above, but its the ordering and spacing of the nodes that I am getting at with this sorry image.
Extra info: sessionInfo() R version 4.3.0 (2023-04-21) Platform: aarch64-apple-darwin20 (64-bit) Running under: macOS Ventura 13.6
version: ggsankey_0.0.99999
Any help navigating this quagmire would be greatly appreciated. Thank-you.
The blank boxes are coming from missing values in the column df_stack$node
. You can remove the boxes by filtering the NA's out.
library(ggsankey)
library(ggplot2)
library(dplyr)
Years <- data.frame(Earlier = c(rep(2012, 2), 2013, 2014, rep(2015, 2), rep(2018, 2), rep(2022, 2), rep(NA, 31)),
Latest = c(rep(2023, 4), rep(2022, 6), rep(2021, 10), rep(2020, 3), rep(2019, 6), rep(2018, 3), rep(2017, 3), rep(2013, 4), rep(NA, 2)),
Current = c(rep(2023, 10), rep(2022, 12), rep(2021, 11), rep(2020, 1), rep(NA, 7)))
df_stack <- Years %>%
make_long(Earlier, Latest, Current) %>%
filter(!is.na(node))
# plot
ggplot(df_stack, aes(x = x,
next_x = next_x,
node = node,
next_node = next_node,
fill = factor(node),
label = node,
color = factor(node))) +
geom_sankey(flow.alpha = 0.5, node.color = 1,
smooth = 6, width = 0.2,) +
geom_sankey_label(size = 3.5, color = 1, fill = "white") +
scale_fill_viridis_d(direction = -1, option = "turbo") +
scale_colour_viridis_d(direction = -1, option = "turbo") +
theme_sankey(base_size = 15) +
theme(legend.position = "none") + xlab('')
Created on 2023-11-10 with reprex v2.0.2
EDIT
I couldn't really find clear solutions to your question about aligning the nodes in parallel by years, but I have some suggestions as to resizing or adjusting the space between nodes.
space
argument in geom_sankey
and geom_sankey_label
. This approach increases the spacing between nodes in all columns, though, not the first one only. Note that the values for space
has to be the same for both geom_sankey
and geom_sankey_label
ggplot(df_stack, aes(x = x,
next_x = next_x,
node = node,
next_node = next_node,
fill = factor(node),
label = node,
color = factor(node))) +
geom_sankey(flow.alpha = 0.5, node.color = 1,
smooth = 6, width = 0.2,
space = 15 # add spacing
) +
geom_sankey_label(size = 3.5, color = 1, fill = "white",
space = 15 # add spacing
) +
scale_fill_viridis_d(direction = -1, option = "turbo") +
scale_colour_viridis_d(direction = -1, option = "turbo") +
theme_sankey(base_size = 15) +
theme(legend.position = "none") + xlab('')
Created on 2023-11-10 with reprex v2.0.2
?make_long
), you can specify weights for the connections in your diagram. You can give more weights to the nodes in the first column Earlier
; and this will change the size and spacing accordingly. However, keep in mind that the links between the modified nodes no longer represent the original frequencies.df_stack <- Years |>
mutate(weights = if_else(is.na(Earlier), 1, 3)) |>
make_long(Earlier, Latest, Current, value = weights) |>
filter(!is.na(node))
ggplot(df_stack, aes(x = x,
next_x = next_x,
node = node,
next_node = next_node,
fill = factor(node),
label = node,
color = factor(node),
value = value)) +
geom_sankey(flow.alpha = 0.5, node.color = 1,
smooth = 6, width = 0.2) +
geom_sankey_label(size = 3.5, color = 1, fill = "white") +
scale_fill_viridis_d(direction = -1, option = "turbo") +
scale_colour_viridis_d(direction = -1, option = "turbo") +
theme_sankey(base_size = 15) +
theme(legend.position = "none") + xlab('')
Created on 2023-11-10 with reprex v2.0.2
Alternate Solution
If you're open to learning about a different package, there is also an alternate solution using networkD3::sankeyNetwork
. The output is an interactive diagram where you can move/reorganize individual nodes however you want.
library(networkD3)
library(dplyr)
# define links and nodes
# get counts and add column numbers
df1 <- Years |>
count(Earlier, Latest) |>
na.omit() |>
rename(source = Earlier, target = Latest) |>
mutate(source = paste0(source, "_1"),
target = paste0(target, "_2"))
df2 <- Years |>
count(Latest, Current) |>
na.omit() |>
rename(source = Latest, target = Current) |>
mutate(source = paste0(source, "_2"),
target = paste0(target, "_3"))
links <- bind_rows(df1, df2)
# nodes
nodes <- data.frame(id = unique(c(links$source, links$target)), stringsAsFactors = FALSE) |>
mutate(name = gsub("_\\d$", "", id))
# add source and target ids
links <- links |>
mutate(source_id = match(source, nodes$id) - 1,
target_id = match(target, nodes$id) - 1)
# plot
sn <- sankeyNetwork(Links = links, Nodes = nodes, Source = "source_id",
Target = "target_id", Value = "n", NodeID = "name")
# add x-axis labels
js_string <-
'
function(el) {
var cols_x = this.sankey.nodes()
.map(d => d.x).filter((v, i, a) => a.indexOf(v) === i)
.sort(function(a, b){return a - b});
var labels = ["Earliest", "Latest", "Current" ]
cols_x.forEach((d, i) => {
d3.select(el).select("svg")
.append("text")
.attr("x", d)
.attr("y", 12)
.attr("text-anchor", "start")
.text(labels[i]);
})
}
'
sn <- htmlwidgets::onRender(sn, js_string)
sn