Search code examples
Tags: python, plotly, plotly-python, sankey-diagram

How do I make a Sankey diagram with Plotly with one layer that goes only one level?


I want to make a Sankey diagram that splits into different levels (obviously), but one of the branches should stop after the first level, because the further steps do not apply to it. The data looks much like this:

import pandas as pd

# Sample data, one record per observation. "?" marks a level that does not
# apply: the "not an animal" row has no animal, sex or status.
pd.DataFrame(
    [
        ("not an animal", "?", "?", "?", 8),
        ("animal", "cat", "female", "domesticated", 10),
        ("animal", "cat", "female", "domesticated", 11),
        ("animal", "dog", "male", "wild", 14),
        ("animal", "cat", "male", "domesticated", 6),
    ],
    columns=["kind", "animal", "sex", "status", "count"],
)

    kind            animal  sex     status          count
0   not an animal   ?       ?       ?               8
1   animal          cat     female  domesticated    10
2   animal          cat     female  domesticated    11
3   animal          dog     male    wild            14
4   animal          cat     male    domesticated    6

"Not an animal" shouldn't split into further levels, since they do not apply. It should look like this:

enter image description here


Solution

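  • first restructure the data frame into source / target pairs, drop the pairs whose source is "?", and sum the counts:
      source target count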
    0 animal cat 27
    1 animal dog 14
    2 cat female 21
    3 cat male 6
    4 dog male 14
    5 female domesticated 21
    6 male domesticated 6
    7 male wild 14
    8 not an animal ? 8
    • then it becomes a case of building arrays of nodes and links

    full code

    import pandas as pd
    import numpy as np
    import plotly.graph_objects as go
    import io
    
    # Rebuild the question's sample frame from its printed representation.
    # The header row has one field fewer than the data rows, so pandas uses
    # the leading row number as the index rather than as a data column.
    df2 = pd.read_csv(
        io.StringIO(
            """    kind            animal  sex     status          count
    0   not an animal   ?       ?       ?               8
    1   animal          cat     female  domesticated    10
    2   animal          cat     female  domesticated    11
    3   animal          dog     male    wild            14
    4   animal          cat     male    domesticated    6"""
        ),
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        # Fields are separated by two or more whitespace characters, so the
        # single spaces inside "not an animal" do not split that value.
        sep=r"\s\s+",
        engine="python",
    )

    # Each adjacent pair of level columns (kind -> animal, animal -> sex,
    # sex -> status) contributes one set of source/target links; the trailing
    # "count" column is the link weight.
    df = (
        pd.concat(
            [
                df2[[src, dst, "count"]].rename(
                    columns={src: "source", dst: "target"}
                )
                for src, dst in zip(df2.columns[:-2], df2.columns[1:-1])
            ]
        )
        # "?" marks a level that does not apply, so nothing flows out of it.
        .loc[lambda d: d["source"].ne("?")]
        # Collapse duplicate source/target pairs into one weighted link.
        .groupby(["source", "target"], as_index=False)
        .sum()
    )

    # Sankey links refer to nodes by integer position, so map every distinct
    # label (sorted by np.unique) to its position in the label array.
    nodes = np.unique(df[["source", "target"]], axis=None)
    nodes = pd.Series(index=nodes, data=range(len(nodes)))
    
    # Build the Sankey figure: node labels are the index of `nodes`, and each
    # link endpoint is the integer position looked up for its label.
    go.Figure(
        data=go.Sankey(
            node=dict(label=nodes.index),
            link=dict(
                source=nodes.loc[df["source"]],
                target=nodes.loc[df["target"]],
                value=df["count"],
            ),
        )
    )
    

    enter image description here

    data frame structuring in stages

    # Adjacent level columns, excluding the trailing "count" column:
    # [first, second], [second, third], ...
    col_pairs = [list(pair) for pair in zip(df2.columns[:-2], df2.columns[1:-1])]

    # Stage 1: stack one source/target/count frame per column pair.
    df = pd.concat(
        [
            df2[[src, dst, "count"]].rename(columns={src: "source", dst: "target"})
            for src, dst in col_pairs
        ]
    )

    # Stage 2: drop links whose source is the "?" placeholder.
    df = df[df["source"] != "?"]

    # Stage 3: sum the counts of duplicate source/target pairs so the Sankey
    # gets one link per pair.
    df = df.groupby(by=["source", "target"], as_index=False).sum()