Search code examples
python · pandas · matplotlib · seaborn · grouped-bar-chart

How to include both percent and N as bar labels in grouped bar chart


I recently asked how to include both % and N as bar labels and received assistance (see "Include both % and N as bar labels"). I am now trying to apply that example to a bar plot that is split over a variable (gender), as per the example below:

# Reproduction script from the question: 50 respondents with binary
# baseline/endline indicators (endline contains missing values) and a gender
# column. The imports are not shown in this snippet; it presumably relies on
# numpy as np, pandas as pd, seaborn as sns, matplotlib.pyplot as plt and
# matplotlib.ticker.PercentFormatter -- TODO confirm.
data = {
'id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50],
'baseline': [1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1],
'endline': [1, 0, np.nan, 1, 0, 0, 1, np.nan, 1, 0, 0, 1, 0, 0, 1, 0, np.nan, np.nan, 1, 0, 1, np.nan, 0, 1, 0, 1, 0, np.nan, 1, 0, np.nan, 0, 0, 0, np.nan, 1, np.nan, 1, np.nan, 0, np.nan, 1, 1, 0, 1, 1, 1, 0, 1, 1],
'gender': ['male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female']

}

df = pd.DataFrame(data)

sns.set_style('white')
# Grouped bar chart: the data is melted to long form so that 'variable'
# (baseline/endline) is on the x axis and 'gender' is the hue. The custom
# estimator turns each group of 0/1 values into a percentage
# (sum / size * 100).
ax = sns.barplot(data = df.melt(id_vars = ['id', 'gender'], value_vars = ['baseline', 'endline']),
                 x = 'variable', y = 'value',
                 estimator=lambda x: np.sum(x) / np.size(x) * 100, ci=None,
                 color='cornflowerblue', hue = 'gender')

# Number of non-missing values for every (gender, variable) pair, flattened
# into a single 1-D array (count() ignores NaN).
N = df.melt(id_vars = ['id', 'gender'], value_vars = ['baseline', 'endline']).groupby(['gender', 'variable'], sort=False)['value'].count().to_numpy()
# Mathtext for an italic "N".
# NOTE(review): '\i' is not a recognised escape sequence in a normal string
# literal; a raw string (r'...') would avoid the invalid-escape warning.
N_it = '$\it{N}$'
# NOTE: the percentages are read from the first hue container only
# (ax.containers[0]), and zip() stops at the shorter input, so this builds
# just one label per bar of that first container.
labels=[f'{np.round(perc,1)}% ({N_it} = {n})' 
        for perc, n in zip(ax.containers[0].datavalues, N)]

# The same label list is attached to both hue containers.
ax.bar_label(ax.containers[0], labels = labels, fontsize = 10)
ax.bar_label(ax.containers[1], labels = labels, fontsize = 10)

# Cosmetic clean-up: drop the left spine, show horizontal grid lines, format
# the y axis as percentages (100 == 100%) and remove both axis titles.
sns.despine(ax = ax, left = True)
ax.grid(True, axis = 'y')
ax.yaxis.set_major_formatter(PercentFormatter(100))
ax.set_xlabel('')
ax.set_ylabel('')
plt.tight_layout()
plt.show()

but it seems I am missing something in getting the right results.


Solution

  • Your labeling code assumes a regular bar chart with only one container ax.containers[0], but this bar plot has a hue, which makes it a grouped bar plot with multiple ax.containers. That means we need to recompute N and labels to account for the groupings:

    1. Recompute N as a groupby.count matrix where the variables are sorted like the plot's bars (i.e., male before female, baseline before endline)
    2. Generate labels with a double comprehension to have one level per container
    3. Label each container with the corresponding (zipped) labels
    # Count the non-missing answers per gender (rows) and survey round
    # (columns). sort=False keeps the genders in order of first appearance in
    # df, which is the order seaborn uses for the hue containers when no
    # hue_order is given. Relying on an alphabetical sort would only match the
    # plot by coincidence of the category names.
    N = df.groupby('gender', sort=False)[['baseline', 'endline']].count()
    #         baseline  endline
    # gender                   
    # male          21       17
    # female        29       22
    
    # One list of labels per hue container (one row of N per container); within
    # a container the bars follow the x order baseline, endline, matching the
    # columns selected above.
    labels = [
        [f'{pct:.1f}% $(N={count})$' for pct, count in zip(container.datavalues, counts)]
        for container, counts in zip(ax.containers, N.to_numpy())
    ]
    # [['38.1% $(N=21)$', '47.1% $(N=17)$'],
    #  ['48.3% $(N=29)$', '54.5% $(N=22)$']]
    
    # Attach each container's own labels to its bars.
    for container, container_labels in zip(ax.containers, labels):
        ax.bar_label(container, labels=container_labels, fontsize=10)
    

    corrected figure