pick a specific contour level in seaborn/matplotlib

I have a dataset with x and y variables. I plotted them in a regular plot:

Now I want to plot the contour level of the density probability contour level eg: 50% of the values are in this area. I tried with seaborn with the following code:

import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# define my x an y axes
np.random.seed(0)
x = np.random.randn(100)
y = np.random.randn(100)


# Create a joint plot with scatter plot and KDE contour lines
sns.jointplot(x = x, y = y, kind = "scatter", color = 'b')
sns.kdeplot(x = x, y = y, color = "r", levels = 5)
plt.ylim(0, 17.5)
plt.xlim(0, 20)

# Show the plot
plt.show()

and the result is:

But I would like to choose the contour level values. I searched a long time for a solution but didn't find any really… Is there a simple way of doing this ?

Solution

Not sure that contour labelling is directly accessible in seaborn. matplotlib has a clabel function that adds labels to contours. I've wrapped it in a function overlay_labelled_contours() that overlays contours onto your existing scatter plot's axis.

The data + code below shows how you can get labelled contours at different quantiles (similar to seaborn's kdeplot(levels=...).

Imports and data:

import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# define x and y axes
np.random.seed(0)
x = np.random.randn(1000)
y = x + np.random.randn(1000) - 0.5

Overlay contours:

# Create a joint plot with scatter plot and KDE contour lines
g = sns.jointplot(x=x, y=y, marker='.', kind="scatter", color='tab:brown')
overlay_labelled_contours(x, y, ax=g.figure.axes[0])

plt.gcf().set_size_inches(7, 4)
plt.gca().set(xlabel='x', ylabel='y')

Function that handles the contours:

from scipy.stats import gaussian_kde

def overlay_labelled_contours(
    x, y, ax, lims=None, quantile_levels=[0.1, 0.25, 0.5, 0.8],
    bw_method=0.5, cmap='copper', text_color=None
):
    # Fit KDE estimator on the data
    data_coords = np.column_stack([x, y])
    kde_estimator = gaussian_kde(
        dataset=data_coords.T,
        bw_method=bw_method
    )
    kde_data_scores = kde_estimator.evaluate(data_coords.T).T

    # Define a fine grid and get the map of kde scores
    grid_res = 200
    if not lims:
        lims = [x.min(), x.max(), y.min(), y.max()]
    x_grid, y_grid = np.meshgrid(
        np.linspace(lims[0], lims[1], num=grid_res),
        np.linspace(lims[2], lims[3], num=grid_res)
    )
    kde_grid_scores = kde_estimator.evaluate(
        np.column_stack([x_grid.ravel(), y_grid.ravel()]).T
    ).T
    
    # Convert the KDE density scores to a quantile score
    scores_sorted = np.sort(kde_data_scores)
    scores_cdf = [(kde_data_scores < thresh).mean() for thresh in scores_sorted]
    
    gridscores_as_cdf = np.interp(kde_grid_scores, scores_sorted, scores_cdf)
    one_minus_cdf = 1 - gridscores_as_cdf
    
    # add KDE quantile lines
    c = ax.contour(
        x_grid, y_grid, one_minus_cdf.reshape(x_grid.shape),
        cmap=cmap, levels=quantile_levels, linewidths=3
    )
    # label the contours
    if not text_color:
        text_color = 'black'
    
    ax.clabel(c, fontsize=12, colors=text_color)

How it works

The x and y data coordinates are first organised into a two-column matrix shaped n_samples x 2:

    data_coords = np.column_stack([x, y])

A KDE model is fitted on that data (it needs the data as 2 x n_samples, so we supply the transpose of the data matrix):

    kde_estimator = gaussian_kde(
        dataset=data_coords.T,
        bw_method=bw_method
    )

After fitting the KDE, we get the density estimates at the data points:

    kde_data_scores = kde_estimator.evaluate(data_coords.T).T

Since you are interested in quantiles, we map the estimated densities to data proportions. In other words, we learn a mapping that takes in a density values, and tells us what proportion of the data lies below that value. This allows us to convert density data to quantile data.

    # Convert the KDE density scores to a quantile score
    scores_sorted = np.sort(kde_data_scores)
    scores_cdf = [(kde_data_scores < thresh).mean() for thresh in scores_sorted]

We will be drawing the contours in a 2D space, so we define a 2D grid of coordinates:

    # Define a fine grid and get the map of kde scores
    grid_res = 200
    if not lims:
        lims = [x.min(), x.max(), y.min(), y.max()]
    x_grid, y_grid = np.meshgrid(
        np.linspace(lims[0], lims[1], num=grid_res),
        np.linspace(lims[2], lims[3], num=grid_res)
    )

For each point on the grid, use the fitted KDE model to estimate the density at that point:

    kde_grid_scores = kde_estimator.evaluate(
        np.column_stack([x_grid.ravel(), y_grid.ravel()]).T
    ).T

We then map those estimated densities over the entire 2D area to proportions of the data, since we want the contours to represent proportions of the data.

    gridscores_as_cdf = np.interp(kde_grid_scores, scores_sorted, scores_cdf)
    one_minus_cdf = 1 - gridscores_as_cdf

Finally, ax.contour is used to display and label contours at the user-defined quantile levels.

    # add KDE quantile lines
    c = ax.contour(
        x_grid, y_grid, one_minus_cdf.reshape(x_grid.shape),
        cmap=cmap, levels=quantile_levels, linewidths=3
    )
    # label the contours
    if not text_color:
        text_color = 'black'
    
    ax.clabel(c, fontsize=12, colors=text_color)

A fully-matplotlib approach, including histograms at the margins.

The data is plotted on a plt.scatter plot, with the labelled contours overlaid using overlay_labelled_contours(). Finally, histograms are added at the margins using make_axes_locatable().

#
#Scatter plot with contours and marginal histograms, using matplotlib
#
f, ax = plt.subplots(figsize=(8, 5))

#Scatter x and y data
ax.scatter(x, y, marker='.', s=5, color='tab:blue')

overlay_labelled_contours(x, y, ax)

#optional formatting
[ax.spines[spine].set_visible(False) for spine in ['right', 'top']]
ax.set(xlabel='x', ylabel='y')

# Add marginal histograms
from mpl_toolkits.axes_grid1 import make_axes_locatable

# New axes on top and right of "ax"
divider = make_axes_locatable(ax)
ax_histx = divider.append_axes('top', 0.8, pad=0.1, sharex=ax)
ax_histy = divider.append_axes('right', 0.8, pad=0.1, sharey=ax)
[axis.axis('off') for axis in [ax_histx, ax_histy]]

ax_histx.hist(x, bins=30)
ax_histy.hist(y, bins=30, orientation='horizontal')