Search code examples
pythonmatplotlibseaborncontour

pick a specific contour level in seaborn/matplotlib


I have a dataset with x and y variables. I plotted them in a regular plot:enter image description here

Now I want to plot the contour level of the density probability contour level eg: 50% of the values are in this area. I tried with seaborn with the following code:

import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# define my x an y axes
np.random.seed(0)
x = np.random.randn(100)
y = np.random.randn(100)


# Create a joint plot with scatter plot and KDE contour lines
sns.jointplot(x = x, y = y, kind = "scatter", color = 'b')
sns.kdeplot(x = x, y = y, color = "r", levels = 5)
plt.ylim(0, 17.5)
plt.xlim(0, 20)

# Show the plot
plt.show()

and the result is: enter image description here

But I would like to choose the contour level values. I searched a long time for a solution but didn't find any really… Is there a simple way of doing this ?


Solution

  • Not sure that contour labelling is directly accessible in seaborn. matplotlib has a clabel function that adds labels to contours. I've wrapped it in a function overlay_labelled_contours() that overlays contours onto your existing scatter plot's axis.

    The data + code below shows how you can get labelled contours at different quantiles (similar to seaborn's kdeplot(levels=...).

    enter image description here

    Imports and data:

    import seaborn as sns
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    
    # define x and y axes
    np.random.seed(0)
    x = np.random.randn(1000)
    y = x + np.random.randn(1000) - 0.5
    

    Overlay contours:

    # Create a joint plot with scatter plot and KDE contour lines
    g = sns.jointplot(x=x, y=y, marker='.', kind="scatter", color='tab:brown')
    overlay_labelled_contours(x, y, ax=g.figure.axes[0])
    
    plt.gcf().set_size_inches(7, 4)
    plt.gca().set(xlabel='x', ylabel='y')
    

    Function that handles the contours:

    from scipy.stats import gaussian_kde
    
    def overlay_labelled_contours(
        x, y, ax, lims=None, quantile_levels=[0.1, 0.25, 0.5, 0.8],
        bw_method=0.5, cmap='copper', text_color=None
    ):
        # Fit KDE estimator on the data
        data_coords = np.column_stack([x, y])
        kde_estimator = gaussian_kde(
            dataset=data_coords.T,
            bw_method=bw_method
        )
        kde_data_scores = kde_estimator.evaluate(data_coords.T).T
    
        # Define a fine grid and get the map of kde scores
        grid_res = 200
        if not lims:
            lims = [x.min(), x.max(), y.min(), y.max()]
        x_grid, y_grid = np.meshgrid(
            np.linspace(lims[0], lims[1], num=grid_res),
            np.linspace(lims[2], lims[3], num=grid_res)
        )
        kde_grid_scores = kde_estimator.evaluate(
            np.column_stack([x_grid.ravel(), y_grid.ravel()]).T
        ).T
        
        # Convert the KDE density scores to a quantile score
        scores_sorted = np.sort(kde_data_scores)
        scores_cdf = [(kde_data_scores < thresh).mean() for thresh in scores_sorted]
        
        gridscores_as_cdf = np.interp(kde_grid_scores, scores_sorted, scores_cdf)
        one_minus_cdf = 1 - gridscores_as_cdf
        
        # add KDE quantile lines
        c = ax.contour(
            x_grid, y_grid, one_minus_cdf.reshape(x_grid.shape),
            cmap=cmap, levels=quantile_levels, linewidths=3
        )
        # label the contours
        if not text_color:
            text_color = 'black'
        
        ax.clabel(c, fontsize=12, colors=text_color)
    

    How it works

    The x and y data coordinates are first organised into a two-column matrix shaped n_samples x 2:

        data_coords = np.column_stack([x, y])
    

    A KDE model is fitted on that data (it needs the data as 2 x n_samples, so we supply the transpose of the data matrix):

        kde_estimator = gaussian_kde(
            dataset=data_coords.T,
            bw_method=bw_method
        )
    

    After fitting the KDE, we get the density estimates at the data points:

        kde_data_scores = kde_estimator.evaluate(data_coords.T).T
    

    Since you are interested in quantiles, we map the estimated densities to data proportions. In other words, we learn a mapping that takes in a density values, and tells us what proportion of the data lies below that value. This allows us to convert density data to quantile data.

        # Convert the KDE density scores to a quantile score
        scores_sorted = np.sort(kde_data_scores)
        scores_cdf = [(kde_data_scores < thresh).mean() for thresh in scores_sorted]
    

    We will be drawing the contours in a 2D space, so we define a 2D grid of coordinates:

        # Define a fine grid and get the map of kde scores
        grid_res = 200
        if not lims:
            lims = [x.min(), x.max(), y.min(), y.max()]
        x_grid, y_grid = np.meshgrid(
            np.linspace(lims[0], lims[1], num=grid_res),
            np.linspace(lims[2], lims[3], num=grid_res)
        )
    

    For each point on the grid, use the fitted KDE model to estimate the density at that point:

        kde_grid_scores = kde_estimator.evaluate(
            np.column_stack([x_grid.ravel(), y_grid.ravel()]).T
        ).T
    

    We then map those estimated densities over the entire 2D area to proportions of the data, since we want the contours to represent proportions of the data.

        gridscores_as_cdf = np.interp(kde_grid_scores, scores_sorted, scores_cdf)
        one_minus_cdf = 1 - gridscores_as_cdf
    

    Finally, ax.contour is used to display and label contours at the user-defined quantile levels.

        # add KDE quantile lines
        c = ax.contour(
            x_grid, y_grid, one_minus_cdf.reshape(x_grid.shape),
            cmap=cmap, levels=quantile_levels, linewidths=3
        )
        # label the contours
        if not text_color:
            text_color = 'black'
        
        ax.clabel(c, fontsize=12, colors=text_color)
    



    A fully-matplotlib approach, including histograms at the margins.

    The data is plotted on a plt.scatter plot, with the labelled contours overlaid using overlay_labelled_contours(). Finally, histograms are added at the margins using make_axes_locatable().

    enter image description here

    #
    #Scatter plot with contours and marginal histograms, using matplotlib
    #
    f, ax = plt.subplots(figsize=(8, 5))
    
    #Scatter x and y data
    ax.scatter(x, y, marker='.', s=5, color='tab:blue')
    
    overlay_labelled_contours(x, y, ax)
    
    #optional formatting
    [ax.spines[spine].set_visible(False) for spine in ['right', 'top']]
    ax.set(xlabel='x', ylabel='y')
    
    # Add marginal histograms
    from mpl_toolkits.axes_grid1 import make_axes_locatable
    
    # New axes on top and right of "ax"
    divider = make_axes_locatable(ax)
    ax_histx = divider.append_axes('top', 0.8, pad=0.1, sharex=ax)
    ax_histy = divider.append_axes('right', 0.8, pad=0.1, sharey=ax)
    [axis.axis('off') for axis in [ax_histx, ax_histy]]
    
    ax_histx.hist(x, bins=30)
    ax_histy.hist(y, bins=30, orientation='horizontal')