Search code examples
python pandas seaborn heatmap correlation

Masking correlation matrix based on p-values and correlation


Based on this answer I have the following code to draw a correlation matrix which only plots data where p<0.05:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Simulate 3 correlated variables by sampling a multivariate normal.
num_samples = 100
mu = np.array([5.0, 0.0, 10.0])
# Covariance matrix the three correlated variables are drawn with.
r = np.array([
    [3.40, -2.75, -2.00],
    [-2.75, 5.50, 1.50],
    [-2.00, 1.50, 1.25],
])
y = np.random.multivariate_normal(mu, r, size=num_samples)
df = pd.DataFrame(y, columns=["Correlated1", "Correlated2", "Correlated3"])

# Append two columns of independently drawn random integers.
for i in (0, 1):
    df.loc[:, f"Uncorrelated{i}"] = np.random.randint(-2000, 2000, len(df))

def corr_sig(df=None):
    """Return the matrix of pairwise Pearson p-values for the columns of *df*.

    Args:
        df: DataFrame whose columns are the variables to compare. The
            ``None`` default is kept only for backward compatibility; a
            DataFrame must be supplied because ``df.shape`` is read.

    Returns:
        A square ``numpy.ndarray`` with one row and one column per column of
        *df*, in column order. Entry ``[i, j]`` is the p-value reported by
        ``scipy.stats.pearsonr`` for columns ``i`` and ``j``. The diagonal is
        never computed and stays at 0.
    """
    n_cols = df.shape[1]
    p_matrix = np.zeros(shape=(n_cols, n_cols))
    # Address columns by position: this avoids rebuilding and searching the
    # label list for every pair, and keeps working with duplicate labels.
    series = [df.iloc[:, pos] for pos in range(n_cols)]
    for i, left in enumerate(series):
        # The p-value does not depend on argument order, so each unordered
        # pair is tested once and written to both mirrored cells.
        for j, right in enumerate(series[i + 1:], start=i + 1):
            _, p = stats.pearsonr(left, right)
            p_matrix[i, j] = p
            p_matrix[j, i] = p
    return p_matrix

# p-value for every pair of columns.
p_values = corr_sig(df)
# True marks a cell to hide: everything except significant (p < 0.05)
# entries on or below the main diagonal.
mask = ~np.tril(p_values < 0.05)

def plot_cor_matrix(corr, mask=None):
    """Draw *corr* as an annotated seaborn heatmap on a new 11x9 figure.

    Args:
        corr: Matrix of values to plot; handed to ``sns.heatmap`` unchanged.
        mask: Optional mask forwarded to ``sns.heatmap``.

    Returns:
        None. The figure is created but not shown.
    """
    _, axes = plt.subplots(figsize=(11, 9))
    # Cosmetic options: print the value in each cell, diverging colormap.
    cosmetics = dict(annot=True, cmap='coolwarm')
    sns.heatmap(corr, ax=axes, mask=mask, **cosmetics)

# Plot the correlation matrix, hiding non-significant cells.
corr = df.corr()         # pairwise correlation of the columns
p_values = corr_sig(df)  # p-value for every pair of columns
# True marks a cell to hide: everything except significant (p < 0.05)
# entries on or below the main diagonal. The diagonal stays visible because
# corr_sig leaves its entries at 0.
mask = ~np.tril(p_values < 0.05)
plot_cor_matrix(corr, mask)

enter image description here

How can I also filter out the correlations on the diagonal, where features are compared to themselves (i.e. correlations of 1)?


Solution

  • The tril function can take k as kwarg. According to the doc:

    Diagonal above which to zero elements. k = 0 (the default) is the main diagonal, k < 0 is below it and k > 0 is above.

    In your case you'll want k=-1:

    import pandas as pd
    import numpy as np
    import seaborn as sns
    import matplotlib.pyplot as plt
    from scipy import stats
    
    # Fix the seed so the simulated data is reproducible.
    np.random.seed(1)
    # Simulate 3 correlated variables by sampling a multivariate normal.
    num_samples = 100
    mu = np.array([5.0, 0.0, 10.0])
    # Covariance matrix the three correlated variables are drawn with.
    r = np.array([
        [3.40, -2.75, -2.00],
        [-2.75, 5.50, 1.50],
        [-2.00, 1.50, 1.25],
    ])
    y = np.random.multivariate_normal(mu, r, size=num_samples)
    df = pd.DataFrame(y, columns=["Correlated1", "Correlated2", "Correlated3"])

    # Append two columns of independently drawn random integers.
    for i in (0, 1):
        df.loc[:, f"Uncorrelated{i}"] = np.random.randint(-2000, 2000, len(df))
    
    def corr_sig(df=None):
        """Return the matrix of pairwise Pearson p-values for the columns of *df*.

        Args:
            df: DataFrame whose columns are the variables to compare. The
                ``None`` default is kept only for backward compatibility; a
                DataFrame must be supplied because ``df.shape`` is read.

        Returns:
            A square ``numpy.ndarray`` with one row and one column per column
            of *df*, in column order. Entry ``[i, j]`` is the p-value reported
            by ``scipy.stats.pearsonr`` for columns ``i`` and ``j``. The
            diagonal is never computed and stays at 0.
        """
        n_cols = df.shape[1]
        p_matrix = np.zeros(shape=(n_cols, n_cols))
        # Address columns by position: this avoids rebuilding and searching
        # the label list for every pair, and keeps working with duplicate
        # labels.
        series = [df.iloc[:, pos] for pos in range(n_cols)]
        for i, left in enumerate(series):
            # The p-value does not depend on argument order, so each
            # unordered pair is tested once and written to both mirrored
            # cells.
            for j, right in enumerate(series[i + 1:], start=i + 1):
                _, p = stats.pearsonr(left, right)
                p_matrix[i, j] = p
                p_matrix[j, i] = p
        return p_matrix
    
    def plot_cor_matrix(corr, mask=None):
        """Draw *corr* as an annotated seaborn heatmap on a new 11x9 figure.

        Args:
            corr: Matrix of values to plot; handed to ``sns.heatmap`` unchanged.
            mask: Optional mask forwarded to ``sns.heatmap``.

        Returns:
            None. The figure is created but not shown.
        """
        _, axes = plt.subplots(figsize=(11, 9))
        # Cosmetic options: print the value in each cell, diverging colormap.
        cosmetics = dict(annot=True, cmap='coolwarm')
        sns.heatmap(corr, ax=axes, mask=mask, **cosmetics)
    
    # Plot the correlation matrix, hiding non-significant cells.
    corr = df.corr()         # pairwise correlation of the columns
    p_values = corr_sig(df)  # p-value for every pair of columns
    # True marks a cell to hide. k=-1 keeps only entries strictly below the
    # main diagonal, so the self-correlations on the diagonal are hidden too.
    mask = ~np.tril(p_values < 0.05, k=-1)
    plot_cor_matrix(corr, mask)
    plt.show()
    

    Output: enter image description here