pandas dataframe matrix pairwise pairwise-distance

split-half reliability using Jensen-Shannon Divergence

I have two pandas dataframes in which each row is a person and their response data in the form of a list:

df_1 = pd.DataFrame({'ID': ['a', 'b', 'c', 'd', 'e', 'f'], 'response': [["apple", "berry", "cherry"],
             ["pear", "pineapple", "plum"],
             ["blue_berry"],
             ["orange", "lemon"],
             ["tomato", "pumpkin"],
             ["avocado", "strawberry"]], 'group': [1, 2, 1, 2, 1, 2]})

df_2 = pd.DataFrame({'ID': ['A', 'B','C', 'D', 'E', 'F'], 'response': [["pear", "plum", "cherry"],
                 ["orange", "lemon", "lime", "pineapple"],
                 ["pumpkin"],
                 ["tomato", "strawberry"],
                 ["avocado", "apple"],
                 ["berry", "cherry", "apple"]], 'group': [1, 2, 1, 2, 1, 2]})

I am trying to construct a matrix in which each column and row indices is an ID and group, but where each cell of the matrix is the pair-wise Jensen-Shannon Divergence score calculated from response. My eventual goal is to visualize this as a heatmap to assess reliability between people's responses, but first am struggling to put my data into the correct matrix form.

I am not sure how to convert these dataframes into squareform and then calculate the JSD using the below function:

def jsdiv(P, Q):
    """Compute the Jensen-Shannon divergence between two probability distributions.

    Input
    -----
    P, Q : array-like
        Probability distributions of equal length that sum to 1
    """

    def _kldiv(A, B):
        return np.sum([v for v in A * np.log2(A/B) if not np.isnan(v)])

    P = np.array(P)
    Q = np.array(Q)

    M = 0.5 * (P + Q)

    return 0.5 * (_kldiv(P, M) +_kldiv(Q, M))

Solution

First of all, you need to combine the two dataframes that you have. I suggest the following approaach

import pandas as pd
import numpy as np
from scipy.spatial.distance import jensenshannon
from itertools import combinations

df_1 = pd.DataFrame({'ID': ['a', 'b', 'c', 'd', 'e', 'f'], 
                     'response': [["apple", "berry", "cherry"],
                                  ["pear", "pineapple", "plum"],
                                  ["blue_berry"],
                                  ["orange", "lemon"],
                                  ["tomato", "pumpkin"],
                                  ["avocado", "strawberry"]], 
                     'group': [1, 2, 1, 2, 1, 2]})

df_2 = pd.DataFrame({'ID': ['A', 'B', 'C', 'D', 'E', 'F'], 
                     'response': [["pear", "plum", "cherry"],
                                  ["orange", "lemon", "lime", "pineapple"],
                                  ["pumpkin"],
                                  ["tomato", "strawberry"],
                                  ["avocado", "apple"],
                                  ["berry", "cherry", "apple"]], 
                     'group': [1, 2, 1, 2, 1, 2]})

df_combined = pd.concat([df_1, df_2], axis=0).reset_index(drop=True)

all_fruits = set([fruit for sublist in pd.concat([df_1['response'], df_2['response']]).tolist() for fruit in sublist])

def response_to_prob_dist(response, all_fruits):
    fruit_count = {fruit: response.count(fruit) / len(response) for fruit in all_fruits}
    return [fruit_count[fruit] if fruit in response else 0 for fruit in all_fruits]

df_combined['prob_dist'] = df_combined['response'].apply(lambda x: response_to_prob_dist(x, all_fruits))

which gives you a data frame of the type:

  ID                 response  group  \
0  a   [apple, berry, cherry]      1   
1  b  [pear, pineapple, plum]      2   
2  c             [blue_berry]      1   
3  d          [orange, lemon]      2   
4  e        [tomato, pumpkin]      1   

                                           prob_dist  
0  [0, 0, 0, 0, 0.3333333333333333, 0, 0, 0, 0, 0...  
1  [0, 0, 0, 0.3333333333333333, 0, 0, 0, 0, 0.33...  
2       [0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  
3     [0, 0.5, 0, 0, 0, 0.5, 0, 0, 0, 0, 0, 0, 0, 0]  
4     [0.5, 0, 0, 0, 0, 0, 0.5, 0, 0, 0, 0, 0, 0, 0]

You can the apply your function,

def jsdiv(P, Q):
    """Compute the Jensen-Shannon divergence between two probability distributions.

    Input
    -----
    P, Q : array-like
        Probability distributions of equal length that sum to 1
    """

    def _kldiv(A, B):
        return np.sum([v for v in A * np.log2(A/B) if not np.isnan(v)])

    P = np.array(P)
    Q = np.array(Q)

    M = 0.5 * (P + Q)

    return 0.5 * (_kldiv(P, M) +_kldiv(Q, M))

n = len(df_combined)
jsd_matrix = np.zeros((n, n))  

for i in range(n):
    for j in range(i+1, n):  
        jsd_matrix[i, j] = jsdiv(df_combined['prob_dist'].iloc[i], df_combined['prob_dist'].iloc[j])
        jsd_matrix[j, i] = jsd_matrix[i, j]  

jsd_df = pd.DataFrame(jsd_matrix, index=df_combined['ID'], columns=df_combined['ID'])

jsd_df.head()

which will give you

ID    a    b    c    d    e    f         A         B         C    D         E  \
ID                                                                              
a   0.0  1.0  1.0  1.0  1.0  1.0  0.666667  1.000000  1.000000  1.0  0.595437   
b   1.0  0.0  1.0  1.0  1.0  1.0  0.333333  0.712642  1.000000  1.0  1.000000   
c   1.0  1.0  0.0  1.0  1.0  1.0  1.000000  1.000000  1.000000  1.0  1.000000   
d   1.0  1.0  1.0  0.0  1.0  1.0  1.000000  0.311278  1.000000  1.0  1.000000   
e   1.0  1.0  1.0  1.0  0.0  1.0  1.000000  1.000000  0.311278  0.5  1.000000   

ID    F  
ID       
a   0.0  
b   1.0  
c   1.0  
d   1.0  
e   1.0

However, I do not understand your function. You do know that you could be using

def jsdiv(P, Q):
    
    return jensenshannon(P, Q, base=2)**2  

n = len(df_combined)
jsd_matrix = np.zeros((n, n))  

for i in range(n):
    for j in range(i+1, n):  
        jsd_matrix[i, j] = jsdiv(df_combined['prob_dist'].iloc[i], df_combined['prob_dist'].iloc[j])
        jsd_matrix[j, i] = jsd_matrix[i, j]  

jsd_df = pd.DataFrame(jsd_matrix, index=df_combined['ID'], columns=df_combined['ID'])

print(jsd_df.head())

directly, right?

This will give you

ID    a    b    c    d    e    f         A         B         C    D         E  \
ID                                                                              
a   0.0  1.0  1.0  1.0  1.0  1.0  0.666667  1.000000  1.000000  1.0  0.595437   
b   1.0  0.0  1.0  1.0  1.0  1.0  0.333333  0.712642  1.000000  1.0  1.000000   
c   1.0  1.0  0.0  1.0  1.0  1.0  1.000000  1.000000  1.000000  1.0  1.000000   
d   1.0  1.0  1.0  0.0  1.0  1.0  1.000000  0.311278  1.000000  1.0  1.000000   
e   1.0  1.0  1.0  1.0  0.0  1.0  1.000000  1.000000  0.311278  0.5  1.000000   

ID    F  
ID       
a   0.0  
b   1.0  
c   1.0  
d   1.0  
e   1.0

Your definition of jsdiv is overcomplicating things.