python pandas numpy scipy python-itertools

Analyzing dataframes contained in Python's For Loop

Current Situation:

I have a function that splits the data on a binary class target variable into the "1" and "0" classes and then reads every independent variable for each class. For each independent variable, the function estimates a KDE per class ("1" and "0") and then calculates the area of intersection of the two KDEs:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

def intersection_area(data, bandwidth, margin, target_variable_name):
    """Print the KDE overlap area between class 0 and class 1 for each feature.

    Rows containing NaN are dropped first. For every column other than the
    target, a Gaussian KDE is fitted to the class-0 rows and to the class-1
    rows, both are evaluated on a shared 500-point grid, and the area under
    their pointwise minimum is printed (one line per feature, in column
    order).

    Parameters
    ----------
    data : pandas.DataFrame
        Features plus the target column; the target is compared against the
        values 0 and 1.
    bandwidth
        Passed unchanged to ``gaussian_kde`` as ``bw_method``.
    margin : float
        Fraction of the data range added on each side of the grid, because
        the KDE extends beyond the observed values.
    target_variable_name
        Column name of the response variable.
    """
    target = str(target_variable_name)
    data = data.dropna()
    # The target is already excluded here, so every remaining column is a
    # feature; slicing off the last name would silently skip one of them.
    feature_names = list(data.drop(columns=[target]).columns)
    # NumPy 2.0 introduced np.trapezoid and deprecated np.trapz.
    trapezoid = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

    for column_name in feature_names:
        x0 = data.loc[data[target] == 0, column_name]
        x1 = data.loc[data[target] == 1, column_name]

        kde0 = gaussian_kde(x0, bw_method=bandwidth)
        kde1 = gaussian_kde(x1, bw_method=bandwidth)

        # The grid must span the data of both classes, so take the smallest
        # minimum and the largest maximum before widening by the margin.
        x_min = min(x0.min(), x1.min())
        x_max = max(x0.max(), x1.max())
        dx = margin * (x_max - x_min)
        x = np.linspace(x_min - dx, x_max + dx, 500)

        inters_x = np.minimum(kde0(x), kde1(x))
        area_inters_x = trapezoid(inters_x, x)  # area shared by the two KDEs
        print(area_inters_x)

Problem: if I have n_class = 4 the function will look like:

def intersection_area(data, bandwidth, margin, target_variable_name):
    """Print the KDE overlap area shared by classes 0, 1, 2 and 3 per feature.

    Hard-coded four-class variant. Rows containing NaN are dropped first.
    For every column other than the target, one Gaussian KDE per class is
    fitted, all four are evaluated on a shared 500-point grid, and the area
    under their pointwise minimum is printed (one line per feature, in
    column order).

    Parameters
    ----------
    data : pandas.DataFrame
        Features plus the target column; the target is compared against the
        values 0, 1, 2 and 3.
    bandwidth
        Passed unchanged to ``gaussian_kde`` as ``bw_method``.
    margin : float
        Fraction of the data range added on each side of the grid, because
        the KDE extends beyond the observed values.
    target_variable_name
        Column name of the response variable.
    """
    target = str(target_variable_name)
    data = data.dropna()
    # The target is already excluded here, so every remaining column is a
    # feature; slicing off the last name would silently skip one of them.
    feature_names = list(data.drop(columns=[target]).columns)
    # NumPy 2.0 introduced np.trapezoid and deprecated np.trapz.
    trapezoid = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

    for column_name in feature_names:
        x0 = data.loc[data[target] == 0, column_name]
        x1 = data.loc[data[target] == 1, column_name]
        x2 = data.loc[data[target] == 2, column_name]
        x3 = data.loc[data[target] == 3, column_name]

        kde0 = gaussian_kde(x0, bw_method=bandwidth)
        kde1 = gaussian_kde(x1, bw_method=bandwidth)
        kde2 = gaussian_kde(x2, bw_method=bandwidth)
        kde3 = gaussian_kde(x3, bw_method=bandwidth)

        # The grid must span the data of all four classes.
        x_min = min(x0.min(), x1.min(), x2.min(), x3.min())
        x_max = max(x0.max(), x1.max(), x2.max(), x3.max())
        dx = margin * (x_max - x_min)
        x = np.linspace(x_min - dx, x_max + dx, 500)

        # Each class is evaluated with its own estimator.
        kde0_x = kde0(x)
        kde1_x = kde1(x)
        kde2_x = kde2(x)
        kde3_x = kde3(x)

        # np.minimum is a binary ufunc; reduce folds it over all four curves.
        inters_x = np.minimum.reduce([kde0_x, kde1_x, kde2_x, kde3_x])
        area_inters_x = trapezoid(inters_x, x)
        print(area_inters_x)

Now what if I have an unknown dataset with n classes? I am trying to improve my old code so that it is robust to multiclass datasets: for each class it should determine the KDE of each independent variable, and then calculate the area of intersection. However, I am stuck on the x = data.loc[data[str(target_name)] == i,str(column_name)] part:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

def intersection_area(data, bandwidth, margin, target_variable_name):
    """Print each (class, feature) pair and select that class's observations.

    Unfinished n-class draft: it drops rows containing NaN, discovers the
    distinct values of the target column, and for every class/feature
    combination prints the class and the feature name and then selects the
    matching observations. ``bandwidth`` and ``margin`` are accepted but not
    used yet, and nothing is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Features plus the target column.
    bandwidth
        Unused in this draft.
    margin
        Unused in this draft.
    target_variable_name
        Column name of the response variable.
    """
    target = str(target_variable_name)
    # Collect the names of the independent variables
    data = data.dropna()
    # The target is already excluded here, so every remaining column is a
    # feature; slicing off the last name would silently skip one of them.
    names = list(data.drop(columns=[target]).columns)

    # Distinct classes, looked up by the column *named by the argument*
    # (attribute access would look for a column literally called
    # "target_variable_name").
    classes = list(data[target].unique())

    # for each unique class, run through the different independent variables
    for i in classes:
        for column_name in names:
            print(i)  # to show the class (target variable: 0,1,...,n)
            print(column_name)  # to show the variable name to be analyzed
            # Observations of this feature for class i; the draft stops here,
            # so the selection is not consumed yet.
            x = data.loc[data[target] == i, column_name]

Simulated datasets for anyone interested in replicating the problem:

from sklearn.datasets import make_classification
# Note: for a binary target set n_classes=2.
# NOTE(review): `weights` has three entries for four classes; presumably
# scikit-learn infers the last class weight, and the list would need
# shortening for n_classes=2 -- TODO confirm against the make_classification docs.
X, y = make_classification(
    n_samples=50000,
    n_features=6,
    n_informative=6,
    n_redundant=0,
    n_repeated=0,
    n_classes=4,
    n_clusters_per_class=3,
    class_sep=0.95,
    flip_y=0.2,
    weights=[0.7, 0.2, 0.1],
    shuffle=True,
    random_state=93,
)

# One column per generated feature, named var1 .. var6 in order.
dataset_x = pd.DataFrame(
    {'var{}'.format(k + 1): X[:, k] for k in range(6)}
)

# The class labels become the single 'target' column.
dataset_y = pd.DataFrame({'target': y})

# Features first, target last.
sample_dataset = pd.concat([dataset_x, dataset_y], axis=1)
print(sample_dataset)

Solution

Consider building lists of the x's and KDEs with list comprehensions, with one entry per class of the target variable. And instead of printing the result in each iteration, collect the results into a data frame:

def intersection_area_new(data, bandwidth, margin, target_variable_name):
    """Return, per feature, the area shared by the KDEs of all target classes.

    Rows containing NaN are dropped first. For every column other than the
    target, one Gaussian KDE is fitted per distinct target value, all of
    them are evaluated on a shared 500-point grid, and the area under their
    pointwise minimum is integrated with the trapezoidal rule.

    Parameters
    ----------
    data : pandas.DataFrame
        Features plus the target column (in any position).
    bandwidth
        Passed unchanged to ``gaussian_kde`` as ``bw_method``.
    margin : float
        Fraction of the data range added on each side of the grid, because
        the KDE extends beyond the observed values.
    target_variable_name
        Column name of the response variable.

    Returns
    -------
    pandas.DataFrame
        One row per feature with the columns ``target`` (the target column
        name), ``column`` (the feature name) and ``intersection`` (the area).
    """
    data = data.dropna()

    # Distinct classes of whichever column the caller named as the target.
    classes = data[target_variable_name].unique()

    # Every column except the target is a feature, wherever the target sits.
    feature_names = [c for c in data.columns if c != target_variable_name]

    # NumPy 2.0 introduced np.trapezoid and deprecated np.trapz.
    trapezoid = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

    kde_dicts = []
    for column_name in feature_names:
        # Per-class observations and their density estimators.
        x_s = [
            data.loc[data[target_variable_name] == i, column_name]
            for i in classes
        ]
        kde_s = [gaussian_kde(x, bw_method=bandwidth) for x in x_s]

        # The grid must span the data of every class: smallest minimum to
        # largest maximum, widened by the margin on both sides.
        x_min = min(x.min() for x in x_s)
        x_max = max(x.max() for x in x_s)
        dx = margin * (x_max - x_min)
        x_array = np.linspace(x_min - dx, x_max + dx, 500)

        kde_x_s = [kde(x_array) for kde in kde_s]

        # Pointwise minimum across all class densities, then its area.
        inters_x = np.array(kde_x_s).min(axis=0)
        area_inters_x = trapezoid(inters_x, x_array)

        kde_dicts.append({'target': target_variable_name,
                          'column': column_name,
                          'intersection': area_inters_x})

    return pd.DataFrame(kde_dicts)

Output

output = intersection_area_new(sample_dataset, None, 0.5, "target")
print(output.head(10))

#    target column  intersection
# 0  target   var1      0.842256
# 1  target   var2      0.757190
# 2  target   var3      0.676021
# 3  target   var4      0.873074
# 4  target   var5      0.763626
# 5  target   var6      0.868560