Search code examples
pythonpandasnumpyscipypython-itertools

Analyzing dataframes contained in Python's For Loop


Current Situation:

I have a function that splits the data by a binary class target variable into the "1" and "0" classes and then reads every independent variable for each class. For each independent variable, the function fits a KDE per class ("1" and "0") and then calculates the area of intersection of the two KDEs:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

def intersection_area(data, bandwidth, margin, target_variable_name):
    """Print the KDE intersection area of every feature for a binary target.

    For each independent variable, a Gaussian KDE is fitted separately to the
    rows of class 0 and class 1. The area under the pointwise minimum of the
    two density curves is then integrated numerically and printed.

    Args:
        data: DataFrame holding the feature columns and the target column.
            Rows containing missing values are dropped first.
        bandwidth: Forwarded to ``gaussian_kde`` as ``bw_method``.
        margin: Fraction of the data range added on each side of the
            evaluation grid, since the KDE is wider than the data.
        target_variable_name: Column name of the response variable, whose
            classes are expected to be coded as 0 and 1.

    Returns:
        None. One area per feature is printed, in column order.
    """
    data = data.dropna()
    target = data[target_variable_name]
    # The target is already removed here, so every remaining column is a
    # feature; slicing off the last one would silently skip a variable.
    names = list(data.drop(columns=[target_variable_name]).columns)

    # NumPy 2.0 renamed ``trapz`` to ``trapezoid``; use whichever is available.
    integrate = getattr(np, "trapezoid", None) or np.trapz

    for column_name in names:
        x0 = data.loc[target == 0, column_name]
        x1 = data.loc[target == 1, column_name]

        kde0 = gaussian_kde(x0, bw_method=bandwidth)
        kde1 = gaussian_kde(x1, bw_method=bandwidth)

        # Span the full range of both classes (lowest minimum to highest
        # maximum) so that no part of the overlap falls outside the grid.
        x_min = min(x0.min(), x1.min())
        x_max = max(x0.max(), x1.max())
        dx = margin * (x_max - x_min)  # the KDE is wider than the data

        x = np.linspace(x_min - dx, x_max + dx, 500)
        inters_x = np.minimum(kde0(x), kde1(x))
        area_inters_x = integrate(inters_x, x)  # intersection of two KDEs
        print(area_inters_x)

Problem: if I have n_class = 4 the function will look like:

def intersection_area(data, bandwidth, margin, target_variable_name):
    """Print the KDE intersection area of every feature for a 4-class target.

    For each independent variable, a Gaussian KDE is fitted separately to the
    rows of classes 0, 1, 2 and 3. The area under the pointwise minimum of the
    four density curves is then integrated numerically and printed.

    Args:
        data: DataFrame holding the feature columns and the target column.
            Rows containing missing values are dropped first.
        bandwidth: Forwarded to ``gaussian_kde`` as ``bw_method``.
        margin: Fraction of the data range added on each side of the
            evaluation grid, since the KDE is wider than the data.
        target_variable_name: Column name of the response variable, whose
            classes are expected to be coded as 0, 1, 2 and 3.

    Returns:
        None. One area per feature is printed, in column order.
    """
    data = data.dropna()
    target = data[target_variable_name]
    # The target is already removed here, so every remaining column is a
    # feature; slicing off the last one would silently skip a variable.
    names = list(data.drop(columns=[target_variable_name]).columns)

    # NumPy 2.0 renamed ``trapz`` to ``trapezoid``; use whichever is available.
    integrate = getattr(np, "trapezoid", None) or np.trapz

    for column_name in names:
        x0 = data.loc[target == 0, column_name]
        x1 = data.loc[target == 1, column_name]
        x2 = data.loc[target == 2, column_name]
        x3 = data.loc[target == 3, column_name]

        kde0 = gaussian_kde(x0, bw_method=bandwidth)
        kde1 = gaussian_kde(x1, bw_method=bandwidth)
        kde2 = gaussian_kde(x2, bw_method=bandwidth)
        kde3 = gaussian_kde(x3, bw_method=bandwidth)

        # Span the full range of all classes (lowest minimum to highest
        # maximum) so that no part of the overlap falls outside the grid.
        x_min = min(x0.min(), x1.min(), x2.min(), x3.min())
        x_max = max(x0.max(), x1.max(), x2.max(), x3.max())

        dx = margin * (x_max - x_min)  # the KDE is wider than the data

        x = np.linspace(x_min - dx, x_max + dx, 500)
        kde0_x = kde0(x)
        kde1_x = kde1(x)
        kde2_x = kde2(x)
        kde3_x = kde3(x)
        # np.minimum is a binary ufunc; reduce it to take the pointwise
        # minimum across all four curves.
        inters_x = np.minimum.reduce([kde0_x, kde1_x, kde2_x, kde3_x])
        area_inters_x = integrate(inters_x, x)
        print(area_inters_x)

Now what if I have an unknown dataset with n classes? I am trying to improve my old code so that it is robust to multiclass datasets: it should determine the KDE of each independent variable given the class and calculate the area of intersection. However, I am stuck on the x = data.loc[data[str(target_name)] == i,str(column_name)] part:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

def intersection_area(data, bandwidth, margin, target_variable_name):
    """Walk every (class, feature) pair of a multi-class dataset.

    Work in progress: for each class of the target and each independent
    variable, the class label and column name are printed and the matching
    values are selected. The KDE and intersection-area computation is not
    implemented here, so ``bandwidth`` and ``margin`` are not used yet.

    Args:
        data: DataFrame holding the feature columns and the target column.
            Rows containing missing values are dropped first.
        bandwidth: Currently unused.
        margin: Currently unused.
        target_variable_name: Column name of the response variable.

    Returns:
        None.
    """
    # Collect the names of the independent variables. The target is already
    # removed, so no further slicing of the column list is needed.
    data = data.dropna()
    names = list(data.drop(columns=[target_variable_name]).columns)

    # Unique classes of the target, looked up by the column name passed in
    # (attribute access would look for a column literally named
    # "target_variable_name").
    classes = list(data[target_variable_name].unique())

    # For each unique class, run through the different independent variables.
    for i in classes:
        for column_name in names:
            print(i)  # to show the class (target variable: 0,1,...,n)
            print(column_name)  # to show the variable name to be analyzed
            # This is the part where I got stuck: the values of this
            # variable for class ``i``.
            x = data.loc[data[target_variable_name] == i, column_name]

Simulated datasets for anyone interested in replicating the problem:

from sklearn.datasets import make_classification
# Note: for a binary class target, pass n_classes=2 instead.

X, y = make_classification(
    n_samples=50000,
    n_features=6,
    n_informative=6,
    n_redundant=0,
    n_repeated=0,
    n_classes=4,
    n_clusters_per_class=3,
    class_sep=0.95,
    flip_y=0.2,
    weights=[0.7, 0.2, 0.1],
    shuffle=True,
    random_state=93,
)

# One column per generated feature, named var1 ... var6.
dataset_x = pd.DataFrame({f"var{k + 1}": X[:, k] for k in range(6)})

# Class labels go into their own single-column frame.
dataset_y = pd.DataFrame({'target': y})

# Features first, target last.
sample_dataset = pd.concat([dataset_x, dataset_y], axis=1)
print(sample_dataset)

Solution

  • Consider building lists of x's and KDEs with list comprehensions, with one entry per class of the target. Also, instead of printing the result in each iteration, collect the results into a data frame:

    def intersection_area_new(data, bandwidth, margin, target_variable_name):
        """Compute the KDE intersection area of every feature across all classes.

        For each independent variable, one Gaussian KDE is fitted per class of
        the target. The area under the pointwise minimum of all class
        densities is then integrated numerically.

        Args:
            data: DataFrame holding the feature columns and the target column.
                Rows containing missing values are dropped first.
            bandwidth: Forwarded to ``gaussian_kde`` as ``bw_method``.
            margin: Fraction of the data range added on each side of the
                evaluation grid, since the KDE is wider than the data.
            target_variable_name: Column name of the response variable.

        Returns:
            DataFrame with one row per feature and the columns ``target``,
            ``column`` and ``intersection``.
        """
        data = data.dropna()

        # Look the target up by the name passed in rather than a fixed
        # 'target' label, and treat every other column as a feature wherever
        # the target sits in the frame.
        target = data[target_variable_name]
        classes = target.unique()
        feature_names = data.columns.drop(target_variable_name)

        # NumPy 2.0 renamed ``trapz`` to ``trapezoid``; use whichever exists.
        integrate = getattr(np, "trapezoid", None) or np.trapz

        kde_dicts = []
        for column_name in feature_names:
            # BUILD LIST OF x's AND kde's, one entry per class
            x_s = [data.loc[target == i, column_name] for i in classes]
            kde_s = [gaussian_kde(x, bw_method=bandwidth) for x in x_s]

            # Span the full range of all classes (lowest minimum to highest
            # maximum) so that no part of the overlap falls outside the grid.
            x_min = min(x.min() for x in x_s)
            x_max = max(x.max() for x in x_s)

            dx = margin * (x_max - x_min)  # the KDE is wider than the data

            x_array = np.linspace(x_min - dx, x_max + dx, 500)
            kde_x_s = [kde(x_array) for kde in kde_s]

            inters_x = np.array(kde_x_s).min(axis=0)
            area_inters_x = integrate(inters_x, x_array)  # intersection of kdes

            kde_dicts.append({'target': target_variable_name,
                              'column': column_name,
                              'intersection': area_inters_x})

        # Explicit columns keep the schema stable when there are no features.
        return pd.DataFrame(kde_dicts, columns=['target', 'column', 'intersection'])
    

    Output

    # Default bandwidth, a 50% margin around the data range, and the
    # response column named "target"; show the first ten result rows.
    output = intersection_area_new(
        data=sample_dataset,
        bandwidth=None,
        margin=0.5,
        target_variable_name="target",
    )
    print(output.head(10))
    
    #    target column  intersection
    # 0  target   var1      0.842256
    # 1  target   var2      0.757190
    # 2  target   var3      0.676021
    # 3  target   var4      0.873074
    # 4  target   var5      0.763626
    # 5  target   var6      0.868560