Search code examples
python pandas sampling imbalanced-data

How can I write code from scratch to do stratified sampling by target variable?


All, I am trying to write code from scratch (without using sklearn libraries) that creates 5 samples (each of size len(df) / 5) such that each one has the same proportion of the target variable (1's) as the original dataset. E.g. if the original has 5% cancer patients, I would like each of my 5 samples to also have 5% of the target variable. I am unsure how to do this.

# Collect five slices of df (assumed to be a pandas DataFrame; its definition
# is not shown in this snippet).
df_list=[]
# Rows per step: one fifth of the data, rounded to the nearest integer.
n= round(len(df)/5)
for m in range(1,6):
    # m is rebound from the step number to the slice end: n, 2n, ..., 5n.
    m = m*n
    # NOTE: df[:m] always starts at row 0, so each slice is a growing prefix
    # (the first m rows), not a separate chunk of n rows.
    print(df[:m])
    df_list.append(df[:m])

This creates each chunk I would like, but how can I now do it such that the target variable has the same percentage as in the original?


Solution

  • Solution:

    import math

    import numpy as np
    import pandas as pd
    
    def stratify(data, target='y', n=10):
        """Select ``n`` rows of ``data`` that keep the class mix of ``target``.

        Every distinct value in the ``target`` column gets a quota proportional
        to its share of ``data``; ``fit_new_counts_to_n`` rounds those quotas to
        integers that add up to ``n``.  Rows are then scanned in their original
        order and a row is kept while its class quota is not yet full, so the
        result is deterministic (the earliest rows of each class), not random.

        Args:
            data: pandas DataFrame to sample from.
            target: name of the column holding the class labels.
            n: total number of rows to select.

        Returns:
            A new DataFrame holding the selected rows in their original
            relative order, with the columns and per-column dtypes of ``data``
            and a fresh 0..k-1 index.  The frame is empty when ``data`` has no
            rows or ``n`` is not positive.
        """
        y = data[target].values
        # Guard: with no rows the proportional scaling below would divide by
        # zero, and a non-positive n can never select anything.
        if len(y) == 0 or n <= 0:
            return data.iloc[0:0].reset_index(drop=True)

        # `inverse` maps each row to the position of its class in the sorted
        # unique values, which avoids a per-row search for the class index.
        _, inverse, counts = np.unique(
            y, return_inverse=True, return_counts=True)
        inverse = inverse.ravel()

        quotas = np.asarray(
            fit_new_counts_to_n(counts * (n / counts.sum()), n))
        total = quotas.sum()

        taken = np.zeros(len(counts), dtype=int)
        selected = []
        for row, cls in enumerate(inverse):
            # Stop scanning as soon as every quota has been filled.
            if len(selected) == total:
                break
            if taken[cls] < quotas[cls]:
                selected.append(row)
                taken[cls] += 1

        # Positional selection keeps each column's dtype; going through
        # data.values would coerce mixed-type frames to one common dtype.
        return data.iloc[selected].reset_index(drop=True)
    

    Utility function:

    def fit_new_counts_to_n(new_counts, n):
        """Round fractional counts to integers that add up to ``n``.

        Each count is truncated to its integer part; the units still missing
        to reach ``n`` are then handed out, one per entry, to the entries with
        the largest fractional parts.

        Args:
            new_counts: iterable of real-valued counts, expected to sum to
                about ``n``.
            n: integer total the rounded counts should reach.

        Returns:
            A list of ints in the same order as ``new_counts``.  Each entry is
            raised by at most one, so if the inputs sum to far less than ``n``
            the result can still fall short of ``n``.
        """
        # Split every value once into (fractional part, integer part).
        parts = [math.modf(x) for x in new_counts]
        decimals = [frac for frac, _ in parts]
        integers = [int(whole) for _, whole in parts]

        # Number of units lost by truncation; computed once instead of
        # re-summing the list on every loop iteration.
        shortfall = n - sum(integers)
        if shortfall <= 0:
            return integers

        # Indices ordered by descending fractional part (at most n candidates).
        by_remainder = np.argsort(decimals)[::-1][:n]
        for i in by_remainder[:shortfall]:
            integers[i] += 1
        return integers
    

    Example Usage:

    # Toy dataset: each row is [X1, y], where y is the class label to stratify
    # on.  Class counts are 0 -> 2, 1 -> 8, 2 -> 5, 3 -> 5 (20 rows in total).
    data = [[  3,   0],
            [ 54,   3],
            [  3,   1],
            [ 64,   1],
            [ 65,   0],
            [ 34,   1],
            [ 45,   2],
            [534,   2],
            [ 57,   1],
            [ 64,   3],
            [  5,   1],
            [ 45,   1],
            [546,   1],
            [  4,   2],
            [ 53,   3],
            [345,   2],
            [456,   2],
            [435,   3],
            [545,   1],
            [ 45,   3]]
    
    # Wrap the rows in a DataFrame so the label column can be addressed as 'y'.
    data = pd.DataFrame(data, columns=['X1', 'y'])
    
    # Ask for 10 of the 20 rows, keeping the class proportions of 'y'.
    stratified_data = stratify(data, target='y', n=10)
    

    Result:

          [[  3,   0],
           [ 54,   3],
           [  3,   1],
           [ 64,   1],
           [ 34,   1],
           [ 45,   2],
           [534,   2],
           [ 57,   1],
           [ 64,   3],
           [ 53,   3]]