Search code examples
pythonpandasalgorithmnumpy

How to improve pandas DF processing time on different combinations of calculated data


I got a big dataset, something like 100 K or 1 mil rows, and I got a function that makes vector calculations that take 0.03 sec. Now all my columns before the process can be the same for every iteration. I want to calculate the 2^n combinations of conditions I make. So currently it will take me 2^n * 0.03 s to run it all by looping length and run the function. Is there a better way to improve performance and run all these possibilities vectorized or parallel(not Python CPU parallel. It help by little). The only thing I think of is to create unique per iteration column and make regex calculations, but then the df will be too big.

In this example process where each processing takes 0.01ms, the output is: Total number of combinations: 1023.

Total time to evaluate all combinations: 20.73 seconds

import pandas as pd
import numpy as np
from itertools import combinations
import time

# Generate a larger DataFrame with 100,000 rows
data = {
    'Height': np.random.uniform(150, 200, size=100000),
    'Weight': np.random.uniform(50, 100, size=100000),
    'Gender': np.random.choice(['Male', 'Female'], size=100000),
    'Age': np.random.randint(18, 70, size=100000)
}

df = pd.DataFrame(data)

# Define vectorized functions for each condition with dynamic values
def calculate_bmi(height, weight):
    height_m = height / 100
    return weight / (height_m ** 2)

def condition_bmi(df, min_bmi, max_bmi):
    bmi = calculate_bmi(df['Height'], df['Weight'])
    return (min_bmi <= bmi) & (bmi <= max_bmi)

def condition_age(df, min_age, max_age):
    return (min_age <= df['Age']) & (df['Age'] <= max_age)

def condition_height(df, min_height, max_height):
    return (min_height <= df['Height']) & (df['Height'] <= max_height)

def condition_weight(df, min_weight, max_weight):
    return (min_weight <= df['Weight']) & (df['Weight'] <= max_weight)

def condition_gender(df, gender):
    return df['Gender'] == gender

# List of possible dynamic values for each condition (with only 2 values each)
dynamic_values = {
    'BMI is within the normal range': [(18.5, 24.9), (25.0, 29.9)],
    'Age is within the healthy range': [(18, 30), (31, 45)],
    'Height is within the normal range': [(150, 160), (161, 170)],
    'Weight is within the normal range': [(50, 60), (61, 70)],
    'Gender is specified': ['Male', 'Female']
}

# Function to create combinations of conditions with dynamic values
def create_condition_combinations(dynamic_values):
    condition_combinations = []
    for condition_name, values in dynamic_values.items():
        if isinstance(values[0], tuple):  # For range conditions
            for value in values:
                condition_combinations.append((condition_name, value))
        else:  # For categorical conditions
            for value in values:
                condition_combinations.append((condition_name, value))
    return condition_combinations

# Generate all possible combinations of conditions and dynamic values
def generate_all_combinations(condition_combinations):
    all_combinations = []
    for r in range(1, len(condition_combinations) + 1):
        for combo in combinations(condition_combinations, r):
            all_combinations.append(combo)
    return all_combinations

condition_combinations = create_condition_combinations(dynamic_values)
all_combinations = generate_all_combinations(condition_combinations)

# Calculate the total number of combinations
total_combinations = len(all_combinations)
print(f"Total number of combinations: {total_combinations}")

# Apply a combination of conditions
def evaluate_combination(df, combo):
    combined_condition = pd.Series([True] * len(df))
    for condition_name, value in combo:
        if condition_name == 'BMI is within the normal range':
            min_bmi, max_bmi = value
            combined_condition &= condition_bmi(df, min_bmi, max_bmi)
        elif condition_name == 'Age is within the healthy range':
            min_age, max_age = value
            combined_condition &= condition_age(df, min_age, max_age)
        elif condition_name == 'Height is within the normal range':
            min_height, max_height = value
            combined_condition &= condition_height(df, min_height, max_height)
        elif condition_name == 'Weight is within the normal range':
            min_weight, max_weight = value
            combined_condition &= condition_weight(df, min_weight, max_weight)
        elif condition_name == 'Gender is specified':
            gender = value
            combined_condition &= condition_gender(df, gender)
    return combined_condition

# Measure time to run all combinations
start_time = time.time()

for combo in all_combinations:
    combo_start_time = time.time()
    
    evaluate_combination(df, combo)
    
    combo_end_time = time.time()
    combo_elapsed_time = combo_end_time - combo_start_time

end_time = time.time()
total_elapsed_time = end_time - start_time

print(f"Total time to evaluate all combinations: {total_elapsed_time:.2f} seconds")

Solution

  • I think the best method without touching too much your code is by using polars. When I tested your code I was at : 7.52 seconds and now I am at : 0.45 seconds

    import polars as pl
    import numpy as np
    from itertools import combinations
    import time
    
    # Generate similar data with Polars
    data = {
        'Height': np.random.uniform(150, 200, size=100000),
        'Weight': np.random.uniform(50, 100, size=100000),
        'Gender': np.random.choice(['Male', 'Female'], size=100000),
        'Age': np.random.randint(18, 70, size=100000)
    }
    
    df = pl.DataFrame(data)
    
    # Define vectorized functions for each condition
    def calculate_bmi(df):
        height_m = df['Height'] / 100
        return df['Weight'] / (height_m ** 2)
    
    def condition_bmi(df, min_bmi, max_bmi):
        bmi = calculate_bmi(df)
        return df.with_columns(((bmi >= min_bmi) & (bmi <= max_bmi)).alias('bmi_condition'))
    
    def condition_age(df, min_age, max_age):
        return df.with_columns(((df['Age'] >= min_age) & (df['Age'] <= max_age)).alias('age_condition'))
    
    def condition_height(df, min_height, max_height):
        return df.with_columns(((df['Height'] >= min_height) & (df['Height'] <= max_height)).alias('height_condition'))
    
    def condition_weight(df, min_weight, max_weight):
        return df.with_columns(((df['Weight'] >= min_weight) & (df['Weight'] <= max_weight)).alias('weight_condition'))
    
    def condition_gender(df, gender):
        return df.with_columns((df['Gender'] == gender).alias('gender_condition'))
    
    # List of possible dynamic values for each condition
    dynamic_values = {
        'BMI is within the normal range': [(18.5, 24.9), (25.0, 29.9)],
        'Age is within the healthy range': [(18, 30), (31, 45)],
        'Height is within the normal range': [(150, 160), (161, 170)],
        'Weight is within the normal range': [(50, 60), (61, 70)],
        'Gender is specified': ['Male', 'Female']
    }
    
    # Generate all possible combinations of conditions with dynamic values
    def create_condition_combinations(dynamic_values):
        condition_combinations = []
        for condition_name, values in dynamic_values.items():
            for value in values:
                condition_combinations.append((condition_name, value))
        return condition_combinations
    
    condition_combinations = create_condition_combinations(dynamic_values)
    
    # Generate all possible combinations of conditions
    def generate_all_combinations(condition_combinations):
        all_combinations = []
        for r in range(1, len(condition_combinations) + 1):
            for combo in combinations(condition_combinations, r):
                all_combinations.append(combo)
        return all_combinations
    
    all_combinations = generate_all_combinations(condition_combinations)
    total_combinations = len(all_combinations)
    print(f"Total number of combinations: {total_combinations}")
    
    # Evaluate a combination of conditions
    def evaluate_combination(df, combo):
        # Start with True for all rows
        df = df.with_columns(pl.lit(True).alias('combined_condition'))
        
        for condition_name, value in combo:
            if condition_name == 'BMI is within the normal range':
                min_bmi, max_bmi = value
                df = condition_bmi(df, min_bmi, max_bmi)
                df = df.with_columns((df['combined_condition'] & df['bmi_condition']).alias('combined_condition'))
            elif condition_name == 'Age is within the healthy range':
                min_age, max_age = value
                df = condition_age(df, min_age, max_age)
                df = df.with_columns((df['combined_condition'] & df['age_condition']).alias('combined_condition'))
            elif condition_name == 'Height is within the normal range':
                min_height, max_height = value
                df = condition_height(df, min_height, max_height)
                df = df.with_columns((df['combined_condition'] & df['height_condition']).alias('combined_condition'))
            elif condition_name == 'Weight is within the normal range':
                min_weight, max_weight = value
                df = condition_weight(df, min_weight, max_weight)
                df = df.with_columns((df['combined_condition'] & df['weight_condition']).alias('combined_condition'))
            elif condition_name == 'Gender is specified':
                gender = value
                df = condition_gender(df, gender)
                df = df.with_columns((df['combined_condition'] & df['gender_condition']).alias('combined_condition'))
        
        return df['combined_condition']
    
    # Measure the time to run all combinations
    start_time = time.time()
    
    for combo in all_combinations:
        evaluate_combination(df, combo)
    
    end_time = time.time()
    total_elapsed_time = end_time - start_time
    
    print(f"Total time to evaluate all combinations: {total_elapsed_time:.2f} seconds")