I got a big dataset, something like 100 K or 1 mil rows, and I got a function that makes vector calculations that take 0.03 sec. Now all my columns before the process can be the same for every iteration. I want to calculate the 2^n combinations of conditions I make. So currently it will take me 2^n * 0.03 s to run it all by looping length and run the function. Is there a better way to improve performance and run all these possibilities vectorized or parallel(not Python CPU parallel. It help by little). The only thing I think of is to create unique per iteration column and make regex calculations, but then the df
will be too big.
In this example process where each processing takes 0.01ms, the output is: Total number of combinations: 1023.
Total time to evaluate all combinations: 20.73 seconds
import pandas as pd
import numpy as np
from itertools import combinations
import time
# Generate a larger DataFrame with 100,000 rows
data = {
'Height': np.random.uniform(150, 200, size=100000),
'Weight': np.random.uniform(50, 100, size=100000),
'Gender': np.random.choice(['Male', 'Female'], size=100000),
'Age': np.random.randint(18, 70, size=100000)
}
df = pd.DataFrame(data)
# Define vectorized functions for each condition with dynamic values
def calculate_bmi(height, weight):
height_m = height / 100
return weight / (height_m ** 2)
def condition_bmi(df, min_bmi, max_bmi):
bmi = calculate_bmi(df['Height'], df['Weight'])
return (min_bmi <= bmi) & (bmi <= max_bmi)
def condition_age(df, min_age, max_age):
return (min_age <= df['Age']) & (df['Age'] <= max_age)
def condition_height(df, min_height, max_height):
return (min_height <= df['Height']) & (df['Height'] <= max_height)
def condition_weight(df, min_weight, max_weight):
return (min_weight <= df['Weight']) & (df['Weight'] <= max_weight)
def condition_gender(df, gender):
return df['Gender'] == gender
# List of possible dynamic values for each condition (with only 2 values each)
dynamic_values = {
'BMI is within the normal range': [(18.5, 24.9), (25.0, 29.9)],
'Age is within the healthy range': [(18, 30), (31, 45)],
'Height is within the normal range': [(150, 160), (161, 170)],
'Weight is within the normal range': [(50, 60), (61, 70)],
'Gender is specified': ['Male', 'Female']
}
# Function to create combinations of conditions with dynamic values
def create_condition_combinations(dynamic_values):
condition_combinations = []
for condition_name, values in dynamic_values.items():
if isinstance(values[0], tuple): # For range conditions
for value in values:
condition_combinations.append((condition_name, value))
else: # For categorical conditions
for value in values:
condition_combinations.append((condition_name, value))
return condition_combinations
# Generate all possible combinations of conditions and dynamic values
def generate_all_combinations(condition_combinations):
all_combinations = []
for r in range(1, len(condition_combinations) + 1):
for combo in combinations(condition_combinations, r):
all_combinations.append(combo)
return all_combinations
condition_combinations = create_condition_combinations(dynamic_values)
all_combinations = generate_all_combinations(condition_combinations)
# Calculate the total number of combinations
total_combinations = len(all_combinations)
print(f"Total number of combinations: {total_combinations}")
# Apply a combination of conditions
def evaluate_combination(df, combo):
combined_condition = pd.Series([True] * len(df))
for condition_name, value in combo:
if condition_name == 'BMI is within the normal range':
min_bmi, max_bmi = value
combined_condition &= condition_bmi(df, min_bmi, max_bmi)
elif condition_name == 'Age is within the healthy range':
min_age, max_age = value
combined_condition &= condition_age(df, min_age, max_age)
elif condition_name == 'Height is within the normal range':
min_height, max_height = value
combined_condition &= condition_height(df, min_height, max_height)
elif condition_name == 'Weight is within the normal range':
min_weight, max_weight = value
combined_condition &= condition_weight(df, min_weight, max_weight)
elif condition_name == 'Gender is specified':
gender = value
combined_condition &= condition_gender(df, gender)
return combined_condition
# Measure time to run all combinations
start_time = time.time()
for combo in all_combinations:
combo_start_time = time.time()
evaluate_combination(df, combo)
combo_end_time = time.time()
combo_elapsed_time = combo_end_time - combo_start_time
end_time = time.time()
total_elapsed_time = end_time - start_time
print(f"Total time to evaluate all combinations: {total_elapsed_time:.2f} seconds")
I think the best method without touching too much your code is by using polars. When I tested your code I was at : 7.52 seconds and now I am at : 0.45 seconds
import polars as pl
import numpy as np
from itertools import combinations
import time
# Generate similar data with Polars
data = {
'Height': np.random.uniform(150, 200, size=100000),
'Weight': np.random.uniform(50, 100, size=100000),
'Gender': np.random.choice(['Male', 'Female'], size=100000),
'Age': np.random.randint(18, 70, size=100000)
}
df = pl.DataFrame(data)
# Define vectorized functions for each condition
def calculate_bmi(df):
height_m = df['Height'] / 100
return df['Weight'] / (height_m ** 2)
def condition_bmi(df, min_bmi, max_bmi):
bmi = calculate_bmi(df)
return df.with_columns(((bmi >= min_bmi) & (bmi <= max_bmi)).alias('bmi_condition'))
def condition_age(df, min_age, max_age):
return df.with_columns(((df['Age'] >= min_age) & (df['Age'] <= max_age)).alias('age_condition'))
def condition_height(df, min_height, max_height):
return df.with_columns(((df['Height'] >= min_height) & (df['Height'] <= max_height)).alias('height_condition'))
def condition_weight(df, min_weight, max_weight):
return df.with_columns(((df['Weight'] >= min_weight) & (df['Weight'] <= max_weight)).alias('weight_condition'))
def condition_gender(df, gender):
return df.with_columns((df['Gender'] == gender).alias('gender_condition'))
# List of possible dynamic values for each condition
dynamic_values = {
'BMI is within the normal range': [(18.5, 24.9), (25.0, 29.9)],
'Age is within the healthy range': [(18, 30), (31, 45)],
'Height is within the normal range': [(150, 160), (161, 170)],
'Weight is within the normal range': [(50, 60), (61, 70)],
'Gender is specified': ['Male', 'Female']
}
# Generate all possible combinations of conditions with dynamic values
def create_condition_combinations(dynamic_values):
condition_combinations = []
for condition_name, values in dynamic_values.items():
for value in values:
condition_combinations.append((condition_name, value))
return condition_combinations
condition_combinations = create_condition_combinations(dynamic_values)
# Generate all possible combinations of conditions
def generate_all_combinations(condition_combinations):
all_combinations = []
for r in range(1, len(condition_combinations) + 1):
for combo in combinations(condition_combinations, r):
all_combinations.append(combo)
return all_combinations
all_combinations = generate_all_combinations(condition_combinations)
total_combinations = len(all_combinations)
print(f"Total number of combinations: {total_combinations}")
# Evaluate a combination of conditions
def evaluate_combination(df, combo):
# Start with True for all rows
df = df.with_columns(pl.lit(True).alias('combined_condition'))
for condition_name, value in combo:
if condition_name == 'BMI is within the normal range':
min_bmi, max_bmi = value
df = condition_bmi(df, min_bmi, max_bmi)
df = df.with_columns((df['combined_condition'] & df['bmi_condition']).alias('combined_condition'))
elif condition_name == 'Age is within the healthy range':
min_age, max_age = value
df = condition_age(df, min_age, max_age)
df = df.with_columns((df['combined_condition'] & df['age_condition']).alias('combined_condition'))
elif condition_name == 'Height is within the normal range':
min_height, max_height = value
df = condition_height(df, min_height, max_height)
df = df.with_columns((df['combined_condition'] & df['height_condition']).alias('combined_condition'))
elif condition_name == 'Weight is within the normal range':
min_weight, max_weight = value
df = condition_weight(df, min_weight, max_weight)
df = df.with_columns((df['combined_condition'] & df['weight_condition']).alias('combined_condition'))
elif condition_name == 'Gender is specified':
gender = value
df = condition_gender(df, gender)
df = df.with_columns((df['combined_condition'] & df['gender_condition']).alias('combined_condition'))
return df['combined_condition']
# Measure the time to run all combinations
start_time = time.time()
for combo in all_combinations:
evaluate_combination(df, combo)
end_time = time.time()
total_elapsed_time = end_time - start_time
print(f"Total time to evaluate all combinations: {total_elapsed_time:.2f} seconds")