Search code examples
pythonalgorithmoptimizationgenetic-algorithm

Genetic algorithm for kubernetes allocation


I am trying to allocate Kubernetes pods to nodes using a genetic algorithm, where each pod is assigned to one node. Below is my implementation:

from string import ascii_lowercase
import numpy as np
import random
from itertools import compress
import math
import pandas as pd
import random

def create_pods_and_nodes(n_pods=40, n_nodes=15):
    # Create pod and node names
    pod = ['pod_' + str(i+1) for i in range(n_pods)]
    node = ['node_' + str(i+1) for i in range(n_nodes)]

    # Define CPU and RAM options
    cpu = [2**i for i in range(1, 8)]  # 2, 4, 8, 16, 32, 64, 128
    ram = [2**i for i in range(2, 10)]  # 4, 8, 16, ..., 8192

    # Create the pods DataFrame
    pods = pd.DataFrame({
        'pod': pod,
        'cpu': random.choices(cpu[0:3], k=n_pods),  # Small CPU for pods
        'ram': random.choices(ram[0:4], k=n_pods),  # Small RAM for pods
    })

    # Create the nodes DataFrame
    nodes = pd.DataFrame({
        'node': node,
        'cpu': random.choices(cpu[4:len(cpu)-1], k=n_nodes),  # Larger CPU for nodes
        'ram': random.choices(ram[4:len(ram)-1], k=n_nodes),  # Larger RAM for nodes
    })

    return pods, nodes

# Example usage
pods, nodes = create_pods_and_nodes(n_pods=46, n_nodes=6)

# Display the results
print("Pods DataFrame:\n", pods.head())
print("\nNodes DataFrame:\n", nodes.head())

print(f"total CPU pods: {np.sum(pods['cpu'])}")
print(f"total RAM pods: {np.sum(pods['ram'])}")
print('\n')
print(f"total CPU nodes: {np.sum(nodes['cpu'])}")
print(f"total RAM nodes: {np.sum(nodes['ram'])}")

# Genetic Algorithm Parameters
POPULATION_SIZE = 100
GENERATIONS = 50
MUTATION_RATE = 0.1
TOURNAMENT_SIZE = 5

def create_individual():
    return [random.randint(0, len(nodes) - 1) for _ in range(len(pods))]

def create_population(size):
    return [create_individual() for _ in range(size)]

def fitness(individual):
    total_cpu_used = np.zeros(len(nodes))
    total_ram_used = np.zeros(len(nodes))
    unallocated_pods = 0

    for pod_idx, node_idx in enumerate(individual):
        pod_cpu = pods.iloc[pod_idx]['cpu']
        pod_ram = pods.iloc[pod_idx]['ram']

        if total_cpu_used[node_idx] + pod_cpu <= nodes.iloc[node_idx]['cpu'] and total_ram_used[node_idx] + pod_ram <= nodes.iloc[node_idx]['ram']:
            total_cpu_used[node_idx] += pod_cpu
            total_ram_used[node_idx] += pod_ram
        else:
            unallocated_pods += 1  # Count unallocated pods

    # Reward for utilizing resources and penalize for unallocated pods
    return (total_cpu_used.sum() + total_ram_used.sum()) - (unallocated_pods * 10)

def select(population):
    tournament = random.sample(population, TOURNAMENT_SIZE)
    return max(tournament, key=fitness)

def crossover(parent1, parent2):
    crossover_point = random.randint(1, len(pods) - 1)
    child1 = parent1[:crossover_point] + parent2[crossover_point:]
    child2 = parent2[:crossover_point] + parent1[crossover_point:]
    return child1, child2

def mutate(individual):
    for idx in range(len(individual)):
        if random.random() < MUTATION_RATE:
            individual[idx] = random.randint(0, len(nodes) - 1)

def genetic_algorithm():
    population = create_population(POPULATION_SIZE)
    
    for generation in range(GENERATIONS):
        new_population = []
        for _ in range(POPULATION_SIZE // 2):
            parent1 = select(population)
            parent2 = select(population)
            child1, child2 = crossover(parent1, parent2)
            mutate(child1)
            mutate(child2)
            new_population.extend([child1, child2])
        population = new_population

        # Print the best fitness of this generation
        best_fitness = max(fitness(individual) for individual in population)
        print(f"Generation {generation + 1}: Best Fitness = {best_fitness}")

    # Return the best individual found
    best_individual = max(population, key=fitness)
    return best_individual

# Run the genetic algorithm
print("Starting Genetic Algorithm...")
best_allocation = genetic_algorithm()
print("Genetic Algorithm completed.\n")

# Create the allocation DataFrame
allocation_df = pd.DataFrame({
    'Pod': pods['pod'],
    'Node': [nodes.iloc[best_allocation[i]]['node'] for i in range(len(best_allocation))],
    'Pod_Resources': [list(pods.iloc[i][['cpu', 'ram']]) for i in range(len(best_allocation))],
    'Node_Resources': [list(nodes.iloc[best_allocation[i]][['cpu', 'ram']]) for i in range(len(best_allocation))]
})

# Print the allocation DataFrame
print("\nAllocation DataFrame:")
print(allocation_df)

# Summarize total CPU and RAM utilization for each node
node_utilization_df = allocation_df.groupby('Node').agg(
    Total_CPU_Used=pd.NamedAgg(column='Pod_Resources', aggfunc=lambda x: sum([res[0] for res in x if res])),
    Total_RAM_Used=pd.NamedAgg(column='Pod_Resources', aggfunc=lambda x: sum([res[1] for res in x if res])),
    Node_CPU=pd.NamedAgg(column='Node_Resources', aggfunc=lambda x: x.iloc[0][0] if x.iloc[0] is not None else 0),
    Node_RAM=pd.NamedAgg(column='Node_Resources', aggfunc=lambda x: x.iloc[0][1] if x.iloc[0] is not None else 0)
)

# Calculate CPU and RAM utilization percentages for each node
node_utilization_df['CPU_Utilization'] = (node_utilization_df['Total_CPU_Used'] / node_utilization_df['Node_CPU']) * 100
node_utilization_df['RAM_Utilization'] = (node_utilization_df['Total_RAM_Used'] / node_utilization_df['Node_RAM']) * 100

# Print the total CPU and RAM utilization for each node
print("\nTotal CPU and RAM utilization for each node:")
print(node_utilization_df)

My implementation works if the total number of CPU and/or RAM of the pods is smaller than the total CPU and/or RAM of the nodes. However, I want to make it work even if the total CPU and/or RAM of the pods exceeds the total CPU and/or RAM of the nodes, allowing for unallocated pods if they cannot be assigned. How can I achieve this?

Any suggestions or improvements would be greatly appreciated!


Solution

  • This is a straightforward bin packing problem. https://en.wikipedia.org/wiki/Bin_packing_problem Why tackle it with a genetic algorithm!?! That is going to be horribly slow, especially if you use python.

    Implementing a standard bin packing algorithm in a native language with a decent optimizing compiler will give performance many orders of magnitude faster - a millisecond or two for your sample problem

    Here is the C++ code for the implementation of the first-fit decreasing bin packing algorithm with dual resources

    void pack()
    {
        // sort items in order of decreasing largest resource requirement sum
    
        std::sort(
            theItems.begin(), theItems.end(),
            [](const cThing &a, const cThing &b) -> bool
            {
                int sa = a.myRes1 + a.myRes2;
                int sb = b.myRes1 + b.myRes2;
                return sa > sb;
            });
    
        // sort bins in order of increasing capacity sum
    
        std::sort(
            theBins.begin(), theBins.end(),
            [](const cThing &a, const cThing &b) -> bool
            {
                int sa = a.myRes1 + a.myRes2;
                int sb = b.myRes1 + b.myRes2;
                return sa < sb;
            });
    
        // fit each item into the smallest bin that fits
        
        for (cThing &item : theItems)
        {
            for (cThing &bin : theBins)
            {
                if (item.myRes1 > bin.myRes1 ||
                    item.myRes2 > bin.myRes2)
                continue;
    
                bin.pack( item );
    
                break;
            }
        }
    }
    

    The output for a run:

    All iems packed
    node_3 contains: pod_1 pod_21
    node_13 contains: pod_19 pod_15
    node_11 contains: pod_31 pod_7
    node_10 contains: pod_8 pod_28 pod_24 pod_17 pod_32
    node_14 contains: pod_16 pod_36 pod_30 pod_29 pod_6 pod_34 pod_22 pod_9 pod_20 pod_38
    node_15 contains: pod_37 pod_12 pod_13 pod_25 pod_26 pod_4 pod_3 pod_33 pod_27 pod_35 pod_2 pod_39 pod_23 
    node_4 contains: pod_5 pod_18 pod_14 pod_11 pod_10 pod_40
    node_9 is empty
    node_1 is empty
    node_8 is empty
    node_7 is empty
    node_5 is empty
    node_12 is empty
    node_2 is empty
    node_6 is empty
    

    The above run uses the setup posted in the question.

    You added:

    I want to make it work even if the total CPU and/or RAM of the pods exceeds the total CPU and/or RAM of the nodes, allowing for unallocated pods if they cannot be assigned. How can I achieve this?

    So I reduce the number of nodes created ( from 15 to 4 ) so that not all pods can be fitted. Below is the result of this run showing that the code handles this naturally

    pod_12 pod_25 pod_26 pod_4 pod_3 pod_33 pod_27 pod_35 pod_2 pod_39 pod_5 pod_23 pod_18 pod_14 pod_11 pod_10 pod_40
    17 items did not fit
    node_3 contains: pod_1 pod_21
    node_4 contains: pod_19 pod_15 pod_31 pod_7
    node_1 contains: pod_8 pod_28 pod_24 pod_17 pod_32 pod_16 pod_36 pod_30 pod_29 pod_6 pod_34
    node_2 contains: pod_22 pod_9 pod_20 pod_38 pod_37 pod_13
    

    Complete application code at https://github.com/JamesBremner/so79049807