python machine-learning genetic-algorithm feature-selection pygad

Using PyGAD for feature selection

I'm trying to create a Python script for feature selection using PyGAD

My code is shown below. However, it always reports that the full set of features is the best subset. How can I check whether this result is correct?

import numpy
import numpy as np
import pandas as pd
import pygad
from lightgbm import LGBMClassifier as lgbm
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score

from src.learner_params import target_column, model_features


# Toy setup: scikit-learn's breast-cancer data and a deliberately tiny
# LightGBM classifier (one estimator, depth limited to 2).
bc = load_breast_cancer()
bst = lgbm(n_estimators=1, max_depth=2, random_state=42)

function_inputs = bc.feature_names

# Wrap the feature matrix in a DataFrame so columns can be picked by name,
# then hold out a test split (default split size, fixed seed).
y = bc.target
X = pd.DataFrame(bc.data, columns=bc.feature_names)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)



def fitness_func(ga_instance, solution, solution_idx):
    """Score a candidate feature subset by cross-validated ROC AUC.

    Args:
        ga_instance: The ``pygad.GA`` instance handed to the callback
            (unused here).
        solution: Chromosome with one gene per feature; genes equal to 1
            mark the features to keep.
        solution_idx: Index of the chromosome in the population (unused
            here).

    Returns:
        Mean 2-fold ROC AUC of ``bst`` on the selected columns of
        ``X_train``, or 0.0 when no feature is selected.
    """
    mask = np.asarray(solution) == 1
    if not mask.any():
        # A model cannot be fitted on zero columns; rank the empty subset
        # below every real one instead of letting cross-validation fail.
        return 0.0
    selected_features = np.array(bc.feature_names)[mask]
    X_tmp = X_train.loc[:, selected_features]
    return cross_val_score(bst, X_tmp, y_train, scoring="roc_auc", cv=2).mean()



# Genetic-algorithm configuration: one gene per feature column.
m = len(bc.feature_names)
fitness_function = fitness_func
# An array of m ones.
# NOTE(review): per the PyGAD docs a flat gene_space lists the values every
# gene may take; with only ones in it, 1 would be the only allowed value, so
# every chromosome selects every feature -- confirm against the PyGAD docs.
gene_space = np.full(m,1)

num_generations = 100
num_parents_mating = 4

sol_per_pop = 8
num_genes = m

# NOTE(review): init_range_low / init_range_high are assigned here but are
# not passed to pygad.GA below.
init_range_low = -2
init_range_high = 5

# NOTE(review): parent_selection_type is not passed to pygad.GA below either.
parent_selection_type = "sss"
keep_parents = 2

crossover_type = "single_point"

mutation_type = "random"
mutation_percent_genes = 100

ga_instance = pygad.GA(gene_space=gene_space,
                       num_generations=num_generations,
                       num_parents_mating=num_parents_mating,
                       fitness_func=fitness_function,
                       sol_per_pop=sol_per_pop,
                       num_genes=num_genes,
                       keep_parents=keep_parents,
                       crossover_type=crossover_type,
                       mutation_type=mutation_type,
                       mutation_percent_genes=mutation_percent_genes)

ga_instance.run()


# Report the best chromosome found and its fitness value.
solution, solution_fitness, solution_idx = ga_instance.best_solution()
print("Parameters of the best solution : {solution}".format(solution=solution))
print("Fitness value of the best solution = {solution_fitness}".format(solution_fitness=solution_fitness))

Solution

I have found the solution.

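The thing I changed was the gene space: instead of containing only ones, it now contains both 0 and 1, so the search starts from random subsets of the features rather than from the full set every time. After evaluating the model on the selected subset and comparing it with the model trained on the complete set of features, I even observed an increase in performance.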

# Performance with subset of features:
# 0.9440559440559441

# Performance with all the features:
# 0.9370629370629371

import pygad
import numpy

from sklearn.model_selection import train_test_split, cross_val_score
from src.learner_params import target_column, model_features

from sklearn.datasets import load_breast_cancer
from lightgbm import LGBMClassifier as lgbm
from sklearn.metrics import roc_auc_score

# `np` and `pd` are used throughout this script, so import them here.
import numpy as np
import pandas as pd
from numpy.random import RandomState

# One seed shared by the model, the train/test split and the GA so that a
# run can be repeated.
seed = 1234
state = RandomState(seed)


bc = load_breast_cancer()
bst = lgbm(random_state=seed)

function_inputs = bc.feature_names


# Wrap the feature matrix in a DataFrame so columns can be selected by name.
X, y = bc.data, bc.target
X = pd.DataFrame(X, columns=bc.feature_names)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=seed)



def fitness_func(ga_instance, solution, solution_idx):
    """Score a candidate feature subset by cross-validated ROC AUC.

    Args:
        ga_instance: The ``pygad.GA`` instance handed to the callback
            (unused here).
        solution: Chromosome with one gene per feature; non-zero genes mark
            the features to keep.
        solution_idx: Index of the chromosome in the population (unused
            here).

    Returns:
        Mean 5-fold ROC AUC of ``bst`` on the selected columns of
        ``X_train``, or 0.0 when no feature is selected.
    """
    mask = np.asarray(solution, dtype=bool)
    if not mask.any():
        # A model cannot be fitted on zero columns; rank the empty subset
        # below every real one instead of letting cross-validation fail.
        return 0.0
    selected_features = np.array(bc.feature_names)[mask]
    X_tmp = X_train.loc[:, selected_features]
    return cross_val_score(bst, X_tmp, y_train, scoring="roc_auc", cv=5).mean()



# One gene per feature column: 1 keeps the column, 0 drops it.
m = len(bc.feature_names)
fitness_function = fitness_func
# Every gene may take the value 0 or 1. PyGAD samples both the initial
# population and mutated genes from this space, so the search starts from
# random feature subsets.
gene_space = [0, 1]

num_generations = 30
num_parents_mating = 2

sol_per_pop = 2
num_genes = m


parent_selection_type = "sss"
keep_parents = 2
crossover_type = "single_point"
mutation_type = "random"
mutation_percent_genes = 15

ga_instance = pygad.GA(gene_space=gene_space,
                       gene_type=int,  # keep chromosomes as integer 0/1 masks
                       num_generations=num_generations,
                       num_parents_mating=num_parents_mating,
                       fitness_func=fitness_function,
                       sol_per_pop=sol_per_pop,
                       num_genes=num_genes,
                       parent_selection_type=parent_selection_type,
                       keep_parents=keep_parents,
                       crossover_type=crossover_type,
                       mutation_type=mutation_type,
                       mutation_percent_genes=mutation_percent_genes,
                       random_seed=seed,
                       )

ga_instance.run()


# Report the best chromosome, its fitness and how many features it keeps.
solution, solution_fitness, solution_idx = ga_instance.best_solution()
print(f"Parameters of the best solution : {solution}")
print(f"Fitness value of the best solution = {solution_fitness}")
print(f"Number of features selected = {int(np.sum(solution))}")




# Baseline: fit on every feature and print the held-out score
# (`score` is mean accuracy for scikit-learn style classifiers).
model = bst.fit(X_train, y_train)
print("Performance with all the features:")
print(model.score(X_test, y_test))


# Column names kept by the best chromosome (non-zero gene = feature kept).
selected_ = np.array(bc.feature_names)[np.array(solution, dtype=bool)]

# Refit on the selected columns only and score on the same held-out rows.
model = bst.fit(X_train.loc[:, selected_], y_train)
print("Performance with subset of features:")
print(model.score(X_test.loc[:, selected_], y_test))