I'm trying to create a Python script for feature selection using PyGAD.
My code is shown below; however, it reports that the full set of features is the best subset. How can I verify that this result is correct?
import numpy
import numpy as np
import pandas as pd
import pygad
from lightgbm import LGBMClassifier as lgbm
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score

from src.learner_params import target_column, model_features
# Load scikit-learn's breast-cancer dataset and wrap the feature matrix in a
# DataFrame so that columns can later be selected by feature name.
bc = load_breast_cancer()
function_inputs = bc.feature_names
X = pd.DataFrame(bc.data, columns=bc.feature_names)
y = bc.target

# LightGBM classifier restricted to a single tree of depth 2, with a fixed seed.
bst = lgbm(n_estimators=1, max_depth=2, random_state=42)

# Hold out a test split; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
def fitness_func(ga_instance, solution, solution_idx):
    """Score one chromosome by cross-validated ROC AUC on its selected features.

    Args:
        ga_instance: The ``pygad.GA`` instance invoking the function (unused).
        solution: 1-D sequence of genes; a gene equal to 1 selects the feature
            at the same position in ``bc.feature_names``.
        solution_idx: Index of the solution within the population (unused).

    Returns:
        Mean ROC AUC over 2 cross-validation folds on the training split,
        or 0.0 when the chromosome selects no feature at all.
    """
    mask = np.asarray(solution) == 1

    # A model cannot be fitted on zero columns; give the empty subset the
    # lowest possible AUC instead of letting cross_val_score fail.
    if not mask.any():
        return 0.0

    selected_features = np.asarray(bc.feature_names)[mask]
    X_tmp = X_train.loc[:, selected_features]
    return cross_val_score(bst, X_tmp, y_train, scoring="roc_auc", cv=2).mean()
m = len(bc.feature_names)
fitness_function = fitness_func

# gene_space lists the values a gene is allowed to take.  Each gene is a
# 0/1 switch for one feature, so both values must be offered.  The previous
# np.full(m, 1) offered only the value 1, which forced every chromosome to
# select all features regardless of fitness.
gene_space = [0, 1]

num_generations = 100
num_parents_mating = 4
sol_per_pop = 8
num_genes = m

# Kept as module-level names; they are not passed to pygad.GA below because
# the genes are drawn from gene_space.
init_range_low = -2
init_range_high = 5

parent_selection_type = "sss"
keep_parents = 2
crossover_type = "single_point"
mutation_type = "random"
# Mutating 100% of the genes would overwrite everything crossover produced;
# mutate only a small share of the genes in each offspring instead.
mutation_percent_genes = 10

ga_instance = pygad.GA(
    gene_space=gene_space,
    gene_type=int,  # keep genes as exact 0/1 integers
    num_generations=num_generations,
    num_parents_mating=num_parents_mating,
    fitness_func=fitness_function,
    sol_per_pop=sol_per_pop,
    num_genes=num_genes,
    parent_selection_type=parent_selection_type,
    keep_parents=keep_parents,
    crossover_type=crossover_type,
    mutation_type=mutation_type,
    mutation_percent_genes=mutation_percent_genes,
    random_seed=42,  # same seed as the model and the data split
)
ga_instance.run()

solution, solution_fitness, solution_idx = ga_instance.best_solution()
print(f"Parameters of the best solution : {solution}")
print(f"Fitness value of the best solution = {solution_fitness}")
I have found the solution.
The change I made was to add a random initialisation of the subset of features. After evaluating the model trained on that subset and comparing it with the model trained on the complete set of features, I even observed an increase in performance.
# Performance with subset of features:
# 0.9440559440559441
# Performance with all the features:
# 0.9370629370629371
import pygad
import numpy
from sklearn.model_selection import train_test_split, cross_val_score
from src.learner_params import target_column, model_features
from sklearn.datasets import load_breast_cancer
from lightgbm import LGBMClassifier as lgbm
from sklearn.metrics import roc_auc_score
from numpy.random import RandomState
# One seed drives the NumPy random state, the model and the data split.
seed = 1234
state = RandomState(seed)

# Load scikit-learn's breast-cancer dataset and wrap the feature matrix in a
# DataFrame so that columns can later be selected by feature name.
bc = load_breast_cancer()
function_inputs = bc.feature_names
X = pd.DataFrame(bc.data, columns=bc.feature_names)
y = bc.target

# LightGBM classifier with library-default hyperparameters and a fixed seed.
bst = lgbm(random_state=seed)

# Hold out a test split; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
def fitness_func(ga_instance, solution, solution_idx):
    """Score one chromosome by cross-validated ROC AUC on its selected features.

    Args:
        ga_instance: The ``pygad.GA`` instance invoking the function (unused).
        solution: 1-D sequence of genes; every non-zero gene selects the
            feature at the same position in ``bc.feature_names``.
        solution_idx: Index of the solution within the population (unused).

    Returns:
        Mean ROC AUC over 5 cross-validation folds on the training split,
        or 0.0 when the chromosome selects no feature at all.
    """
    mask = np.asarray(solution, dtype=bool)

    # A model cannot be fitted on zero columns; give the empty subset the
    # lowest possible AUC instead of letting cross_val_score fail.
    if not mask.any():
        return 0.0

    selected_features = np.asarray(bc.feature_names)[mask]
    X_tmp = X_train.loc[:, selected_features]
    return cross_val_score(bst, X_tmp, y_train, scoring="roc_auc", cv=5).mean()
m = len(bc.feature_names)
fitness_function = fitness_func

# gene_space lists the values a gene is allowed to take; it is not a starting
# solution.  Each gene is a 0/1 switch for one feature (0 = drop, 1 = keep).
# The previous random 0/1 array only worked because it happened to contain
# both values; stating the two values explicitly removes that dependence.
gene_space = [0, 1]

num_generations = 30
# NOTE(review): a population of two with two mating parents is very small
# and leaves little room for exploration - confirm this is intended.
num_parents_mating = 2
sol_per_pop = 2
num_genes = m
parent_selection_type = "sss"
keep_parents = 2
crossover_type = "single_point"
mutation_type = "random"
mutation_percent_genes = 15

ga_instance = pygad.GA(
    gene_space=gene_space,
    gene_type=int,  # keep genes as exact 0/1 integers
    num_generations=num_generations,
    num_parents_mating=num_parents_mating,
    fitness_func=fitness_function,
    sol_per_pop=sol_per_pop,
    num_genes=num_genes,
    parent_selection_type=parent_selection_type,
    keep_parents=keep_parents,
    crossover_type=crossover_type,
    mutation_type=mutation_type,
    mutation_percent_genes=mutation_percent_genes,
    random_seed=seed,
)
ga_instance.run()

solution, solution_fitness, solution_idx = ga_instance.best_solution()
print(f"Parameters of the best solution : {solution}")
print(f"Fitness value of the best solution = {solution_fitness}")
print(f"Number of features selected = {sum(solution)}")

# Column names picked by the best chromosome (previously never defined,
# which made the subset evaluation below fail with a NameError).
selected_ = X_train.columns[np.asarray(solution, dtype=bool)]

# Baseline: fit on every feature and score on the held-out test split.
model = bst.fit(X_train, y_train)
print("Performance with all the features:")
print(model.score(X_test, y_test))

# Same estimator refitted on the selected subset only.
model = bst.fit(X_train.loc[:, selected_], y_train)
print("Performance with subset of features:")
print(model.score(X_test.loc[:, selected_], y_test))