python machine-learning scikit-learn xgboost knn

Printing out Features used in Feature Selection with XGBoost Feature Importance Scores

I'm using XGBoost Feature Importance Scores to perform Feature Selection in my KNN Model using the following code (taken from this article):

# this section for training and testing the algorithm after feature selection

#dataset spliting
X = df.iloc[:, 0:17]
y_bin = df.iloc[:, 17]


# spliting the dataset into train, test and validate for binary classification
X_train, X_test, y_bin_train, y_bin_test = train_test_split(X, y_bin, random_state=0, test_size=0.2)

# fit model on training data
model = XGBClassifier()
model.fit(X_train, y_bin_train)

# using normalization technique to feature scale the training data
norm = MinMaxScaler()
X_train= norm.fit_transform(X_train)
X_test= norm.transform(X_test)

#oversampling
smote= SMOTE()
X_train, y_bin_train = smote.fit_resample(X_train,y_bin_train)

# Fit model using each importance as a threshold
thresholds = sort(model.feature_importances_)
for thresh in thresholds:
  # select features using threshold
  selection = SelectFromModel(model, threshold=thresh, prefit=True)
  select_X_train = selection.transform(X_train)
  
  # train model
  knn = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
  knn.fit(select_X_train, y_bin_train)

  # eval model
  select_X_test = selection.transform(X_test)
  y_pred = knn.predict(select_X_test)

  report = classification_report(y_bin_test,y_pred)
  print("Thresh= {} , n= {}\n {}" .format(thresh, select_X_train.shape[1], report))
  cm = confusion_matrix(y_bin_test, y_pred)
  print(cm)

The output that I'm getting is showing me for each iteration the number of featues used select_X_train.shape[1], the threshhold that is used everytime a feature is removed thresh, the classification report, and the confusion matrix:

Thresh= 0.0 , n= 17
               precision    recall  f1-score   support

           0       0.98      0.96      0.97     42930
           1       0.87      0.92      0.89     11996

    accuracy                           0.95     54926
   macro avg       0.92      0.94      0.93     54926
weighted avg       0.95      0.95      0.95     54926

[[41226  1704]
 [  909 11087]]
Thresh= 0.007143254857510328 , n= 16
               precision    recall  f1-score   support

           0       0.98      0.96      0.97     42930
           1       0.87      0.92      0.89     11996

    accuracy                           0.95     54926
   macro avg       0.92      0.94      0.93     54926
weighted avg       0.95      0.95      0.95     54926

[[41226  1704]
 [  909 11087]]

This output will keep on going until the number of features used reaches 1 (n=1). What I want to do is that i want to also include the names of the features used (or removed) in each iteration but I can't figure it out. Is there a way to get it done?

Solution

You can use

X.columns[selector.get_support()].to_list()

to extract the list of names of the selected features, where X is the pandas data frame with the feature values and selector is the SelectFromModel meta-transformer. See also this answer.

import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

# generate some data
df = pd.DataFrame({
    'x1': np.random.normal(0, 1, 100),
    'x2': np.random.normal(2, 3, 100),
    'x3': np.random.normal(4, 5, 100),
    'y': np.random.choice([0, 1], 100),
})

# extract the features and target
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

# scale the data
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# resample the data
smote = SMOTE()
X_train, y_train = smote.fit_resample(X_train, y_train)

# fit the XGBoost classifier using all the features
model = XGBClassifier()
model.fit(X_train, y_train)

# fit the KNN classifier using each feature importance 
# value as a feature selection threshold
thresholds = np.sort(model.feature_importances_)

for threshold in thresholds:

    # select the features
    selector = SelectFromModel(model, threshold=threshold, prefit=True)
    X_train_ = selector.transform(X_train)
    X_test_ = selector.transform(X_test)

    # extract the names of the selected features 
    selected_features = X.columns[selector.get_support()].to_list()

    # train the model
    knn = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
    knn.fit(X_train_, y_train)

    # generate the model predictions
    y_pred = knn.predict(X_test_)

    # calculate the model performance metrics
    report = classification_report(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    print('Threshold: {}'.format(threshold))
    print('Selected features: \n {}'.format(selected_features))
    print('Confusion matrix: \n {}'.format(cm))
    print('Classification report: \n {}'.format(report))
    print('----------------------------')

# Threshold: 0.2871088981628418
# Selected features: 
#  ['x1', 'x2', 'x3']
# Confusion matrix: 
#  [[6 0]
#  [7 7]]
# Classification report: 
#                precision    recall  f1-score   support
#
#            0       0.46      1.00      0.63         6
#            1       1.00      0.50      0.67        14
#
#     accuracy                           0.65        20
#    macro avg       0.73      0.75      0.65        20
# weighted avg       0.84      0.65      0.66        20
#
# ----------------------------
# Threshold: 0.34210699796676636
# Selected features: 
#  ['x1', 'x3']
# Confusion matrix: 
#  [[ 4  2]
#  [10  4]]
# Classification report: 
#                precision    recall  f1-score   support
#
#            0       0.29      0.67      0.40         6
#            1       0.67      0.29      0.40        14
#
#     accuracy                           0.40        20
#    macro avg       0.48      0.48      0.40        20
# weighted avg       0.55      0.40      0.40        20
#
# ----------------------------
# Threshold: 0.37078407406806946
# Selected features: 
#  ['x1']
# Confusion matrix: 
#  [[3 3]
#  [5 9]]
# Classification report: 
#                precision    recall  f1-score   support
#
#            0       0.38      0.50      0.43         6
#            1       0.75      0.64      0.69        14
#
#     accuracy                           0.60        20
#    macro avg       0.56      0.57      0.56        20
# weighted avg       0.64      0.60      0.61        20
#
# ----------------------------