Search code examples
python, machine-learning, scikit-learn, xgboost, knn

Printing out Features used in Feature Selection with XGBoost Feature Importance Scores


I'm using XGBoost Feature Importance Scores to perform Feature Selection in my KNN Model using the following code (taken from this article):

# Evaluate a KNN classifier on progressively smaller feature subsets, where
# the subsets are chosen from XGBoost feature-importance scores.

# The features are the first 17 columns; the binary target is column 17.
X = df.iloc[:, 0:17]
y_bin = df.iloc[:, 17]

# Hold out 20% of the rows as the test split (binary classification).
X_train, X_test, y_bin_train, y_bin_test = train_test_split(
    X, y_bin, random_state=0, test_size=0.2
)

# Fit XGBoost on the raw training split; its importances drive the selection.
model = XGBClassifier()
model.fit(X_train, y_bin_train)

# Min-max scale both splits using statistics fitted on the training split.
norm = MinMaxScaler()
X_train = norm.fit_transform(X_train)
X_test = norm.transform(X_test)

# Oversample the (scaled) training split with SMOTE.
smote = SMOTE()
X_train, y_bin_train = smote.fit_resample(X_train, y_bin_train)

# Use every sorted importance value in turn as the selection threshold.
thresholds = sort(model.feature_importances_)
for thresh in thresholds:
    # Reduce both splits to the features selected at this threshold.
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    select_X_test = selection.transform(X_test)

    # Train KNN on the reduced training split and predict the reduced test split.
    knn = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
    knn.fit(select_X_train, y_bin_train)
    y_pred = knn.predict(select_X_test)

    # Compute the metrics, then print the report followed by the matrix.
    report = classification_report(y_bin_test, y_pred)
    cm = confusion_matrix(y_bin_test, y_pred)
    print("Thresh= {} , n= {}\n {}" .format(thresh, select_X_train.shape[1], report))
    print(cm)

For each iteration, the output shows the number of features used (select_X_train.shape[1]), the threshold applied each time a feature is removed (thresh), the classification report, and the confusion matrix:

Thresh= 0.0 , n= 17
               precision    recall  f1-score   support

           0       0.98      0.96      0.97     42930
           1       0.87      0.92      0.89     11996

    accuracy                           0.95     54926
   macro avg       0.92      0.94      0.93     54926
weighted avg       0.95      0.95      0.95     54926

[[41226  1704]
 [  909 11087]]
Thresh= 0.007143254857510328 , n= 16
               precision    recall  f1-score   support

           0       0.98      0.96      0.97     42930
           1       0.87      0.92      0.89     11996

    accuracy                           0.95     54926
   macro avg       0.92      0.94      0.93     54926
weighted avg       0.95      0.95      0.95     54926

[[41226  1704]
 [  909 11087]]

This output will keep on going until the number of features used reaches 1 (n=1). What I want to do is also include the names of the features used (or removed) in each iteration, but I can't figure out how. Is there a way to get it done?


Solution

  • You can use

    X.columns[selector.get_support()].to_list()
    

    to extract the list of names of the selected features, where X is the pandas data frame with the feature values and selector is the SelectFromModel meta-transformer. See also this answer.

    import pandas as pd
    import numpy as np
    from imblearn.over_sampling import SMOTE
    from xgboost import XGBClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.feature_selection import SelectFromModel
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.preprocessing import MinMaxScaler
    
    # End-to-end example: rank features with XGBoost, then train and evaluate
    # a KNN classifier on each importance-based feature subset, printing the
    # names of the selected features alongside the metrics.
    
    # generate some data (unseeded, so the numbers differ from run to run)
    df = pd.DataFrame({
        'x1': np.random.normal(0, 1, 100),
        'x2': np.random.normal(2, 3, 100),
        'x3': np.random.normal(4, 5, 100),
        'y': np.random.choice([0, 1], 100),
    })
    
    # extract the features (all columns but the last) and the target (last column)
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    # split the data, holding out 20% of the rows for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
    
    # scale the data; the scaler is fitted on the training split only
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
    # resample the training data
    smote = SMOTE()
    X_train, y_train = smote.fit_resample(X_train, y_train)
    
    # fit the XGBoost classifier using all the features
    model = XGBClassifier()
    model.fit(X_train, y_train)
    
    # fit the KNN classifier using each distinct feature importance value
    # as a feature selection threshold; np.unique returns the sorted unique
    # values, so features with equal importance (e.g. several zeros) do not
    # trigger repeated, identical iterations
    thresholds = np.unique(model.feature_importances_)
    
    for threshold in thresholds:
    
        # select the features
        selector = SelectFromModel(model, threshold=threshold, prefit=True)
        X_train_ = selector.transform(X_train)
        X_test_ = selector.transform(X_test)
    
        # extract the names of the selected features; X is still the original
        # data frame, so its columns line up with the selector's boolean mask
        selected_features = X.columns[selector.get_support()].to_list()
    
        # train the model
        knn = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
        knn.fit(X_train_, y_train)
    
        # generate the model predictions
        y_pred = knn.predict(X_test_)
    
        # calculate the model performance metrics
        report = classification_report(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)
    
        print('Threshold: {}'.format(threshold))
        print('Selected features: \n {}'.format(selected_features))
        print('Confusion matrix: \n {}'.format(cm))
        print('Classification report: \n {}'.format(report))
        print('----------------------------')
    
    # Threshold: 0.2871088981628418
    # Selected features: 
    #  ['x1', 'x2', 'x3']
    # Confusion matrix: 
    #  [[6 0]
    #  [7 7]]
    # Classification report: 
    #                precision    recall  f1-score   support
    #
    #            0       0.46      1.00      0.63         6
    #            1       1.00      0.50      0.67        14
    #
    #     accuracy                           0.65        20
    #    macro avg       0.73      0.75      0.65        20
    # weighted avg       0.84      0.65      0.66        20
    #
    # ----------------------------
    # Threshold: 0.34210699796676636
    # Selected features: 
    #  ['x1', 'x3']
    # Confusion matrix: 
    #  [[ 4  2]
    #  [10  4]]
    # Classification report: 
    #                precision    recall  f1-score   support
    #
    #            0       0.29      0.67      0.40         6
    #            1       0.67      0.29      0.40        14
    #
    #     accuracy                           0.40        20
    #    macro avg       0.48      0.48      0.40        20
    # weighted avg       0.55      0.40      0.40        20
    #
    # ----------------------------
    # Threshold: 0.37078407406806946
    # Selected features: 
    #  ['x1']
    # Confusion matrix: 
    #  [[3 3]
    #  [5 9]]
    # Classification report: 
    #                precision    recall  f1-score   support
    #
    #            0       0.38      0.50      0.43         6
    #            1       0.75      0.64      0.69        14
    #
    #     accuracy                           0.60        20
    #    macro avg       0.56      0.57      0.56        20
    # weighted avg       0.64      0.60      0.61        20
    #
    # ----------------------------