Tags: python, scikit-learn, pipeline, xgboost, feature-selection

Find and use top 10 features in XGBoost regression pipeline


I want to get the top 10 features from my XGBRegressor. With ft_weights_xgb_reg.sort_values(by='weight', ascending=False).head(10) I can see the top 10 features, but how can I use this selection in my pipeline?

I have this class FeatureSelector_Only_Top_10; how can I make it keep only the top 10 features so that they can be printed later, for example with print(grid.feature_selection_top_10.top10features)?

Imports:

import time
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.manifold import TSNE
from sklearn.datasets import make_classification
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso

XGB:

xgb_reg_start = time.time()

xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train_nor, y_train)
training_preds_xgb_reg = xgb_reg.predict(X_train_nor)
val_preds_xgb_reg = xgb_reg.predict(X_test_nor)

xgb_reg_end = time.time()

print(f"Time taken to run: {round((xgb_reg_end - xgb_reg_start)/60,1)} minutes")
print("\nTraining MSE:", round(metrics.mean_squared_error(y_train, training_preds_xgb_reg),4))
print("Validation MSE:", round(metrics.mean_squared_error(y_test, val_preds_xgb_reg),4))
print("\nTraining r2:", round(metrics.r2_score(y_train, training_preds_xgb_reg),4))
print("Validation r2:", round(metrics.r2_score(y_test, val_preds_xgb_reg),4))

ft_weights_xgb_reg = pd.DataFrame(xgb_reg.feature_importances_, columns=['weight'], index=X_train.columns)
# Top 10 features by importance weight
ft_weights_xgb_reg.sort_values(by='weight', ascending=False).head(10)

Pipeline:

class FeatureSelector_Only_Top_10(BaseEstimator, TransformerMixin):
    def __init__(self, n_components=10):
        self.n_components = n_components

    def fit(self, X, y=None):
        # Don't know
        return self

    def transform(self, X, y=None):
        # Don't know
        return X

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

steps = [#('feature_selection_top_10', FeatureSelector_Only_Top_10()),
         #('feature_selection', SelectFromModel(estimator=LogisticRegression(max_iter=100))),
         ('lasso', Lasso(alpha=0.03))]

pipeline = Pipeline(steps)
parameters = {}

grid = GridSearchCV(pipeline, param_grid=parameters, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)                    
print("score = %3.2f" %(grid.score(X_test,y_test)))

Solution

  • You can include SelectFromModel in the pipeline to extract the top 10 features based on their importance weights; there is no need to create a custom transformer. As explained in the documentation, if you want to select exactly 10 features you need to set max_features=10 and threshold=-np.inf.

    import numpy as np
    import pandas as pd
    from xgboost import XGBRegressor
    from sklearn.pipeline import Pipeline
    from sklearn.datasets import make_regression
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import LinearRegression
    
    X, y = make_regression(n_features=100, n_samples=1000, random_state=42)
    
    X = pd.DataFrame(data=X, columns=['x' + str(i) for i in range(X.shape[1])])
    y = pd.Series(y, name='y')
    
    pipeline = Pipeline([
        # threshold=-np.inf disables the importance cutoff, so exactly
        # max_features=10 features are kept
        ('selector', SelectFromModel(estimator=XGBRegressor(), max_features=10, threshold=-np.inf)),
        ('regressor', LinearRegression())
    ])
    
    pipeline.fit(X, y)
    
    selected_features = pipeline['selector'].get_support()
    print(selected_features.sum())
    # 10
    
    selected_features_names = X.columns[selected_features].tolist()
    print(selected_features_names)
    # ['x0', 'x14', 'x17', 'x35', 'x42', 'x43', 'x57', 'x71', 'x84', 'x95']
    
    selected_features_importances = pipeline['selector'].estimator_.feature_importances_[selected_features]
    print(selected_features_importances)
    # [0.09361505 0.18474296 0.14420615 0.01952794 0.10946904 0.02192107 0.03307951 0.02948984 0.02851948 0.1216883]
    
    selected_features_coefficients = pipeline['regressor'].coef_
    print(selected_features_coefficients)
    # [49.43000693 83.91437854 78.25242596 -0.76411769 56.67970515  0.16829694 28.81967319  0.50277914 24.55006237 68.17120687]
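
  • If you also want this to work inside a GridSearchCV, as in your original code, the fitted steps are reached through best_estimator_ rather than as attributes on the grid object (so grid.feature_selection_top_10 from your question will not work). A minimal sketch, reusing the pipeline above with an empty parameter grid as in your code:

    from sklearn.model_selection import GridSearchCV

    grid = GridSearchCV(pipeline, param_grid={}, cv=5)
    grid.fit(X, y)

    # The refitted pipeline is exposed as best_estimator_; read the
    # selector's support mask from it exactly as above
    best_selector = grid.best_estimator_['selector']
    print(X.columns[best_selector.get_support()].tolist())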
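
  • If you would still rather fill in your own FeatureSelector_Only_Top_10, here is a minimal sketch of what fit and transform could look like. This is one possible implementation, not the canonical one: it ranks features with an auxiliary XGBRegressor inside fit (mirroring how you computed ft_weights_xgb_reg) and stores the selected names in a top10features attribute, so that after fitting you can print grid.best_estimator_['feature_selection_top_10'].top10features:

    from sklearn.base import BaseEstimator, TransformerMixin

    class FeatureSelector_Only_Top_10(BaseEstimator, TransformerMixin):
        def __init__(self, n_components=10):
            self.n_components = n_components

        def fit(self, X, y=None):
            # Rank features with an auxiliary XGBRegressor and keep the
            # indices of the n_components most important ones
            importances = XGBRegressor().fit(X, y).feature_importances_
            self.top_indices_ = np.argsort(importances)[::-1][:self.n_components]
            # Remember the names too (fall back to indices for plain arrays)
            self.top10features = (
                X.columns[self.top_indices_].tolist()
                if hasattr(X, 'columns')
                else self.top_indices_.tolist()
            )
            return self

        def transform(self, X, y=None):
            # Reduce X to the selected columns
            return np.asarray(X)[:, self.top_indices_]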