Tags: scikit-learn, cross-validation, gridsearchcv

Specify columns to be selected in parameter grid for GridSearchCV


I want to train models with certain sets of features as hyperparameters, using sklearn's GridSearchCV.

An example parameter grid would be:

[
    {
        'clf': [LogisticRegression()],
        'clf__C': [0.5, 0.1, 0.05, 0.01],
        'coltrans__feature_selector__feature_names': [
            ['COUNT(activities)', 'COUNT(events WHERE device_category = desktop)'], 
            ['COUNT(activities)']
        ]
    },
    {
        'clf': [DummyClassifier()],
        'clf__strategy': ['prior', 'most_frequent'],
        'coltrans__feature_selector__feature_names': [
            ['COUNT(activities)', 'COUNT(events WHERE device_category = desktop)'], 
            ['COUNT(activities)']
        ]
    }
]

This means I'd like GridSearchCV to train 4 logistic regressions (one for each value of C) using the feature set ['COUNT(activities)', 'COUNT(events WHERE device_category = desktop)'] and another 4 using the feature set ['COUNT(activities)']. The same goes for the dummy model and its two strategies.
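
As a sanity check, you can enumerate the candidates with sklearn's ParameterGrid, which accepts the same list-of-dicts format as GridSearchCV. A minimal sketch, assuming the grid above is bound to a variable named param_grid:

from sklearn.model_selection import ParameterGrid

grid = ParameterGrid(param_grid)
print(len(grid))  # 12 candidates: (4 C values + 2 dummy strategies) x 2 feature sets
for params in grid:
    print(params['coltrans__feature_selector__feature_names'])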

Here's what I've tried:

import pandas as pd
from typing import List, Dict
from functools import reduce
from utils import ClfSwitcher, update_pgrid

from optbinning import BinningProcess
from sklearn.model_selection import cross_validate, GridSearchCV, KFold
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.dummy import DummyClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression

# Feature selector transformer: given a set of feature names, it outputs a
# dataframe with all columns whose names contain one of the given feature names.

class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_names):
        self.feature_names = feature_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
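        # keep every column whose name contains one of the requested feature names (substring match)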
        selected_features = [col for col in X.columns if any(name in col for name in self.feature_names)]
        return X[selected_features]


# nested cross validation setup

n_folds = 3
scoring = {'auc': 'roc_auc', 'log_loss': 'neg_log_loss', 'brier_score': 'neg_brier_score'}
p_grid = [
    {
        'clf': [LogisticRegression()],
        'clf__C': [0.5, 0.1, 0.05, 0.01],
        'coltrans__feature_selector__feature_names': [
            ['COUNT(activities)', 'COUNT(events WHERE device_category = desktop)'], 
            ['COUNT(activities)']
        ]
    },
    {
        'clf': [DummyClassifier()],
        'clf__strategy': ['prior', 'most_frequent'],
        'coltrans__feature_selector__feature_names': [
            ['COUNT(activities)', 'COUNT(events WHERE device_category = desktop)'], 
            ['COUNT(activities)']
        ]
    }
]


inner_cv = KFold(n_splits=n_folds, shuffle=True, random_state=1)
outer_cv = KFold(n_splits=n_folds, shuffle=True, random_state=3)

# get the names of categorical and numerical features
num_vars = []
cat_vars = []
for v, t in zip(X.dtypes.index, X.dtypes):
    if ("int" in str(t)) or ("float" in str(t)):
        num_vars.append(v)
    else:
        cat_vars.append(v)

# initialize the transformers that will go in the ColumnTransformer

imp = SimpleImputer(strategy="median")
scl = StandardScaler()
ohe = OneHotEncoder(
    drop="first", handle_unknown="infrequent_if_exist", min_frequency=0.1
)

feature_selector = FeatureSelector(feature_names=['COUNT(activities)', 'COUNT(events WHERE device_category = desktop)'])

# build the ColumnTransformer

t = [
    ("imp_scale", make_pipeline(imp, scl), num_vars),
    ("ohe", ohe, cat_vars),
    ("feature_selector", feature_selector, cat_vars + num_vars),
]
    
col_transformer = ColumnTransformer(transformers=t, remainder='drop')
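# note: a ColumnTransformer applies each transformer to its columns independently
# and concatenates the outputs side by side (it does not chain them)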


# create a pipeline
pipe = Pipeline([
    ('coltrans', col_transformer),
    ('clf', DummyClassifier()),
])

# run cross-validation

clf = GridSearchCV(estimator=pipe, param_grid=p_grid, cv=inner_cv, refit=True, error_score='raise')

cv_results = cross_validate(
    clf,
    X,
    y,
    cv=outer_cv,
    scoring=scoring,
    return_estimator=False,
)

auc = reduce(lambda x, y: x + y, cv_results["test_auc"]) / n_folds
log_loss = reduce(lambda x, y: x + y, cv_results["test_log_loss"]) / n_folds


print(
    " AUC estimate: ",
    auc,
    "\n",
    "Log loss estimate: ",
    log_loss,
)

Here's the thing: if I modify my column transformer in the following way:

t = [
    ('feature_selector', feature_selector, cat_vars+num_vars),
]
    
col_transformer = ColumnTransformer(transformers=t, remainder='drop')

And then apply it to X:

col_transformer.fit_transform(X)

I get an array with only two columns; it works perfectly. The catch is that I have to put the feature_selector transformer inside a ColumnTransformer because it needs the column names to work. What I can't figure out is how to select the features I want and then make sure they go through all the other transformations (imputing and one-hot encoding). The code I wrote runs, but after the column transformer I get an array with all my initial numerical features plus all the dummy columns created by one-hot encoding.

I've tried using mlxtend's feature_selection in the actual pipeline, but then I don't really know the indices of the features I want to select, since they've gone through one-hot encoding (is there a way to circumvent this?).


Solution

  • In your original approach:

    t = [
        ("imp_scale", make_pipeline(imp, scl), num_vars),
        ("ohe", ohe, cat_vars),
        ("feature_selector", feature_selector, cat_vars + num_vars),
    ]

    col_transformer = ColumnTransformer(transformers=t, remainder='drop')
    

    you end up with every numerical and categorical feature, transformed by the first two transformers, plus the one or two features you want, included untransformed via the last transformer: a ColumnTransformer applies its transformers in parallel and concatenates their outputs rather than chaining them. (See also Consistent ColumnTransformer for intersecting lists of columns and its linked questions.)

    It seems like you want to include just the subset of features and transform them accordingly, so you should pipeline the selector before the rest of the transformations:

    processor = ColumnTransformer(t[:-1], remainder='drop')
    
    pipe = Pipeline([
        ('select', feature_selector),
        ('process', processor),
    ])
    

    Since your feature selector produces a dataframe, you don't have to worry about the column transformer getting feature names, but you don't know in advance which subset of features will even reach it. You can use a callable in the column specification instead of a hard-coded list (and you've essentially already written one):

    def num_type_detector(X):
        num_vars = []
        for v, t in zip(X.dtypes.index, X.dtypes):
            if ("int" in str(t)) or ("float" in str(t)):
                num_vars.append(v)
        return num_vars
    
    def cat_type_detector(X):
        cat_vars = []
        for v, t in zip(X.dtypes.index, X.dtypes):
            if not (("int" in str(t)) or ("float" in str(t))):
                cat_vars.append(v)
        return cat_vars
    
    processor = ColumnTransformer(
        [
            ("imp_scale", make_pipeline(imp, scl), num_type_detector),
            ("ohe", ohe, cat_type_detector),
        ],
        remainder='drop',
    )
    
    pipe = Pipeline([
        ('select', feature_selector),
        ('process', processor),
    ])
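
    Note that re-arranging the pipeline changes the parameter names in your grid: the selector is now the top-level step 'select' rather than being nested inside the column transformer. A sketch of the adjusted grid, assuming you append your 'clf' step back onto this pipeline:

    pipe = Pipeline([
        ('select', feature_selector),
        ('process', processor),
        ('clf', DummyClassifier()),
    ])

    p_grid = [
        {
            'clf': [LogisticRegression()],
            'clf__C': [0.5, 0.1, 0.05, 0.01],
            'select__feature_names': [
                ['COUNT(activities)', 'COUNT(events WHERE device_category = desktop)'],
                ['COUNT(activities)'],
            ],
        },
        {
            'clf': [DummyClassifier()],
            'clf__strategy': ['prior', 'most_frequent'],
            'select__feature_names': [
                ['COUNT(activities)', 'COUNT(events WHERE device_category = desktop)'],
                ['COUNT(activities)'],
            ],
        },
    ]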
    

    You should consider a more elegant version of num_type_detector, e.g. using make_column_selector (docs).
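
    For instance, a sketch of the same processor built with make_column_selector, treating every non-numeric column as categorical (which matches the dtype-string check above):

    import numpy as np
    from sklearn.compose import ColumnTransformer, make_column_selector

    processor = ColumnTransformer(
        [
            ("imp_scale", make_pipeline(imp, scl), make_column_selector(dtype_include=np.number)),
            ("ohe", ohe, make_column_selector(dtype_exclude=np.number)),
        ],
        remainder="drop",
    )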


    If you were to use a less-custom feature selector, you could use the pandas-out functionality included from sklearn v1.2. That doesn't work for sparse arrays (yet), so you'll need a dense one-hot encoding (sparse_output=False from v1.2 onward, sparse=False before), and you might have problems with mixed types.
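
    For example, a minimal sketch assuming sklearn >= 1.2 (where the encoder's dense/sparse switch is named sparse_output):

    import numpy as np
    from sklearn.compose import ColumnTransformer, make_column_selector

    ohe = OneHotEncoder(
        drop="first",
        handle_unknown="infrequent_if_exist",
        min_frequency=0.1,
        sparse_output=False,  # set_output(transform="pandas") requires dense output
    )
    processor = ColumnTransformer(
        [
            ("imp_scale", make_pipeline(imp, scl), make_column_selector(dtype_include=np.number)),
            ("ohe", ohe, make_column_selector(dtype_exclude=np.number)),
        ],
        remainder="drop",
    )
    processor.set_output(transform="pandas")  # downstream steps now receive DataFrames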