Search code examples
python scikit-learn pipeline

RandomizedSearchCV Pipeline select hyperparameters of SelectPercentile using mutual_info_classif


What I have

I have a pipeline that runs with my hyperparameter distributions

# Scaling -> variance filter -> PCA -> univariate feature selection -> classifier.
# The step names ('scale', 'vt', 'pca', 'select', 'clf') are the prefixes used
# by the '<step>__<param>' keys of the search space.
pipe = Pipeline(steps=[
    ('scale', MinMaxScaler()),
    ('vt', VarianceThreshold()),
    ('pca', PCA(random_state=0)),
    ('select', SelectPercentile()),
    ('clf', RandomForestClassifier(random_state=0))
])

# Search space shared by every score function. Keys follow the
# '<step>__<param>' convention of the pipeline. scipy's uniform(loc, scale)
# samples from [loc, loc + scale]; randint(low, high) excludes `high`.
hyper_params0 = {
    'vt__threshold' : stats.distributions.uniform(0, 0.1),
    'pca__n_components' : stats.distributions.uniform(0.8, 0.19),  # i.e. 0.80 .. 0.99
    'select__percentile' : stats.distributions.randint(1, 101),  # i.e. 1 .. 100
    'clf__n_estimators' : stats.distributions.randint(50, 1000),
    'clf__criterion' : ['gini', 'entropy'],
    'clf__min_samples_split' : stats.distributions.uniform(0, 0.1),
    'clf__min_samples_leaf' : stats.distributions.uniform(0, 0.1),
    'clf__max_features' : ['sqrt', 'log2', None],
    'clf__bootstrap' : [True, False],
}

# Two alternative search spaces: both reuse hyper_params0 and differ only in
# the score function handed to the 'select' step.
hyper_params=[
    {
        **hyper_params0,
        **{
            'select__score_func' : [mutual_info_classif],
        }
    },
    {
        **hyper_params0,
        **{
            'select__score_func' : [f_classif],
        }
    }
]

# Randomized search over the list of search spaces: 25 sampled candidates,
# 5-fold cross-validation, scored by macro-averaged F1.
rscv = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=hyper_params,
    n_iter=25,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
    random_state=0,
    verbose=3
)

# Hold out 20% of the data; the search is fitted on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

rscv.fit(X_train, y_train)

What I want

What I would like to do is search through the n_neighbors parameter in the mutual_info_classif within SelectPercentile.

What I tried

I tried editing hyper_params like so:

# Attempt 1 (does not work): addresses n_neighbors as a nested parameter of
# score_func.
hyper_params=[
    {
        **hyper_params0,
        **{
            'select__score_func' : [mutual_info_classif],
            # NOTE: mutual_info_classif is a plain function with no set_params(),
            # so this nested key raises the AttributeError quoted below.
            'select__score_func__n_neighbors' : stats.distributions.randint(3, 15)
        }
    },
    {
        **hyper_params0,
        **{
            'select__score_func' : [f_classif],
        }
    }
]

But I get the error AttributeError: 'function' object has no attribute 'set_params'. I followed a loose example on the scikit-learn site here but didn't get very far. Also tried using 'passthrough' like this:

# Same pipeline, but the 'select' step is the placeholder string 'passthrough',
# meant to be replaced by the search.
pipe = Pipeline(steps=[
    ('scale', MinMaxScaler()),
    ('vt', VarianceThreshold()),
    ('pca', PCA(random_state=0)),
    ('select', 'passthrough'),
    ('clf', RandomForestClassifier(random_state=0))
])
...
# Attempt 2 (does not work): the keys still start with 'select__', i.e. they
# address parameters *of* the 'select' step, which is now the string
# 'passthrough' -- hence "'str' object has no attribute 'set_params'".
# Replacing the whole step would use the bare key 'select' instead.
hyper_params=[
    {
        **hyper_params0,
        **{
            'select__score_func' : [SelectPercentile(mutual_info_classif)],
            'select__score_func__n_neighbors' : stats.distributions.randint(3, 15)
        }
    },
    {
        **hyper_params0,
        **{
            'select__score_func' : [SelectPercentile(f_classif)],
        }
    }
]

But I get the error AttributeError: 'str' object has no attribute 'set_params'.

Question

Any advice on how to do this?


Solution

  • As the error suggests, mutual_info_classif is a plain function, so a search such as GridSearchCV or RandomizedSearchCV cannot set its parameters through the `__` syntax. Nested parameters can only be set on objects that follow the BaseEstimator design, i.e. that provide get_params/set_params.

    First, you need to create a custom SelectPercentile subclass that takes n_neighbors as a constructor parameter.

    class SelectPercentileMI(SelectPercentile):
        """SelectPercentile scored by mutual_info_classif with a tunable n_neighbors.

        Exposing ``n_neighbors`` as a constructor parameter makes it visible to
        ``get_params``/``set_params``, so it can be searched like any other
        hyperparameter (e.g. ``'<step>__n_neighbors'`` in a pipeline).

        Parameters
        ----------
        percentile : int, default=10
            Percent of features to keep (forwarded to SelectPercentile).
        n_neighbors : int, default=3
            Number of neighbors forwarded to mutual_info_classif.
        """

        def __init__(self, percentile=10, n_neighbors=3):
            self.n_neighbors = n_neighbors
            # Use the argument rather than a hard-coded 3, otherwise the
            # parameter is accepted but silently ignored.
            super().__init__(
                score_func=partial(mutual_info_classif, n_neighbors=n_neighbors),
                percentile=percentile,
            )

        def fit(self, X, y=None):
            """Rebuild the scorer from the current n_neighbors, then fit as usual."""
            # set_params() only rebinds self.n_neighbors; without this refresh
            # the scorer created in __init__ would keep the construction-time
            # value and a parameter search would evaluate identical models.
            self.score_func = partial(mutual_info_classif,
                                      n_neighbors=self.n_neighbors)
            return super().fit(X, y)
    

    Now, your problem is resolved.

    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.datasets import load_digits
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.svm import LinearSVC
    from sklearn.decomposition import PCA, NMF
    from sklearn.feature_selection import SelectKBest, chi2, SelectPercentile, mutual_info_classif, f_classif
    from scipy import stats
    from functools import partial
    
    
    # The selector step starts as 'passthrough'; each grid below swaps in a
    # concrete selector through the bare step name 'feature_selector'.
    pipe = Pipeline([
        ('feature_selector', 'passthrough'),
        ('classify', LinearSVC(dual=False, max_iter=10000))
    ])
    
    C_OPTIONS = [1, 10]
    param_grid = [
        # Grid 1: f_classif scoring; only the classifier's C is tuned.
        {
            'feature_selector': [SelectPercentile(f_classif)],
            'classify__C': C_OPTIONS
        },
        # Grid 2: mutual-information scoring; n_neighbors is an ordinary
        # estimator parameter of SelectPercentileMI, so '__' can address it.
        {
            'feature_selector': [SelectPercentileMI()],
            'feature_selector__n_neighbors' : [2,3],
            'classify__C': C_OPTIONS
        },
    ]
    
    # Exhaustive search over both grids on the digits dataset.
    grid = GridSearchCV(pipe, n_jobs=1, param_grid=param_grid)
    X, y = load_digits(return_X_y=True)
    grid.fit(X, y)
    
    # Best parameter combination found (shown when run interactively).
    grid.best_params_
    
    

    {'classify__C': 10, 'feature_selector': SelectPercentileMI(n_neighbors=3, percentile=10), 'feature_selector__n_neighbors': 3}