I have a pipeline that runs fine with the following hyperparameter distributions:
# Processing chain: MinMaxScaler -> VarianceThreshold -> PCA ->
# SelectPercentile -> RandomForestClassifier.
# The step names ('scale', 'vt', 'pca', 'select', 'clf') are the prefixes
# used by the '<step>__<param>' keys of the search space.
pipe = Pipeline([
    ('scale', MinMaxScaler()),
    ('vt', VarianceThreshold()),
    ('pca', PCA(random_state=0)),
    ('select', SelectPercentile()),
    ('clf', RandomForestClassifier(random_state=0)),
])
# Search space shared by every candidate configuration.
# Keys follow the '<pipeline step>__<parameter>' convention.
# Note: uniform(loc, scale) samples from [loc, loc + scale], and
# randint(low, high) samples integers from low up to high - 1.
hyper_params0 = {
    'vt__threshold': stats.uniform(loc=0, scale=0.1),
    'pca__n_components': stats.uniform(loc=0.8, scale=0.19),
    'select__percentile': stats.randint(low=1, high=101),
    'clf__n_estimators': stats.randint(low=50, high=1000),
    'clf__criterion': ['gini', 'entropy'],
    'clf__min_samples_split': stats.uniform(loc=0, scale=0.1),
    'clf__min_samples_leaf': stats.uniform(loc=0, scale=0.1),
    'clf__max_features': ['sqrt', 'log2', None],
    'clf__bootstrap': [True, False],
}
# One search space per scoring function: each entry is the shared space
# plus a single-choice 'select__score_func' setting.
hyper_params = [
    {**hyper_params0, 'select__score_func': [score_func]}
    for score_func in (mutual_info_classif, f_classif)
]
# Randomized search over the pipeline: 25 sampled candidates, 5-fold
# cross-validation, scored with macro-averaged F1.
rscv = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=hyper_params,
    n_iter=25,
    scoring='f1_macro',
    cv=5,
    random_state=0,
    n_jobs=-1,
    verbose=3,
)

# Hold out 20% of the data; the search is fitted on the training part only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
rscv.fit(X_train, y_train)
What I would like to do is search over the n_neighbors parameter of mutual_info_classif when it is used as the score function within SelectPercentile.
I tried editing hyper_params like so:
# Attempted search space: tries to reach n_neighbors of mutual_info_classif
# through the 'select__score_func__n_neighbors' parameter path.
# This is the variant reported to fail with
# "AttributeError: 'function' object has no attribute 'set_params'".
hyper_params = [
    {
        **hyper_params0,
        'select__score_func': [mutual_info_classif],
        'select__score_func__n_neighbors': stats.randint(low=3, high=15),
    },
    {
        **hyper_params0,
        'select__score_func': [f_classif],
    },
]
But I get the error AttributeError: 'function' object has no attribute 'set_params'. I loosely followed an example on the scikit-learn site here, but didn't get very far. I also tried using 'passthrough' like this:
# Second attempt: the 'select' step is the 'passthrough' placeholder string
# instead of a SelectPercentile instance.
pipe = Pipeline([
    ('scale', MinMaxScaler()),
    ('vt', VarianceThreshold()),
    ('pca', PCA(random_state=0)),
    ('select', 'passthrough'),
    ('clf', RandomForestClassifier(random_state=0)),
])
...
# Attempted search space for the 'passthrough' pipeline: selector instances
# are supplied under 'select__score_func' while the 'select' step itself is
# still the string 'passthrough'.
# This is the variant reported to fail with
# "AttributeError: 'str' object has no attribute 'set_params'".
hyper_params = [
    {
        **hyper_params0,
        'select__score_func': [SelectPercentile(mutual_info_classif)],
        'select__score_func__n_neighbors': stats.randint(low=3, high=15),
    },
    {
        **hyper_params0,
        'select__score_func': [SelectPercentile(f_classif)],
    },
]
But then I get the error AttributeError: 'str' object has no attribute 'set_params'.
Any advice on how to do this?
As the error suggests, mutual_info_classif is a function, so GridSearchCV (and likewise RandomizedSearchCV) cannot set parameters on it using the __ syntax. The search can set parameters only on objects that follow the BaseEstimator design, i.e. objects that provide set_params.
First, you need to create a custom SelectPercentile subclass that takes n_neighbors as a constructor parameter.
class SelectPercentileMI(SelectPercentile):
    """SelectPercentile scored by mutual_info_classif with a tunable n_neighbors.

    Exposes ``n_neighbors`` as a constructor parameter so that it can be
    addressed as ``<step>__n_neighbors`` in a parameter search, which is not
    possible when ``mutual_info_classif`` is passed as a plain function.

    Parameters
    ----------
    percentile : int, default=10
        Forwarded unchanged to ``SelectPercentile``.
    n_neighbors : int, default=3
        Forwarded to ``mutual_info_classif`` as its ``n_neighbors`` argument.
    """

    def __init__(self, percentile=10, n_neighbors=3):
        self.n_neighbors = n_neighbors
        # Use the requested value rather than a hard-coded 3 so that a freshly
        # constructed instance already scores with the right setting.
        super().__init__(
            score_func=partial(mutual_info_classif, n_neighbors=n_neighbors),
            percentile=percentile,
        )

    def fit(self, X, y=None):
        """Rebuild the scorer from the current ``n_neighbors`` and fit.

        ``set_params(n_neighbors=...)`` only updates the attribute; it does not
        re-run ``__init__``. Rebuilding the scorer here guarantees that the
        value chosen by a parameter search is the one actually used.

        Returns
        -------
        Whatever ``SelectPercentile.fit`` returns.
        """
        self.score_func = partial(
            mutual_info_classif, n_neighbors=self.n_neighbors
        )
        return super().fit(X, y)
Now your problem is resolved, as the following complete example shows:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2, SelectPercentile, mutual_info_classif, f_classif
from scipy import stats
from functools import partial
# 'feature_selector' starts as the 'passthrough' placeholder; the actual
# selector candidates are supplied through the parameter grid.
pipe = Pipeline(steps=[
    ('feature_selector', 'passthrough'),
    ('classify', LinearSVC(dual=False, max_iter=10000)),
])
# Candidate values for the classifier's C, shared by both grids.
C_OPTIONS = [1, 10]

# Two grids, one per selector. Only the mutual-information selector has an
# n_neighbors parameter, so only its grid lists values for it.
param_grid = [
    {
        'feature_selector': [SelectPercentile(score_func=f_classif)],
        'classify__C': C_OPTIONS,
    },
    {
        'feature_selector': [SelectPercentileMI()],
        'feature_selector__n_neighbors': [2, 3],
        'classify__C': C_OPTIONS,
    },
]
# Exhaustive search over both grids on the digits dataset.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, n_jobs=1)
X, y = load_digits(return_X_y=True)
grid.fit(X, y)
# Bare expression: shows the winning parameter combination when run in an
# interactive session.
grid.best_params_
{'classify__C': 10, 'feature_selector': SelectPercentileMI(n_neighbors=3, percentile=10), 'feature_selector__n_neighbors': 3}