I have this code that creates a voting classifier and uses a grid search over a set of lists of (name, classifier) tuples in order to compare those ensembles against each other:
# Classifiers
clf1 = KNeighborsClassifier(n_neighbors=3)
clf2 = RandomForestClassifier(random_state=123)
clf3 = LogisticRegression(max_iter=1000)
clf4 = SVC()

# Voting classifier: majority ("hard") vote over the four base estimators.
vclf = VotingClassifier(
    estimators=[('knn', clf1), ('rf', clf2), ('lr', clf3), ('svm', clf4)],
    voting='hard',
)

# Training: shuffled 4-fold cross-validation with a fixed seed.
cv3 = KFold(n_splits=4, random_state=111, shuffle=True)

# Each label is kept next to its estimator so the two cannot drift apart.
# Zipping five estimators against four labels (no 'SVM' entry) makes zip()
# truncate silently: the SVC gets reported as "Voting Classifier" and the
# ensemble itself is never scored.
labelled_clfs = [
    ('KNN', clf1),
    ('Random Forest', clf2),
    ('Logistic Regression', clf3),
    ('SVM', clf4),
    ('Voting Classifier', vclf),
]
for label, clf in labelled_clfs:
    scores = cross_validate(clf, X_train, y_train, cv=cv3,
                            scoring=['accuracy', 'f1'])
    accuracy = scores['test_accuracy']
    f1 = scores['test_f1']
    # Report mean and standard deviation across the folds for each metric.
    print("[%s]: \n Accuracy: %0.2f (+/- %0.2f)" % (label, accuracy.mean(), accuracy.std()),
          "F1 score: %0.2f (+/- %0.2f)" % (f1.mean(), f1.std()))
#GridSearch
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
# Single parameter grid: every '<name>__<param>' entry is combined with every
# 'estimators' option below.
# NOTE: this is the configuration that fails. When 'estimators' is the
# (knn, lr) list, the 'rf__...' key refers to a sub-estimator that is no
# longer part of the ensemble, and set_params raises
# "ValueError: Invalid parameter rf for estimator VotingClassifier(...)".
# NOTE(review): by the same mechanism 'svm__C' should be invalid for the
# (knn, lr) option and 'lr__C' for the (knn, rf, svm) option.
params = {'knn__n_neighbors': [5, 9],
'rf__n_estimators': [20, 100, 200],
'svm__C': [0.01, 0.1, 1],
'lr__C' : [0.01, 0.1, 1],
'estimators': [[('knn', clf1), ('lr', clf3)], [('knn', clf1), ('rf', clf2), ('svm', clf4)]]
}
grid = GridSearchCV(estimator=vclf, param_grid=params, cv=5)
# Lists the parameter names of the *original* four-estimator ensemble; these
# names are not all valid once 'estimators' has been replaced by a candidate.
print(grid.estimator.get_params().keys())
# The ValueError is raised from inside this call.
grid = grid.fit(X_train, y_train)
That returns an error message saying:
dict_keys(['estimators', 'flatten_transform', 'n_jobs', 'verbose', 'voting', 'weights', 'knn', 'rf', 'lr', 'svm', 'knn__algorithm', 'knn__leaf_size', 'knn__metric', 'knn__metric_params', 'knn__n_jobs', 'knn__n_neighbors', 'knn__p', 'knn__weights', 'rf__bootstrap', 'rf__ccp_alpha', 'rf__class_weight', 'rf__criterion', 'rf__max_depth', 'rf__max_features', 'rf__max_leaf_nodes', 'rf__max_samples', 'rf__min_impurity_decrease', 'rf__min_samples_leaf', 'rf__min_samples_split', 'rf__min_weight_fraction_leaf', 'rf__n_estimators', 'rf__n_jobs', 'rf__oob_score', 'rf__random_state', 'rf__verbose', 'rf__warm_start', 'lr__C', 'lr__class_weight', 'lr__dual', 'lr__fit_intercept', 'lr__intercept_scaling', 'lr__l1_ratio', 'lr__max_iter', 'lr__multi_class', 'lr__n_jobs', 'lr__penalty', 'lr__random_state', 'lr__solver', 'lr__tol', 'lr__verbose', 'lr__warm_start', 'svm__C', 'svm__break_ties', 'svm__cache_size', 'svm__class_weight', 'svm__coef0', 'svm__decision_function_shape', 'svm__degree', 'svm__gamma', 'svm__kernel', 'svm__max_iter', 'svm__probability', 'svm__random_state', 'svm__shrinking', 'svm__tol', 'svm__verbose'])
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-12-cdf5449f2486> in <module>
11 grid = GridSearchCV(estimator=vclf, param_grid=params, cv=5)
12 print(grid.estimator.get_params().keys())
---> 13 grid = grid.fit(X_train, y_train)
14 #print(grid.best_params_)
/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
889 return results
890
--> 891 self._run_search(evaluate_candidates)
892
893 # multimetric is determined here because in the case of a callable
/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
1390 def _run_search(self, evaluate_candidates):
1391 """Search all candidates in param_grid"""
-> 1392 evaluate_candidates(ParameterGrid(self.param_grid))
1393
1394
/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params, cv, more_results)
836 )
837
--> 838 out = parallel(
839 delayed(_fit_and_score)(
840 clone(base_estimator),
/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/joblib/parallel.py in __call__(self, iterable)
1083 # remaining jobs.
1084 self._iterating = False
-> 1085 if self.dispatch_one_batch(iterator):
1086 self._iterating = self._original_iterator is not None
1087
/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
899 return False
900 else:
--> 901 self._dispatch(tasks)
902 return True
903
/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/joblib/parallel.py in _dispatch(self, batch)
817 with self._lock:
818 job_idx = len(self._jobs)
--> 819 job = self._backend.apply_async(batch, callback=cb)
820 # A job can complete so quickly than its callback is
821 # called before we get here, causing self._jobs to
/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
595 # Don't delay the application, to avoid keeping the input
596 # arguments in memory
--> 597 self.results = batch()
598
599 def get(self):
/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/joblib/parallel.py in __call__(self)
286 # change the default number of processes to -1
287 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 288 return [func(*args, **kwargs)
289 for func, args, kwargs in self.items]
290
/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/joblib/parallel.py in <listcomp>(.0)
286 # change the default number of processes to -1
287 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 288 return [func(*args, **kwargs)
289 for func, args, kwargs in self.items]
290
/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
214 def __call__(self, *args, **kwargs):
215 with config_context(**self.config):
--> 216 return self.function(*args, **kwargs)
217
218
/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
666 cloned_parameters[k] = clone(v, safe=False)
667
--> 668 estimator = estimator.set_params(**cloned_parameters)
669
670 start_time = time.time()
/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/sklearn/ensemble/_base.py in set_params(self, **params)
290 Estimator instance.
291 """
--> 292 super()._set_params("estimators", **params)
293 return self
294
/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/sklearn/utils/metaestimators.py in _set_params(self, attr, **params)
52 self._replace_estimator(attr, name, params.pop(name))
53 # 3. Step parameters and other initialisation arguments
---> 54 super().set_params(**params)
55 return self
56
/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/sklearn/base.py in set_params(self, **params)
243 key, delim, sub_key = key.partition("__")
244 if key not in valid_params:
--> 245 raise ValueError(
246 "Invalid parameter %s for estimator %s. "
247 "Check the list of available parameters "
ValueError: Invalid parameter rf for estimator VotingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=3)),
('lr', LogisticRegression(max_iter=1000))]). Check the list of available parameters with `estimator.get_params().keys()`.
But looking at the printed parameter list, the rf parameter exists, and I passed a list of values for it. Could you guide me on the correct syntax? Thank you very much for your answer!
In the parameters dict, "estimators" can be one of two possibilities:
params = {
...
'estimators': [
[('knn', clf1), ('lr', clf3)], #option 1
[('knn', clf1), ('rf', clf2), ('svm', clf4)] #option 2
]
}
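When option 1 is selected, there is no rf__n_estimators parameter, as the estimator list doesn't include the random forest. The grid search still tries to set n_estimators for rf, and raises an error because that parameter doesn't exist on the reconfigured ensemble. (The same applies to svm__C under option 1 and to lr__C under option 2.)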
What you could do is define two separate parameter dictionaries: one for each estimator configuration. Each parameter dictionary has the valid entries for that estimator:
# One grid per ensemble layout, so each grid only names sub-estimators that
# are actually present in its 'estimators' candidate.

# Grid for estimators=[(knn, lr)]
knn_lr_grid = {
    'knn__n_neighbors': [5, 9],
    'lr__C': [0.01, 0.1, 1],
    'estimators': [[('knn', clf1), ('lr', clf3)]],
}

# Grid for estimators=[(knn, rf, svm)]
knn_rf_svm_grid = {
    'knn__n_neighbors': [5, 9],
    'rf__n_estimators': [20, 100, 200],
    'svm__C': [0.01, 0.1, 1],
    'estimators': [[('knn', clf1), ('rf', clf2), ('svm', clf4)]],
}

# A list of grids is searched grid by grid.
params = [knn_lr_grid, knn_rf_svm_grid]
It now runs at my end without erroring.