Voting classifier and gridsearch

I have this code that creates a voting classifier and uses a grid search over several sets of (name, classifier) tuples in order to compare them:

# Classifiers: the four base models that are scored individually and combined below.
clf1 = KNeighborsClassifier(n_neighbors=3)
clf2 = RandomForestClassifier(random_state=123)
clf3 = LogisticRegression(max_iter=1000)
clf4 = SVC()

# Voting classifier built from the four base models (voting='hard').
vclf = VotingClassifier(
    estimators=[('knn', clf1), ('rf', clf2), ('lr', clf3), ('svm', clf4)],
    voting='hard',
)

# Training: shuffled 4-fold split with a fixed random_state, shared by every model.
cv3 = KFold(n_splits=4, random_state=111, shuffle=True)

# Keep each label next to its estimator. Zipping two separate lists of different
# lengths silently stops at the shorter one, which would drop the last estimator
# and shift the labels (e.g. SVC scores printed under 'Voting Classifier').
labelled_clfs = [
    ('KNN', clf1),
    ('Random Forest', clf2),
    ('Logistic Regression', clf3),
    ('SVM', clf4),
    ('Voting Classifier', vclf),
]

for label, clf in labelled_clfs:
    # cross_validate returns one array per requested metric, keyed 'test_<metric>'.
    scores = cross_validate(clf, X_train, y_train, cv=cv3, scoring=['accuracy', 'f1'])
    print("[%s]: \n Accuracy: %0.2f (+/- %0.2f)" % (label, scores['test_accuracy'].mean(), scores['test_accuracy'].std()),
          "F1 score: %0.2f (+/- %0.2f)" % (scores['test_f1'].mean(), scores['test_f1'].std()))

# Grid search over the voting classifier's nested parameters.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Single parameter grid: every key below is combined with every other key,
# including both candidate `estimators` lists.
# NOTE(review): the first `estimators` option has no 'rf' or 'svm' step and the
# second has no 'lr' step, yet the 'rf__*', 'svm__*' and 'lr__*' keys are applied
# to both. This mismatch is what the ValueError shown below reports.
params = {'knn__n_neighbors': [5, 9],
          'rf__n_estimators': [20, 100, 200],
          'svm__C': [0.01, 0.1, 1],
          'lr__C' : [0.01, 0.1, 1],
          'estimators': [[('knn', clf1), ('lr', clf3)], [('knn', clf1), ('rf', clf2), ('svm', clf4)]]
          }

# Search `params` on the voting classifier with cv=5.
grid = GridSearchCV(estimator=vclf, param_grid=params, cv=5)
# Print the parameter names the estimator accepts, to compare against the grid keys.
print(grid.estimator.get_params().keys())
grid = grid.fit(X_train, y_train)

This prints the available parameter keys and then raises the following error:

dict_keys(['estimators', 'flatten_transform', 'n_jobs', 'verbose', 'voting', 'weights', 'knn', 'rf', 'lr', 'svm', 'knn__algorithm', 'knn__leaf_size', 'knn__metric', 'knn__metric_params', 'knn__n_jobs', 'knn__n_neighbors', 'knn__p', 'knn__weights', 'rf__bootstrap', 'rf__ccp_alpha', 'rf__class_weight', 'rf__criterion', 'rf__max_depth', 'rf__max_features', 'rf__max_leaf_nodes', 'rf__max_samples', 'rf__min_impurity_decrease', 'rf__min_samples_leaf', 'rf__min_samples_split', 'rf__min_weight_fraction_leaf', 'rf__n_estimators', 'rf__n_jobs', 'rf__oob_score', 'rf__random_state', 'rf__verbose', 'rf__warm_start', 'lr__C', 'lr__class_weight', 'lr__dual', 'lr__fit_intercept', 'lr__intercept_scaling', 'lr__l1_ratio', 'lr__max_iter', 'lr__multi_class', 'lr__n_jobs', 'lr__penalty', 'lr__random_state', 'lr__solver', 'lr__tol', 'lr__verbose', 'lr__warm_start', 'svm__C', 'svm__break_ties', 'svm__cache_size', 'svm__class_weight', 'svm__coef0', 'svm__decision_function_shape', 'svm__degree', 'svm__gamma', 'svm__kernel', 'svm__max_iter', 'svm__probability', 'svm__random_state', 'svm__shrinking', 'svm__tol', 'svm__verbose'])
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-12-cdf5449f2486> in <module>
     11 grid = GridSearchCV(estimator=vclf, param_grid=params, cv=5)
     12 print(grid.estimator.get_params().keys())
---> 13 grid = grid.fit(X_train, y_train)
     14 #print(grid.best_params_)

/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
    889                 return results
    890 
--> 891             self._run_search(evaluate_candidates)
    892 
    893             # multimetric is determined here because in the case of a callable

/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
   1390     def _run_search(self, evaluate_candidates):
   1391         """Search all candidates in param_grid"""
-> 1392         evaluate_candidates(ParameterGrid(self.param_grid))
   1393 
   1394 

/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params, cv, more_results)
    836                     )
    837 
--> 838                 out = parallel(
    839                     delayed(_fit_and_score)(
    840                         clone(base_estimator),

/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/joblib/parallel.py in __call__(self, iterable)
   1083             # remaining jobs.
   1084             self._iterating = False
-> 1085             if self.dispatch_one_batch(iterator):
   1086                 self._iterating = self._original_iterator is not None
   1087 

/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    899                 return False
    900             else:
--> 901                 self._dispatch(tasks)
    902                 return True
    903 

/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/joblib/parallel.py in _dispatch(self, batch)
    817         with self._lock:
    818             job_idx = len(self._jobs)
--> 819             job = self._backend.apply_async(batch, callback=cb)
    820             # A job can complete so quickly than its callback is
    821             # called before we get here, causing self._jobs to

/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
    206     def apply_async(self, func, callback=None):
    207         """Schedule a func to be run"""
--> 208         result = ImmediateResult(func)
    209         if callback:
    210             callback(result)

/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
    595         # Don't delay the application, to avoid keeping the input
    596         # arguments in memory
--> 597         self.results = batch()
    598 
    599     def get(self):

/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/joblib/parallel.py in __call__(self)
    286         # change the default number of processes to -1
    287         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 288             return [func(*args, **kwargs)
    289                     for func, args, kwargs in self.items]
    290 

/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/joblib/parallel.py in <listcomp>(.0)
    286         # change the default number of processes to -1
    287         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 288             return [func(*args, **kwargs)
    289                     for func, args, kwargs in self.items]
    290 

/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
    214     def __call__(self, *args, **kwargs):
    215         with config_context(**self.config):
--> 216             return self.function(*args, **kwargs)
    217 
    218 

/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
    666             cloned_parameters[k] = clone(v, safe=False)
    667 
--> 668         estimator = estimator.set_params(**cloned_parameters)
    669 
    670     start_time = time.time()

/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/sklearn/ensemble/_base.py in set_params(self, **params)
    290             Estimator instance.
    291         """
--> 292         super()._set_params("estimators", **params)
    293         return self
    294 

/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/sklearn/utils/metaestimators.py in _set_params(self, attr, **params)
     52                 self._replace_estimator(attr, name, params.pop(name))
     53         # 3. Step parameters and other initialisation arguments
---> 54         super().set_params(**params)
     55         return self
     56 

/home/ubuntu/virtualenvs/python3/lib/python3.9/site-packages/sklearn/base.py in set_params(self, **params)
    243             key, delim, sub_key = key.partition("__")
    244             if key not in valid_params:
--> 245                 raise ValueError(
    246                     "Invalid parameter %s for estimator %s. "
    247                     "Check the list of available parameters "

ValueError: Invalid parameter rf for estimator VotingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=3)),
                             ('lr', LogisticRegression(max_iter=1000))]). Check the list of available parameters with `estimator.get_params().keys()`.

But looking at the list of available parameters, the rf parameter exists, and a list of values is provided for it. Could you guide me on the correct syntax? Thank you very much for your answer!

Solution

In the parameters dict, "estimators" can take one of two values:

params = {
    ...
    'estimators': [
        [('knn', clf1), ('lr', clf3)],               # option 1: no 'rf' or 'svm' step
        [('knn', clf1), ('rf', clf2), ('svm', clf4)] # option 2: no 'lr' step
    ]
}

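When option 1 is selected, there is no rf__n_estimators parameter, because the estimator list doesn't include the random forest. The grid search still tries to set n_estimators on rf, and raises an error because, for that configuration, the rf step doesn't exist. (The same applies to svm__C under option 1, and to lr__C under option 2.)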

What you could do is define two separate parameter dictionaries: one for each estimator configuration. Each parameter dictionary has the valid entries for that estimator:

# param_grid may be a list of dicts; each dict is expanded on its own, so a
# nested '<name>__<param>' key is only ever paired with an `estimators` list
# that actually contains the step called <name>.
params = [
    {
        # Grid for the ensemble made of knn + lr.
        'knn__n_neighbors': [5, 9],
        'lr__C': [0.01, 0.1, 1],
        'estimators': [
            [('knn', clf1), ('lr', clf3)],
        ],
    },
    {
        # Grid for the ensemble made of knn + rf + svm.
        'knn__n_neighbors': [5, 9],
        'rf__n_estimators': [20, 100, 200],
        'svm__C': [0.01, 0.1, 1],
        'estimators': [
            [('knn', clf1), ('rf', clf2), ('svm', clf4)],
        ],
    },
]

It now runs at my end without erroring.