Tags: python, data-fitting, gridsearchcv

Error when running GridSearchCV with Pipeline


I want to build a pipeline that contains every step of the model training process. After importing the relevant libraries and making the necessary definitions, I put together the following structure to experiment with. I am using the Telco Churn dataset.

ohe_f =["gender","SeniorCitizen","Partner","Dependents","PhoneService","MultipleLines",
    "InternetService","OnlineSecurity","OnlineBackup","DeviceProtection","TechSupport",
    "StreamingTV","StreamingMovies","Contract","PaperlessBilling","PaymentMethod"]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=11)


pipeline = Pipeline(steps = [['smote', SMOTE(random_state=11)],
                             ['scaler', MinMaxScaler()],
                             ['encoder', OneHotEncoder(),ohe_f],
                             ['classifier', LogisticRegression(random_state=11)]])

stratified_kfold = StratifiedKFold(n_splits=3,
                                       shuffle=True,
                                       random_state=11)
    
param_grid = {'classifier__C':[0.01, 0.1, 1, 10, 100]}

grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           scoring='roc_auc',
                           cv=stratified_kfold,
                           n_jobs=-1)

When I start training the model, I get the following error. How can I solve it?

---------------------------------------------------------------------------
_RemoteTraceback                          Traceback (most recent call last)
_RemoteTraceback: 
"""
Traceback (most recent call last):
  File "C:\Users\burak\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 436, in _process_worker
    r = call_item()
  File "C:\Users\burak\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 288, in __call__
    return self.fn(*self.args, **self.kwargs)
  File "C:\Users\burak\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 595, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\burak\anaconda3\lib\site-packages\joblib\parallel.py", line 262, in __call__
    return [func(*args, **kwargs)
  File "C:\Users\burak\anaconda3\lib\site-packages\joblib\parallel.py", line 262, in <listcomp>
    return [func(*args, **kwargs)
  File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\utils\fixes.py", line 216, in __call__
    return self.function(*args, **kwargs)
  File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 668, in _fit_and_score
    estimator = estimator.set_params(**cloned_parameters)
  File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\pipeline.py", line 188, in set_params
    self._set_params("steps", **kwargs)
  File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\utils\metaestimators.py", line 54, in _set_params
    super().set_params(**params)
  File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\base.py", line 239, in set_params
    valid_params = self.get_params(deep=True)
  File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\pipeline.py", line 167, in get_params
    return self._get_params("steps", deep=deep)
  File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\utils\metaestimators.py", line 33, in _get_params
    out.update(estimators)
ValueError: dictionary update sequence element #2 has length 3; 2 is required
"""

The above exception was the direct cause of the following exception:

ValueError                                Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_1388/1962240236.py in <module>
     23                            n_jobs=-1)
     24 
---> 25 grid_search.fit(X_train, y_train)
     26 cv_score = grid_search.best_score_
     27 test_score = grid_search.score(X_test, y_test)

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
    889                 return results
    890 
--> 891             self._run_search(evaluate_candidates)
    892 
    893             # multimetric is determined here because in the case of a callable

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
   1390     def _run_search(self, evaluate_candidates):
   1391         """Search all candidates in param_grid"""
-> 1392         evaluate_candidates(ParameterGrid(self.param_grid))
   1393 
   1394 

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
    836                     )
    837 
--> 838                 out = parallel(
    839                     delayed(_fit_and_score)(
    840                         clone(base_estimator),

~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
   1054 
   1055             with self._backend.retrieval_context():
-> 1056                 self.retrieve()
   1057             # Make sure that we get a last message telling us we are done
   1058             elapsed_time = time.time() - self._start_time

~\anaconda3\lib\site-packages\joblib\parallel.py in retrieve(self)
    933             try:
    934                 if getattr(self._backend, 'supports_timeout', False):
--> 935                     self._output.extend(job.get(timeout=self.timeout))
    936                 else:
    937                     self._output.extend(job.get())

~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
    540         AsyncResults.get from multiprocessing."""
    541         try:
--> 542             return future.result(timeout=timeout)
    543         except CfTimeoutError as e:
    544             raise TimeoutError from e

~\anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
    443                     raise CancelledError()
    444                 elif self._state == FINISHED:
--> 445                     return self.__get_result()
    446                 else:
    447                     raise TimeoutError()

~\anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
    388         if self._exception:
    389             try:
--> 390                 raise self._exception
    391             finally:
    392                 # Break a reference cycle with the exception in self._exception

ValueError: dictionary update sequence element #2 has length 3; 2 is required

Solution

  • The immediate cause of the error is the third pipeline step, ['encoder', OneHotEncoder(), ohe_f]: Pipeline steps must be (name, estimator) pairs, and that entry has three elements, which is exactly what ValueError: dictionary update sequence element #2 has length 3; 2 is required is telling you. You cannot attach a column list directly to a Pipeline step. Instead, you need to split the preprocessing into two parts: one for the numeric features (with the MinMaxScaler) and another for the categorical features (with the OneHotEncoder), combined with scikit-learn's ColumnTransformer: https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
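
A minimal sketch of what that could look like, reusing the ohe_f list and the train/test split from the question and treating every remaining column as numeric. Note the assumptions made here for illustration: because the pipeline contains SMOTE, it uses imbalanced-learn's Pipeline (scikit-learn's Pipeline does not accept samplers), and handle_unknown='ignore' and remainder=MinMaxScaler() are choices, not requirements.

from imblearn.pipeline import Pipeline            # accepts samplers such as SMOTE
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# One-hot encode the categorical columns; scale everything else
# (assumed here to be the numeric columns) with MinMaxScaler.
preprocessor = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), ohe_f)],
    remainder=MinMaxScaler())

# Every step is a (name, estimator) pair -- no third element.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('smote', SMOTE(random_state=11)),
                           ('classifier', LogisticRegression(random_state=11))])

stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=11)

param_grid = {'classifier__C': [0.01, 0.1, 1, 10, 100]}

grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           scoring='roc_auc',
                           cv=stratified_kfold,
                           n_jobs=-1)

grid_search.fit(X_train, y_train)

With (name, estimator) pairs only, GridSearchCV can clone the pipeline and set classifier__C without hitting the dictionary-update error.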