I want to build a pipeline that contains every step of the model training process. After importing the relevant libraries and making the necessary definitions, I created the following structure as an experiment. I used the Telco churn dataset.
# Names of the columns that are meant to be one-hot encoded.
ohe_f =["gender","SeniorCitizen","Partner","Dependents","PhoneService","MultipleLines",
"InternetService","OnlineSecurity","OnlineBackup","DeviceProtection","TechSupport",
"StreamingTV","StreamingMovies","Contract","PaperlessBilling","PaymentMethod"]
# Hold out 20% of the data as a test set, stratified on the target.
# X and y are defined outside this snippet.
X_train, X_test, y_train, y_test = train_test_split(X,
y,
test_size=0.2,
stratify=y,
random_state=11)
# Intended flow: oversample -> scale -> one-hot encode -> classify.
# NOTE(review): every entry in `steps` must be a (name, estimator) pair. The
# 'encoder' entry below (index 2) has three elements because the column list
# `ohe_f` was appended to it; this is what raises
# "dictionary update sequence element #2 has length 3; 2 is required".
# Column selection belongs in a ColumnTransformer, not in the step itself.
# NOTE(review): SMOTE is a resampler; presumably `Pipeline` here is
# imblearn.pipeline.Pipeline rather than sklearn's -- confirm against the imports.
pipeline = Pipeline(steps = [['smote', SMOTE(random_state=11)],
['scaler', MinMaxScaler()],
['encoder', OneHotEncoder(),ohe_f],
['classifier', LogisticRegression(random_state=11)]])
# 3-fold stratified, shuffled cross-validation with a fixed seed.
stratified_kfold = StratifiedKFold(n_splits=3,
shuffle=True,
random_state=11)
# Grid over the `C` parameter of the 'classifier' step
# (the `<step name>__<parameter>` naming addresses a step's parameter).
param_grid = {'classifier__C':[0.01, 0.1, 1, 10, 100]}
# Grid search over the whole pipeline, scored by ROC AUC, using all CPU cores.
grid_search = GridSearchCV(estimator=pipeline,
param_grid=param_grid,
scoring='roc_auc',
cv=stratified_kfold,
n_jobs=-1)
When I start training the model I get the following error. How can I solve it?
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\Users\burak\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 436, in _process_worker
r = call_item()
File "C:\Users\burak\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 288, in __call__
return self.fn(*self.args, **self.kwargs)
File "C:\Users\burak\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 595, in __call__
return self.func(*args, **kwargs)
File "C:\Users\burak\anaconda3\lib\site-packages\joblib\parallel.py", line 262, in __call__
return [func(*args, **kwargs)
File "C:\Users\burak\anaconda3\lib\site-packages\joblib\parallel.py", line 262, in <listcomp>
return [func(*args, **kwargs)
File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\utils\fixes.py", line 216, in __call__
return self.function(*args, **kwargs)
File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 668, in _fit_and_score
estimator = estimator.set_params(**cloned_parameters)
File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\pipeline.py", line 188, in set_params
self._set_params("steps", **kwargs)
File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\utils\metaestimators.py", line 54, in _set_params
super().set_params(**params)
File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\base.py", line 239, in set_params
valid_params = self.get_params(deep=True)
File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\pipeline.py", line 167, in get_params
return self._get_params("steps", deep=deep)
File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\utils\metaestimators.py", line 33, in _get_params
out.update(estimators)
ValueError: dictionary update sequence element #2 has length 3; 2 is required
"""
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_1388/1962240236.py in <module>
23 n_jobs=-1)
24
---> 25 grid_search.fit(X_train, y_train)
26 cv_score = grid_search.best_score_
27 test_score = grid_search.score(X_test, y_test)
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
889 return results
890
--> 891 self._run_search(evaluate_candidates)
892
893 # multimetric is determined here because in the case of a callable
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1390 def _run_search(self, evaluate_candidates):
1391 """Search all candidates in param_grid"""
-> 1392 evaluate_candidates(ParameterGrid(self.param_grid))
1393
1394
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
836 )
837
--> 838 out = parallel(
839 delayed(_fit_and_score)(
840 clone(base_estimator),
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1054
1055 with self._backend.retrieval_context():
-> 1056 self.retrieve()
1057 # Make sure that we get a last message telling us we are done
1058 elapsed_time = time.time() - self._start_time
~\anaconda3\lib\site-packages\joblib\parallel.py in retrieve(self)
933 try:
934 if getattr(self._backend, 'supports_timeout', False):
--> 935 self._output.extend(job.get(timeout=self.timeout))
936 else:
937 self._output.extend(job.get())
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
~\anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
443 raise CancelledError()
444 elif self._state == FINISHED:
--> 445 return self.__get_result()
446 else:
447 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
388 if self._exception:
389 try:
--> 390 raise self._exception
391 finally:
392 # Break a reference cycle with the exception in self._exception
ValueError: dictionary update sequence element #2 has length 3; 2 is required
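The error occurs because every pipeline step must be a (name, estimator) pair, so the column list cannot be passed as a third element of the 'encoder' step. You need to split the preprocessing in your pipeline into two parts: one to process the numeric features (with the min-max scaler) and another to process the categorical features (with the one-hot encoder). You can use the ColumnTransformer class
from scikit-learn: https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html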