I am fairly new to Python and am trying to teach myself how to work with pipelines for feature preprocessing and model fitting. I tried to preprocess my data (drop features that may be constant under CV sampling, then scale the rest) and then fit a Cox PH model using elastic net for feature selection. I want to tune the parameters of the elastic net. However, I keep getting a score of nan for all models. When I fit one of these models 'by hand' it works fine, so I figure there must be something wrong in the way that I'm setting up the pipeline.
# Reproduction script: preprocess the breast-cancer survival data, fit an
# elastic-net Cox model in a pipeline, and grid-search its hyperparameters.
import numpy as np  # required by np.setdiff1d below
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder
from sksurv.datasets import load_breast_cancer
from sksurv.linear_model import CoxnetSurvivalAnalysis

X, y = load_breast_cancer()
cat_features = ["er", "grade"]  # categorical features I want to OneHotEncode
num_features = np.setdiff1d(X.columns, cat_features).tolist()  # num features to scale
event_column = 'e.tdm'

# Create numeric preprocessing pipeline.
numeric_preprocess = Pipeline(steps=[
    ('vt0', VarianceThreshold()),  # if I end up with a constant under cv, drop it.
    ('scale', StandardScaler()),  # scale any feature that is not constant.
])

# Create categorical preprocessing pipeline.
categorical_preprocess = Pipeline(steps=[
    ('vt0', VarianceThreshold()),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

# Combine these two pipelines into one.
full_preprocessor = ColumnTransformer(transformers=[
    ('number', numeric_preprocess, num_features),
    ('category', categorical_preprocess, cat_features),
])

# Instantiate model
cph = CoxnetSurvivalAnalysis()

# Combine the preprocessing and the model fit in a pipeline
coxnet_pipe = Pipeline(steps=[
    ('preprocess', full_preprocessor),
    ('model', cph),
])

# Parameter grid; keys follow the <step>__<sub-step>__<param> naming scheme.
params = {
    # NOTE: these are the scaler *classes*, not instances. Each value is
    # plugged in as the 'scale' step, so fitting ends in
    # "'numpy.ndarray' object has no attribute 'fit'". Writing
    # StandardScaler(), RobustScaler(), MinMaxScaler() makes the search run.
    'preprocess__number__scale': [StandardScaler, RobustScaler, MinMaxScaler],
    # NOTE(review): CoxnetSurvivalAnalysis takes `alphas` as an array-like, so
    # each candidate probably needs to be a list such as [1e-5] -- confirm.
    'model__alphas': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.0, 1.0, 10.0, 100.0],
    'model__l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
}

cv = KFold(n_splits=2, shuffle=True, random_state=0)
gcv = GridSearchCV(coxnet_pipe,
                   param_grid=params, cv=cv, verbose=3)
# The fit call that triggers the reported failures.
gcv.fit(X, y)
Then I get the following:
/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/model_selection/_validation.py:372: FitFailedWarning:
486 fits failed out of a total of 486.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
486 fits failed with the following error:
Traceback (most recent call last):
File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/pipeline.py", line 390, in fit
Xt = self._fit(X, y, **fit_params_steps)
File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/pipeline.py", line 348, in _fit
X, fitted_transformer = fit_transform_one_cached(
File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/memory.py", line 349, in __call__
return self.func(*args, **kwargs)
File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/pipeline.py", line 893, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py", line 675, in fit_transform
result = self._fit_transform(X, y, _fit_transform_one)
File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py", line 606, in _fit_transform
return Parallel(n_jobs=self.n_jobs)(
File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
if self.dispatch_one_batch(iterator):
File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/parallel.py", line 779, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/_parallel_backends.py", line 208, in apply_async
result = ImmediateResult(func)
File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/_parallel_backends.py", line 572, in __init__
self.results = batch()
File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/parallel.py", line 262, in __call__
return [func(*args, **kwargs)
File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/parallel.py", line 262, in <listcomp>
return [func(*args, **kwargs)
File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/utils/fixes.py", line 216, in __call__
return self.function(*args, **kwargs)
File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/pipeline.py", line 893, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/pipeline.py", line 434, in fit_transform
return last_step.fit_transform(Xt, y, **fit_params_last_step)
File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/base.py", line 852, in fit_transform
return self.fit(X, **fit_params).transform(X)
AttributeError: 'numpy.ndarray' object has no attribute 'fit'
warnings.warn(some_fits_failed_message, FitFailedWarning)
/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/model_selection/_search.py:969: UserWarning: One or more of the test scores are non-finite: [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
nan nan nan nan nan nan nan nan nan]
AttributeError Traceback (most recent call last)
Input In [197], in <cell line: 5>()
2 cv = KFold(n_splits=2, shuffle=True, random_state=0)
3 gcv = GridSearchCV(coxnet_pipe,
4 param_grid = params, cv=cv, verbose = 3)
----> 5 gcv.fit(X,y)
File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/model_selection/_search.py:926, in BaseSearchCV.fit(self, X, y, groups, **fit_params)
924 refit_start_time = time.time()
925 if y is not None:
--> 926 self.best_estimator_.fit(X, y, **fit_params)
927 else:
928 self.best_estimator_.fit(X, **fit_params)
File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/pipeline.py:390, in Pipeline.fit(self, X, y, **fit_params)
364 """Fit the model.
366 Fit all the transformers one after the other and transform the
387 Pipeline with fitted steps.
388 """
389 fit_params_steps = self._check_fit_params(**fit_params)
--> 390 Xt = self._fit(X, y, **fit_params_steps)
391 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
392 if self._final_estimator != "passthrough":
File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/pipeline.py:348, in Pipeline._fit(self, X, y, **fit_params_steps)
346 cloned_transformer = clone(transformer)
347 # Fit or load from cache the current transformer
--> 348 X, fitted_transformer = fit_transform_one_cached(
349 cloned_transformer,
350 X,
351 y,
352 None,
353 message_clsname="Pipeline",
354 message=self._log_message(step_idx),
355 **fit_params_steps[name],
356 )
357 # Replace the transformer of the step with the fitted
358 # transformer. This is necessary when loading the transformer
359 # from the cache.
360 self.steps[step_idx] = (name, fitted_transformer)
File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/memory.py:349, in NotMemorizedFunc.__call__(self, *args, **kwargs)
348 def __call__(self, *args, **kwargs):
--> 349 return self.func(*args, **kwargs)
File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/pipeline.py:893, in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
891 with _print_elapsed_time(message_clsname, message):
892 if hasattr(transformer, "fit_transform"):
--> 893 res = transformer.fit_transform(X, y, **fit_params)
894 else:
895 res = transformer.fit(X, y, **fit_params).transform(X)
File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py:675, in ColumnTransformer.fit_transform(self, X, y)
672 self._validate_column_callables(X)
673 self._validate_remainder(X)
--> 675 result = self._fit_transform(X, y, _fit_transform_one)
677 if not result:
678 self._update_fitted_transformers([])
File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py:606, in ColumnTransformer._fit_transform(self, X, y, func, fitted, column_as_strings)
600 transformers = list(
601 self._iter(
602 fitted=fitted, replace_strings=True, column_as_strings=column_as_strings
603 )
604 )
605 try:
--> 606 return Parallel(n_jobs=self.n_jobs)(
607 delayed(func)(
608 transformer=clone(trans) if not fitted else trans,
609 X=_safe_indexing(X, column, axis=1),
610 y=y,
611 weight=weight,
612 message_clsname="ColumnTransformer",
613 message=self._log_message(name, idx, len(transformers)),
614 )
615 for idx, (name, trans, column, weight) in enumerate(transformers, 1)
616 )
617 except ValueError as e:
618 if "Expected 2D array, got 1D array instead" in str(e):
File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/parallel.py:1043, in Parallel.__call__(self, iterable)
1034 try:
1035 # Only set self._iterating to True if at least a batch
1036 # was dispatched. In particular this covers the edge
1040 # was very quick and its callback already dispatched all the
1041 # remaining jobs.
1042 self._iterating = False
-> 1043 if self.dispatch_one_batch(iterator):
1044 self._iterating = self._original_iterator is not None
1046 while self.dispatch_one_batch(iterator):
File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/parallel.py:861, in Parallel.dispatch_one_batch(self, iterator)
859 return False
860 else:
--> 861 self._dispatch(tasks)
862 return True
File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/parallel.py:779, in Parallel._dispatch(self, batch)
777 with self._lock:
778 job_idx = len(self._jobs)
--> 779 job = self._backend.apply_async(batch, callback=cb)
780 # A job can complete so quickly than its callback is
781 # called before we get here, causing self._jobs to
782 # grow. To ensure correct results ordering, .insert is
783 # used (rather than .append) in the following line
784 self._jobs.insert(job_idx, job)
File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/_parallel_backends.py:208, in SequentialBackend.apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/_parallel_backends.py:572, in ImmediateResult.__init__(self, batch)
569 def __init__(self, batch):
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/parallel.py:262, in BatchedCalls.__call__(self)
258 def __call__(self):
259 # Set the default nested backend to self._backend but do not set the
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/parallel.py:262, in <listcomp>(.0)
258 def __call__(self):
259 # Set the default nested backend to self._backend but do not set the
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/utils/fixes.py:216, in _FuncWrapper.__call__(self, *args, **kwargs)
214 def __call__(self, *args, **kwargs):
215 with config_context(**self.config):
--> 216 return self.function(*args, **kwargs)
File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/pipeline.py:893, in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
891 with _print_elapsed_time(message_clsname, message):
892 if hasattr(transformer, "fit_transform"):
--> 893 res = transformer.fit_transform(X, y, **fit_params)
894 else:
895 res = transformer.fit(X, y, **fit_params).transform(X)
File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/pipeline.py:434, in Pipeline.fit_transform(self, X, y, **fit_params)
432 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
433 if hasattr(last_step, "fit_transform"):
--> 434 return last_step.fit_transform(Xt, y, **fit_params_last_step)
435 else:
436 return last_step.fit(Xt, y, **fit_params_last_step).transform(Xt)
File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/base.py:852, in TransformerMixin.fit_transform(self, X, y, **fit_params)
848 # non-optimized default implementation; override when a better
849 # method is possible for a given clustering algorithm
850 if y is None:
851 # fit method of arity 1 (unsupervised transformation)
--> 852 return self.fit(X, **fit_params).transform(X)
853 else:
854 # fit method of arity 2 (supervised transformation)
855 return self.fit(X, y, **fit_params).transform(X)
AttributeError: 'numpy.ndarray' object has no attribute 'fit'
You just need some parentheses to instantiate your scalers in the hyperparameter space definition. Replace
'preprocess__number__scale': [StandardScaler, RobustScaler, MinMaxScaler],
with
'preprocess__number__scale': [StandardScaler(), RobustScaler(), MinMaxScaler()],
The problem is that the methods take `self` as their first argument, and without an instance to use, the positional argument `X` is handed over as the `self` argument. Hence the final line in the traceback, `self.fit(...)`, complains because `self` is actually a numpy array.