I am playing around with the Titanic dataset and trying to make correct use of sklearn's make_pipeline, but I'm getting a little confused about how to correctly build the pipelines. Here's the code:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
)

def sum_relatives(X):
    X_copy = X.copy()
    X_copy['total_relatives'] = X_copy['SibSp'] + X_copy['Parch']
    return X_copy

class_order = [[1, 2, 3]]

ord_pipeline = make_pipeline(
    OrdinalEncoder(categories=class_order)
)

def age_transformer(X):
    X_copy = X.copy()
    for index, row in self.median_age_by_class.iterrows():
        class_value = row['Pclass']
        median_age = row['median_age']
        X_copy.loc[X_copy['Pclass'] == class_value, 'Age'] = X_copy.loc[X_copy['Pclass'] == class_value, 'Age'].fillna(median_age)
    bins = [0, 10, 20, 30, 40, 50, 60, 70, 100]
    X_copy['age_interval'] = pd.cut(X_copy['Age'], bins=bins)
    return X_copy

def age_processor():
    return make_pipeline(
        FunctionTransformer(age_transformer),
    )

total_relatives_pipeline = make_pipeline(
    FunctionTransformer(sum_relatives)
)

cat_pipeline = make_pipeline(
    OneHotEncoder(handle_unknown="ignore")
)

num_pipeline = make_pipeline([
    StandardScaler()
])

preprocessing = ColumnTransformer([
    ("ord", ord_pipeline, ['Pclass']),
    ("age_processing", age_processor(), ['Pclass', 'Age']),
    ("total_relatives", total_relatives_pipeline, ['SibSp', 'Parch']),
    ("cat", cat_pipeline, ['Sex', 'Embarked', 'traveling_category', 'age_interval']),
    ("num", num_pipeline, ['Fare']),
])
It gives me the following error when calling fit_transform on my data:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[43], line 1
----> 1 data_processed = preprocessing.fit_transform(titanic_data)
2 data_processed.shape
File ~/.virtualenvs/handson/lib/python3.10/site-packages/sklearn/utils/_set_output.py:157, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
155 @wraps(f)
156 def wrapped(self, X, *args, **kwargs):
--> 157 data_to_wrap = f(self, X, *args, **kwargs)
158 if isinstance(data_to_wrap, tuple):
159 # only wrap the first output for cross decomposition
160 return_tuple = (
161 _wrap_data_with_container(method, data_to_wrap[0], X, self),
162 *data_to_wrap[1:],
163 )
File ~/.virtualenvs/handson/lib/python3.10/site-packages/sklearn/base.py:1152, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
1145 estimator._validate_params()
1147 with config_context(
1148 skip_parameter_validation=(
1149 prefer_skip_nested_validation or global_skip_validation
1150 )
1151 ):
-> 1152 return fit_method(estimator, *args, **kwargs)
...
445 "transform, or can be 'drop' or 'passthrough' "
446 "specifiers. '%s' (type %s) doesn't." % (t, type(t))
447 )
TypeError: All estimators should implement fit and transform, or can be 'drop' or 'passthrough' specifiers. 'Pipeline(steps=[('list', [('scaler', StandardScaler())])])' (type <class 'sklearn.pipeline.Pipeline'>) doesn't.
I know that transformers given to a pipeline shouldn't be wrapped in a list, but I have no idea why this error is being raised. Any help?
I think the error is caused by the square brackets in the make_pipeline() call for num_pipeline: make_pipeline expects each transformer as a separate positional argument, so the list itself becomes a single pipeline step, and a plain list doesn't implement fit/transform. Replace it with:
num_pipeline = make_pipeline(
    StandardScaler()
)
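To see the difference, here is a minimal sketch (the p1/p2/p3 names are just for illustration): make_pipeline takes the steps as positional arguments and auto-names each one after its class, while Pipeline takes an explicit list of (name, estimator) tuples.

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

# Steps as positional arguments, auto-named after their class:
p1 = make_pipeline(StandardScaler())
print(p1.steps)   # [('standardscaler', StandardScaler())]

# Pipeline, by contrast, wants explicit (name, estimator) tuples:
p2 = Pipeline([("scaler", StandardScaler())])

# Wrapping the step in a list makes the list itself a single step
# (auto-named 'list'), and a list has no fit/transform -- which is
# exactly what the TypeError is complaining about:
p3 = make_pipeline([StandardScaler()])
print(p3.steps)   # [('list', [StandardScaler()])]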
Alternatively, since it's just a single step, you could do the following if you wanted something a bit more concise:
num_processor = StandardScaler()
preprocessing = ColumnTransformer([
    ...
    ("num", num_processor, ['Fare']),
])
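This works because ColumnTransformer only requires that each entry implement fit and transform (or be the string 'drop' or 'passthrough'), exactly as the error message says, so a bare estimator is just as valid as a one-step pipeline wrapped around it.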
If you are okay with skipping the definition of num_processor, you can supply StandardScaler() directly:

preprocessing = ColumnTransformer([
    ...
    ("num", StandardScaler(), ['Fare']),
])
Or, to get both in a single line, use the walrus operator (:=):

preprocessing = ColumnTransformer([
    ...
    ("num", num_processor := StandardScaler(), ['Fare']),
])
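One caveat if you keep a reference this way and want to inspect the fitted scaler later (an aside, assuming the rest of your pipeline fits cleanly): ColumnTransformer clones its transformers during fit, so num_processor stays unfitted. The fitted copy has to be fetched from the fitted ColumnTransformer by its step name:

data_processed = preprocessing.fit_transform(titanic_data)

# The fitted clone lives inside the ColumnTransformer, keyed by the
# name given in the ("num", ...) entry; num_processor itself is untouched:
fitted_scaler = preprocessing.named_transformers_["num"]
print(fitted_scaler.mean_)  # mean of the 'Fare' column learned during fit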