I am using the ColumnTransformer
method of sklearn to build a pipeline for data processing. Inside the ColumnTransformer
I am creating new columns for my dataset that are combinations of other columns, but in the next step I want to use this new column that was just created, and I am getting an error saying that the column does not exist. I understand that it does not exist in the original dataset, but why can't I use this new column? Or if I can: how?
The code is the following:
def sum_name(function_transformer, feature_names_in):
    """Feature-names-out callback for the relatives-sum transformer.

    The returned name must match the column actually produced by
    ``sum_relatives`` ('total_relatives'); a mismatched name breaks
    downstream selection by column name.
    """
    return ["total_relatives"]  # feature names out
def sum_relatives(X):
    """Return a one-column frame: 'total_relatives' = SibSp + Parch.

    Returns only the newly derived column so the output matches the single
    feature name advertised by ``sum_name`` (the SibSp/Parch inputs are
    consumed, not passed through).
    """
    X_copy = X.copy()
    X_copy['total_relatives'] = X_copy['SibSp'] + X_copy['Parch']
    return X_copy[['total_relatives']]
def cat_travel_name(function_transformer, feature_names_in):
    """Feature-names-out callback for the travel-category transformer.

    Must match the column actually produced by ``categorize_travel``
    ('traveling_category'), not a free-form label.
    """
    return ["traveling_category"]  # feature names out
def categorize_travel(X):
    """Bucket 'total_relatives' into a travel-size category.

    0 relatives -> 'A', 1-3 -> 'B', 4 or more -> 'C'; anything else
    (e.g. negative or NaN) falls back to 'Unknown'. Returns only the new
    'traveling_category' column so the output matches ``cat_travel_name``.
    """
    X_copy = X.copy()
    conditions = [
        X_copy['total_relatives'] == 0,
        X_copy['total_relatives'].between(1, 3),  # inclusive on both ends
        X_copy['total_relatives'] >= 4,
    ]
    categories = ['A', 'B', 'C']
    X_copy['traveling_category'] = np.select(conditions, categories, default='Unknown')
    return X_copy[['traveling_category']]
# Pclass is ordinal (1st > 2nd > 3rd): encode it with an explicit category order.
class_order = [[1, 2, 3]]
ord_pipeline = make_pipeline(OrdinalEncoder(categories=class_order))
def interval_name(function_transformer, feature_names_in):
    """Feature-names-out callback for the age transformer.

    Must match the column actually produced by ``age_transformer``
    ('age_interval'), not a free-form label.
    """
    return ["age_interval"]  # feature names out
def age_transformer(X):
    """Impute missing Age with the per-Pclass median, then bin into intervals.

    Returns only the derived 'age_interval' column so the output matches the
    single feature name advertised by ``interval_name``.
    """
    X_copy = X.copy()
    # Vectorized per-class median fill (replaces the row-wise iterrows loop);
    # classes whose Age is entirely missing keep NaN, as before.
    X_copy['Age'] = X_copy['Age'].fillna(
        X_copy.groupby('Pclass')['Age'].transform('median')
    )
    bins = [0, 10, 20, 30, 40, 50, 60, 70, 100]
    X_copy['age_interval'] = pd.cut(X_copy['Age'], bins=bins)
    return X_copy[['age_interval']]
def age_processor():
    """Build the age-imputation/binning step as a single-transformer pipeline."""
    step = FunctionTransformer(age_transformer, feature_names_out=interval_name)
    return make_pipeline(step)
# Derive 'total_relatives' from SibSp + Parch.
total_relatives_pipeline = make_pipeline(
FunctionTransformer(sum_relatives, feature_names_out=sum_name)
)
# Bucket 'total_relatives' into a traveling category (see categorize_travel).
travel_category_pipeline = make_pipeline(
FunctionTransformer(categorize_travel, feature_names_out=cat_travel_name)
)
# Categorical columns: impute the most frequent value, then one-hot encode.
cat_pipeline = make_pipeline(
SimpleImputer(strategy="most_frequent"),
OneHotEncoder(handle_unknown="ignore")
)
# Numeric column(s): standardize to zero mean / unit variance.
num_pipeline = make_pipeline(
StandardScaler()
)
# NOTE(review): ColumnTransformer applies all of its branches in parallel to
# the ORIGINAL input frame, so columns created by one branch (e.g.
# 'total_relatives', 'traveling_category', 'age_interval') are NOT visible to
# sibling branches. The selections below therefore fail at fit time with
# "column does not exist". The fix is to chain several ColumnTransformers in
# a Pipeline (see the corrected version later in this file).
preprocessing = ColumnTransformer([
("ord", ord_pipeline, ['Pclass']),
("age_processing", age_processor(), ['Pclass', 'Age']),
("total_relatives", total_relatives_pipeline, ['SibSp', 'Parch']),
("travel_category", travel_category_pipeline, ['total_relatives']),
("cat", cat_pipeline, ['Sex', 'Embarked', 'traveling_category', 'age_interval']),
("num", num_pipeline, ['Fare']),
])
This code gives the error that total_relatives
is not a column of the dataset when I call the fit_transform
method with:
data_processed = preprocessing.fit_transform(titanic_data)
Well, it is true, the total_relatives
is indeed not a column of the original dataset, it is being created dynamically inside the ColumnTransformer
. Do I need to create it again if I want to use the "travel_category" step? Can't I recover it from the previous step and use it?
the same thing will happen in the next step since traveling_category
and age_interval
were created in the previous steps and are not from the original dataset.
There is a mismatch between the columns that each function transformer returns and its feature_names_out=
name. They clash both in how many features are returned and in their names. I first made sure that each function transformer returns only the column(s) it is supposed to, and I changed the names to make them consistent. This way, the number of features and the names defined in feature_names_out=
matches the actual returned data for each transformer.
A second issue was that new features are created in the ColumnTransformer
, and an attempt is made to access those variables by other parts of the same ColumnTransformer
. This won't work as the column transformer runs things in parallel. What you'd need to do is chain the column transformers in a sequence, such that the next column transformer can access the new variables created from the earlier one. I have made this change to the code.
I made some other minor changes, including forcing transformers to return pandas
dataframes rather than numpy
arrays.
The new preprocessor:
Its output column names:
Index(['Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
'traveling_category_A', 'traveling_category_B', 'traveling_category_C',
'age_interval_(0, 10]', 'age_interval_(10, 20]',
'age_interval_(20, 30]', 'age_interval_(30, 40]',
'age_interval_(40, 50]', 'age_interval_(50, 60]',
'age_interval_(60, 70]', 'age_interval_(70, 100]', 'Pclass', 'Fare',
'PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'],
dtype='object')
Code:
import numpy as np  # required: categorize_travel below calls np.select
import pandas as pd

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FunctionTransformer, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# Load the raw Titanic dataset; all preprocessing below is fit on this frame.
titanic_data = pd.read_csv('../titanic.csv')

# Make every transformer emit a pandas DataFrame instead of a numpy array,
# so column names survive between the chained ColumnTransformers.
set_config(transform_output='pandas')
#
# Sum relatives
#
def sum_name(function_transformer, feature_names_in):
    """Output feature names for the relatives-sum transformer."""
    out_column = "total_relatives"
    return [out_column]
def sum_relatives(X):
    """Derive 'total_relatives' as SibSp + Parch and emit only that column."""
    relatives = X['SibSp'] + X['Parch']
    return relatives.to_frame('total_relatives')
# Pipeline wrapper so the derivation slots into a ColumnTransformer branch.
sum_transformer = FunctionTransformer(sum_relatives, feature_names_out=sum_name)
total_relatives_pipeline = make_pipeline(sum_transformer)
#
#Categorize travel
#
def cat_travel_name(function_transformer, feature_names_in):
    """Output feature names for the travel-category transformer."""
    out_column = "traveling_category"
    return [out_column]
def categorize_travel(X):
    """Map 'total_relatives' to a travel category: 0 -> A, 1-3 -> B, >=4 -> C.

    Values matching none of the conditions fall back to 'Unknown'.
    Emits only the derived 'traveling_category' column.
    """
    relatives = X['total_relatives']
    conditions = [
        relatives == 0,
        (relatives >= 1) & (relatives <= 3),
        relatives >= 4,
    ]
    labels = np.select(conditions, ['A', 'B', 'C'], default='Unknown')
    return pd.DataFrame({'traveling_category': labels}, index=X.index)
# Pipeline wrapper for the travel-category derivation.
travel_transformer = FunctionTransformer(categorize_travel, feature_names_out=cat_travel_name)
travel_category_pipeline = make_pipeline(travel_transformer)
#
# Ordinal encoder: Pclass has a natural order 1 < 2 < 3.
#
class_order = [[1, 2, 3]]
ord_pipeline = make_pipeline(OrdinalEncoder(categories=class_order))
# Categoricals: fill gaps with the most frequent value, then one-hot encode
# (dense output, unseen categories ignored at transform time).
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
)
# Fare: standardize to zero mean / unit variance.
fare_pipeline = make_pipeline(StandardScaler())
#
# Age transformer
#
def interval_name(function_transformer, feature_names_in):
    """Output feature names for the age-binning transformer."""
    out_column = "age_interval"
    return [out_column]
def age_transformer(X):
    """Fill missing Age with the per-Pclass median, then bin into age intervals.

    Emits only the derived 'age_interval' column; classes whose Age values are
    entirely missing keep NaN (and get a NaN interval), as before.
    """
    filled_age = X['Age'].fillna(X.groupby('Pclass')['Age'].transform('median'))
    edges = [0, 10, 20, 30, 40, 50, 60, 70, 100]
    return pd.cut(filled_age, bins=edges).to_frame('age_interval')
def age_processor():
    """Wrap the age transformer in a single-step pipeline."""
    step = FunctionTransformer(age_transformer, feature_names_out=interval_name)
    return make_pipeline(step)
#
# Column transformers
#
# Stage 1: transforms that only need original dataset columns.
# remainder='passthrough' keeps the untouched columns (Sex, Embarked, ...)
# available for later stages; verbose_feature_names_out=False avoids the
# "<name>__" prefix so derived columns stay addressable by plain name.
preprocessing_initial = ColumnTransformer([
("ord", ord_pipeline, ['Pclass']),
("age_processing", age_processor(), ['Pclass', 'Age']),
("num", fare_pipeline, ['Fare']),
("total_relatives", total_relatives_pipeline, ['SibSp', 'Parch'])],
remainder='passthrough',
verbose_feature_names_out=False
)
# Stage 2: can now select 'total_relatives', which stage 1 created.
preprocessing_travel_category = ColumnTransformer(
[("travel_category", travel_category_pipeline, ['total_relatives'])],
remainder='passthrough',
verbose_feature_names_out=False
)
# Stage 3: one-hot encode the original and derived categoricals.
preprocessing_cat = ColumnTransformer(
[("cat", cat_pipeline, ['Sex', 'Embarked', 'traveling_category', 'age_interval'])],
remainder='passthrough',
verbose_feature_names_out=False
)
#Final transformer
# Chain the stages so each ColumnTransformer receives the previous stage's
# output: branches inside one ColumnTransformer run in parallel, so derived
# columns are only visible to a LATER transformer in the pipeline.
preprocessor = make_pipeline(
preprocessing_initial,
preprocessing_travel_category,
preprocessing_cat
)
#Run
# NOTE(review): fit() here is redundant — fit_transform() below refits the
# whole pipeline anyway.
preprocessor.fit(titanic_data)
preprocessor.fit_transform(titanic_data).columns