python-3.x, scikit-learn, pipeline

How to use a column created inside a ColumnTransformer in the same pipeline?


I am using scikit-learn's ColumnTransformer to build a data-processing pipeline. Inside the ColumnTransformer I create new columns for my dataset that are combinations of other columns, and in a later step I want to use one of these newly created columns, but I get an error saying that the column does not exist. I understand that it does not exist in the original dataset, but why can't I use the new column? Or, if I can: how? The code is the following:

def sum_name(function_transformer, feature_names_in):
    return ["relatives"]  # feature names out

def sum_relatives(X):
    X_copy = X.copy()
    X_copy['total_relatives'] = X_copy['SibSp'] + X_copy['Parch']
    return X_copy

def cat_travel_name(function_transformer, feature_names_in):
    return ["relatives"]  # feature names out

def categorize_travel(X):
    X_copy = X.copy()

    conditions = [
        (X_copy['total_relatives'] == 0),
        (X_copy['total_relatives'] >= 1) & (X_copy['total_relatives'] <= 3),
        (X_copy['total_relatives'] >= 4)
    ]
    categories = ['A', 'B', 'C']

    X_copy['traveling_category'] = np.select(conditions, categories, default='Unknown')
    return X_copy

class_order = [[1, 2, 3]]

ord_pipeline = make_pipeline(
    OrdinalEncoder(categories=class_order)    
    )

def interval_name(function_transformer, feature_names_in):
    return ["interval"]  # feature names out

def age_transformer(X):
    X_copy = X.copy()
    median_age_by_class = X_copy.groupby('Pclass')['Age'].median().reset_index()
    median_age_by_class.columns = ['Pclass', 'median_age']
    for index, row in median_age_by_class.iterrows():
        class_value = row['Pclass']
        median_age = row['median_age']
        X_copy.loc[X_copy['Pclass'] == class_value, 'Age'] = X_copy.loc[X_copy['Pclass'] == class_value, 'Age'].fillna(median_age)
    bins = [0, 10, 20, 30, 40, 50, 60, 70, 100]
    X_copy['age_interval'] = pd.cut(X_copy['Age'], bins=bins)
    return X_copy

def age_processor():
    return make_pipeline(
        FunctionTransformer(age_transformer, feature_names_out=interval_name),
        )

total_relatives_pipeline = make_pipeline(
    FunctionTransformer(sum_relatives, feature_names_out=sum_name)
)

travel_category_pipeline = make_pipeline(
    FunctionTransformer(categorize_travel, feature_names_out=cat_travel_name)
)

cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore")
    )

num_pipeline = make_pipeline(
        StandardScaler()
    )

preprocessing = ColumnTransformer([
        ("ord", ord_pipeline, ['Pclass']),
        ("age_processing", age_processor(), ['Pclass', 'Age']),
        ("total_relatives", total_relatives_pipeline, ['SibSp', 'Parch']),
        ("travel_category", travel_category_pipeline, ['total_relatives']),
        ("cat", cat_pipeline, ['Sex', 'Embarked', 'traveling_category', 'age_interval']),
        ("num", num_pipeline, ['Fare']),
    ])

This code gives the error that total_relatives is not a column of the dataset when I call the fit_transform method with:

data_processed = preprocessing.fit_transform(titanic_data)

Well, it is true: total_relatives is indeed not a column of the original dataset, it is created dynamically inside the ColumnTransformer. Do I need to create it again if I want to use it in the "travel_category" step? Can't I recover it from the previous step and use it?

The same thing will happen in the next step, since traveling_category and age_interval were created in previous steps and are not part of the original dataset.


Solution

  • There is a mismatch between the columns that each function transformer returns and the names declared via its feature_names_out= callable: they clash both in the number of features returned and in their names. I first made sure that each function transformer returns only the column(s) it is supposed to, and changed the names so they are consistent. This way, the number of features and the names defined in feature_names_out= match the data each transformer actually returns.

    A second issue was that new features are created inside the ColumnTransformer, and other parts of the same ColumnTransformer then try to access them. This cannot work, because a ColumnTransformer applies its transformers in parallel, each one seeing only the original input columns. What you need to do instead is chain the ColumnTransformers in a sequence, so that the next one can access the new variables created by the earlier one. I have made this change to the code; a toy sketch of the chaining pattern is shown right after this paragraph.
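
    To make the "runs in parallel" point concrete, here is a small, self-contained toy sketch. The column names 'a', 'b' and 'c' are hypothetical and unrelated to the Titanic data; the derived column 'c' only becomes selectable by name once a second ColumnTransformer is chained after the one that creates it:

    import pandas as pd
    from sklearn import set_config
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import FunctionTransformer

    set_config(transform_output="pandas")  # keep column names between the chained steps

    toy = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

    def make_c(X):
        # derived column, analogous to 'total_relatives'
        return X.assign(c=X["a"] + X["b"])[["c"]]

    def c_name(function_transformer, feature_names_in):
        return ["c"]  # feature names out

    step1 = ColumnTransformer(
        [("make_c", FunctionTransformer(make_c, feature_names_out=c_name), ["a", "b"])],
        remainder="passthrough", verbose_feature_names_out=False,
    )
    # 'c' exists only in step1's *output*, so a second, chained ColumnTransformer
    # is needed before anything can select it by name.
    step2 = ColumnTransformer(
        [("use_c", "passthrough", ["c"])],
        remainder="passthrough", verbose_feature_names_out=False,
    )

    print(make_pipeline(step1, step2).fit_transform(toy).columns)
    # -> a pandas Index containing 'c' together with the passthrough columns 'a' and 'b'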

    I made some other minor changes, including forcing the transformers to return pandas DataFrames rather than NumPy arrays, so that column names survive between the chained steps (see the note and sketch just below).
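
    As a side note, besides the global set_config(transform_output='pandas') used in the code below, the same behaviour can also be enabled per estimator via set_output (available from scikit-learn 1.2 onwards). A tiny illustrative sketch, independent of the Titanic-specific code:

    import pandas as pd
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler

    toy = pd.DataFrame({"Fare": [7.25, 71.28, 8.05]})

    # Per-estimator alternative to the global set_config(transform_output='pandas')
    ct = ColumnTransformer([("num", StandardScaler(), ["Fare"])])
    ct.set_output(transform="pandas")
    print(type(ct.fit_transform(toy)))  # <class 'pandas.core.frame.DataFrame'>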

    The new preprocessor:

    [diagram of the new chained preprocessing pipeline]

    Its output column names:

    Index(['Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
           'traveling_category_A', 'traveling_category_B', 'traveling_category_C',
           'age_interval_(0, 10]', 'age_interval_(10, 20]',
           'age_interval_(20, 30]', 'age_interval_(30, 40]',
           'age_interval_(40, 50]', 'age_interval_(50, 60]',
           'age_interval_(60, 70]', 'age_interval_(70, 100]', 'Pclass', 'Fare',
           'PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'],
          dtype='object')
    

    Code:

    import numpy as np
    import pandas as pd

    titanic_data = pd.read_csv('../titanic.csv')

    from sklearn.pipeline import make_pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder, OneHotEncoder, StandardScaler
    from sklearn.impute import SimpleImputer

    from sklearn import set_config
    set_config(transform_output='pandas')
    
    #
    # Sum relatives
    #
    def sum_name(function_transformer, feature_names_in):
        return ["total_relatives"]  # feature names out
    
    def sum_relatives(X):
        X_copy = X.copy()
        X_copy['total_relatives'] = X_copy['SibSp'] + X_copy['Parch']
        return X_copy[['total_relatives']]
    
    total_relatives_pipeline = make_pipeline(
        FunctionTransformer(sum_relatives, feature_names_out=sum_name)
    )
    
    #
    # Categorize travel
    #
    def cat_travel_name(function_transformer, feature_names_in):
        return ["traveling_category"]  # feature names out
    
    def categorize_travel(X):
        X_copy = X.copy()
    
        conditions = [
            (X_copy['total_relatives'] == 0),
            (X_copy['total_relatives'] >= 1) & (X_copy['total_relatives'] <= 3),
            (X_copy['total_relatives'] >= 4)
        ]
        categories = ['A', 'B', 'C']
    
        X_copy['traveling_category'] = np.select(conditions, categories, default='Unknown')
        return X_copy[['traveling_category']]
    
    travel_category_pipeline = make_pipeline(
        FunctionTransformer(categorize_travel, feature_names_out=cat_travel_name)
    )
    
    #
    # Ordinal encoder
    #
    class_order = [[1, 2, 3]]
    
    ord_pipeline = make_pipeline(
        OrdinalEncoder(categories=class_order)    
        )
    
    cat_pipeline = make_pipeline(
        SimpleImputer(strategy="most_frequent"),
        OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        )
    
    # Numerical pipeline for Fare
    fare_pipeline = make_pipeline(
            StandardScaler()
        )
    
    #
    # Age transformer
    #
    def interval_name(function_transformer, feature_names_in):
        return ["age_interval"]  # feature names out
    
    def age_transformer(X):
        X_copy = X.copy()
        # Fill missing ages with the median age of the passenger's class
        median_age_by_class = X_copy.groupby('Pclass')['Age'].median().reset_index()
        median_age_by_class.columns = ['Pclass', 'median_age']
        for index, row in median_age_by_class.iterrows():
            class_value = row['Pclass']
            median_age = row['median_age']
            X_copy.loc[X_copy['Pclass'] == class_value, 'Age'] = \
                X_copy.loc[X_copy['Pclass'] == class_value, 'Age'].fillna(median_age)
        # Bin the (now complete) ages into fixed intervals
        bins = [0, 10, 20, 30, 40, 50, 60, 70, 100]
        X_copy['age_interval'] = pd.cut(X_copy['Age'], bins=bins)
        return X_copy[['age_interval']]
    
    def age_processor():
        return make_pipeline(
            FunctionTransformer(age_transformer, feature_names_out=interval_name),
            )
    
    #
    # Column transformers
    #
    preprocessing_initial = ColumnTransformer(
        [
            ("ord", ord_pipeline, ['Pclass']),
            ("age_processing", age_processor(), ['Pclass', 'Age']),
            ("num", fare_pipeline, ['Fare']),
            ("total_relatives", total_relatives_pipeline, ['SibSp', 'Parch']),
        ],
        remainder='passthrough',
        verbose_feature_names_out=False
    )
    
    preprocessing_travel_category = ColumnTransformer(
        [("travel_category", travel_category_pipeline, ['total_relatives'])],
        remainder='passthrough',
        verbose_feature_names_out=False
    )
    
    preprocessing_cat = ColumnTransformer(
        [("cat", cat_pipeline, ['Sex', 'Embarked', 'traveling_category', 'age_interval'])],
        remainder='passthrough',
        verbose_feature_names_out=False
    )
    
    # Final preprocessor: the three ColumnTransformers chained in sequence
    preprocessor = make_pipeline(
        preprocessing_initial,
        preprocessing_travel_category,
        preprocessing_cat
    )
    
    # Run: fit the chained preprocessor on the raw data and inspect the output feature names
    data_processed = preprocessor.fit_transform(titanic_data)
    print(data_processed.columns)
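
    Finally, a minimal usage sketch of how the chained preprocessor could feed a model. This is illustrative only: it assumes the standard Kaggle Titanic training file with 'Survived' as the target, drops the identifier and free-text columns so no raw strings reach the estimator through remainder='passthrough', and uses RandomForestClassifier purely as a placeholder estimator:

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_val_score

    # Target and feature matrix; 'PassengerId', 'Name', 'Ticket', 'Cabin' are dropped
    # because the preprocessor would otherwise pass them through as raw strings.
    X = titanic_data.drop(columns=['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin'])
    y = titanic_data['Survived']

    # Chain the preprocessing ColumnTransformers with a classifier; cross_val_score
    # clones and refits the whole pipeline on each fold.
    model = make_pipeline(
        preprocessing_initial,
        preprocessing_travel_category,
        preprocessing_cat,
        RandomForestClassifier(random_state=0),
    )

    print(cross_val_score(model, X, y, cv=5).mean())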