Custom Transformers work individually but breaks down when combining them into one Pipeline

I need some help debugging. I am using scikit-learn to process some data and train an ML model to predict housing prices. I have come up with 2 custom transformers that takes care of unwanted features, and also combines a few features to create new features. Both custom transformers work individually when I call them but as soon as I combine them into a single pipeline to improve the workflow, I get an error. im not sure what the issue is. for example, here is the first transformer:

from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.columns]

then testing it out:

relevant_columns = ['OverallQual','GrLivArea','GarageCars','GarageArea','YearBuilt','BsmtFinSF1','FullBath', 
                    'GarageYrBlt', 'TotalBsmtSF', '2ndFlrSF', '1stFlrSF', 'HalfBath']

cs = ColumnSelector(columns=relevant_columns)
transformed = cs.fit_transform(X_train)

transformed.head()

returns this dataframe.

similarly,



class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    total_bsmt_sa_ix, second_flr_ix, first_flr_ix, full_bath_ix, half_bath_ix = [
    list(transformed.columns).index(col) for col in ('TotalBsmtSF', '2ndFlrSF', '1stFlrSF', 'FullBath', 'HalfBath')]

    def __init__(self, add_total_sa=True, add_total_baths=True):
        self.add_total_sa = add_total_sa
        self.add_total_baths = add_total_baths

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):

            if self.add_total_sa and self.add_total_baths:
                total_sa = X[:, total_bsmt_sa_ix] + X[:, second_flr_ix] + X[:, first_flr_ix]
                total_bath = X[:, full_bath_ix] + X[:, half_bath_ix]/2
                return np.c_[X, total_sa, total_bath]

            elif self.add_total_sa:
                total_sa = X[:, total_bsmt_sa_ix] + X[:, second_flr_ix] + X[:, first_flr_ix]
                return np.c_[X, total_sa]

            elif self.add_total_baths:
                total_bath = X[:, full_bath_ix] + X[:, half_bath_ix]/2
                return np.c_[X, total_bath]

            else:
                pass

atr_adder = CombinedAttributesAdder()
housing_extra_attr = atr_adder.transform(transformed.values)

housing_extra_attr = pd.DataFrame(housing_extra_attr, columns=relevant_columns+['total_sa', 'total_bath'], index=transformed.index)
housing_extra_attr.head()

returns this

however, when I make a pipeline like so:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline([
    ('column_selector', ColumnSelector(columns=relevant_columns)),
    ('attr adder', CombinedAttributesAdder()),
    ('scaler', StandardScaler())
])

X_train_prepd = pipeline.fit(X_train)

I get this error message


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-107-ab78197544be> in <module>
      8 ])
      9 
---> 10 X_train_prepd = pipeline.fit(X_train)

~\Anaconda3\envs\ml_book\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
    350             This estimator
    351         """
--> 352         Xt, fit_params = self._fit(X, y, **fit_params)
    353         with _print_elapsed_time('Pipeline',
    354                                  self._log_message(len(self.steps) - 1)):

~\Anaconda3\envs\ml_book\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
    315                 message_clsname='Pipeline',
    316                 message=self._log_message(step_idx),
--> 317                 **fit_params_steps[name])
    318             # Replace the transformer of the step with the fitted
    319             # transformer. This is necessary when loading the transformer

~\Anaconda3\envs\ml_book\lib\site-packages\joblib\memory.py in __call__(self, *args, **kwargs)
    353 
    354     def __call__(self, *args, **kwargs):
--> 355         return self.func(*args, **kwargs)
    356 
    357     def call_and_shelve(self, *args, **kwargs):

~\Anaconda3\envs\ml_book\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    714     with _print_elapsed_time(message_clsname, message):
    715         if hasattr(transformer, 'fit_transform'):
--> 716             res = transformer.fit_transform(X, y, **fit_params)
    717         else:
    718             res = transformer.fit(X, y, **fit_params).transform(X)

~\Anaconda3\envs\ml_book\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
    551         if y is None:
    552             # fit method of arity 1 (unsupervised transformation)
--> 553             return self.fit(X, **fit_params).transform(X)
    554         else:
    555             # fit method of arity 2 (supervised transformation)

<ipython-input-94-607115cdc09e> in transform(self, X, y)
     13 
     14             if self.add_total_sa and self.add_total_baths:
---> 15                 total_sa = X[:, total_bsmt_sa_ix] + X[:, second_flr_ix] + X[:, first_flr_ix]
     16                 total_bath = X[:, full_bath_ix] + X[:, half_bath_ix]/2
     17                 return np.c_[X, total_sa, total_bath]

~\Anaconda3\envs\ml_book\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   2978             if self.columns.nlevels > 1:
   2979                 return self._getitem_multilevel(key)
-> 2980             indexer = self.columns.get_loc(key)
   2981             if is_integer(indexer):
   2982                 indexer = [indexer]

~\Anaconda3\envs\ml_book\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2895                 )
   2896             try:
-> 2897                 return self._engine.get_loc(key)
   2898             except KeyError:
   2899                 return self._engine.get_loc(self._maybe_cast_indexer(key))

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

TypeError: '(slice(None, None, None), 8)' is an invalid key

Does anyone have an idea on what could be going wrong? Im really floored here. Thanks for the help

Solution

Figured out what the problem was. The issue was coming from how I was calling the indices of the columns in the CombinedAttributesAdder(). the solution was to change the calculations to the following:

if self.add_total_sa and self.add_total_baths:
            total_sa = X.iloc[:, total_bsmt_sa_ix] + X.iloc[:, second_flr_ix] + X.iloc[:, first_flr_ix]
            total_bath = X.iloc[:, full_bath_ix] + X.iloc[:, half_bath_ix]/2
            return np.c_[X, total_sa, total_bath]

        elif self.add_total_sa:
            total_sa = X.iloc[:, total_bsmt_sa_ix] + X.iloc[:, second_flr_ix] + X.iloc[:, first_flr_ix]
            return np.c_[X, total_sa]

        elif self.add_total_baths:
            total_bath = X.iloc[:, full_bath_ix] + X.iloc[:, half_bath_ix]/2
            return np.c_[X, total_bath]

        else:
            pass

all I did was add X.iloc in each of the calculations