I need some help debugging. I am using scikit-learn to process some data and train an ML model to predict housing prices. I have come up with 2 custom transformers that takes care of unwanted features, and also combines a few features to create new features. Both custom transformers work individually when I call them but as soon as I combine them into a single pipeline to improve the workflow, I get an error. im not sure what the issue is. for example, here is the first transformer:
from sklearn.base import BaseEstimator, TransformerMixin
class ColumnSelector(BaseEstimator, TransformerMixin):
def __init__(self, columns):
self.columns = columns
def fit(self, X, y=None):
return self
def transform(self, X):
return X[self.columns]
then testing it out:
relevant_columns = ['OverallQual','GrLivArea','GarageCars','GarageArea','YearBuilt','BsmtFinSF1','FullBath',
'GarageYrBlt', 'TotalBsmtSF', '2ndFlrSF', '1stFlrSF', 'HalfBath']
cs = ColumnSelector(columns=relevant_columns)
transformed = cs.fit_transform(X_train)
transformed.head()
similarly,
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
total_bsmt_sa_ix, second_flr_ix, first_flr_ix, full_bath_ix, half_bath_ix = [
list(transformed.columns).index(col) for col in ('TotalBsmtSF', '2ndFlrSF', '1stFlrSF', 'FullBath', 'HalfBath')]
def __init__(self, add_total_sa=True, add_total_baths=True):
self.add_total_sa = add_total_sa
self.add_total_baths = add_total_baths
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
if self.add_total_sa and self.add_total_baths:
total_sa = X[:, total_bsmt_sa_ix] + X[:, second_flr_ix] + X[:, first_flr_ix]
total_bath = X[:, full_bath_ix] + X[:, half_bath_ix]/2
return np.c_[X, total_sa, total_bath]
elif self.add_total_sa:
total_sa = X[:, total_bsmt_sa_ix] + X[:, second_flr_ix] + X[:, first_flr_ix]
return np.c_[X, total_sa]
elif self.add_total_baths:
total_bath = X[:, full_bath_ix] + X[:, half_bath_ix]/2
return np.c_[X, total_bath]
else:
pass
atr_adder = CombinedAttributesAdder()
housing_extra_attr = atr_adder.transform(transformed.values)
housing_extra_attr = pd.DataFrame(housing_extra_attr, columns=relevant_columns+['total_sa', 'total_bath'], index=transformed.index)
housing_extra_attr.head()
however, when I make a pipeline like so:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
pipeline = Pipeline([
('column_selector', ColumnSelector(columns=relevant_columns)),
('attr adder', CombinedAttributesAdder()),
('scaler', StandardScaler())
])
X_train_prepd = pipeline.fit(X_train)
I get this error message
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-107-ab78197544be> in <module>
8 ])
9
---> 10 X_train_prepd = pipeline.fit(X_train)
~\Anaconda3\envs\ml_book\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
350 This estimator
351 """
--> 352 Xt, fit_params = self._fit(X, y, **fit_params)
353 with _print_elapsed_time('Pipeline',
354 self._log_message(len(self.steps) - 1)):
~\Anaconda3\envs\ml_book\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
315 message_clsname='Pipeline',
316 message=self._log_message(step_idx),
--> 317 **fit_params_steps[name])
318 # Replace the transformer of the step with the fitted
319 # transformer. This is necessary when loading the transformer
~\Anaconda3\envs\ml_book\lib\site-packages\joblib\memory.py in __call__(self, *args, **kwargs)
353
354 def __call__(self, *args, **kwargs):
--> 355 return self.func(*args, **kwargs)
356
357 def call_and_shelve(self, *args, **kwargs):
~\Anaconda3\envs\ml_book\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
714 with _print_elapsed_time(message_clsname, message):
715 if hasattr(transformer, 'fit_transform'):
--> 716 res = transformer.fit_transform(X, y, **fit_params)
717 else:
718 res = transformer.fit(X, y, **fit_params).transform(X)
~\Anaconda3\envs\ml_book\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
551 if y is None:
552 # fit method of arity 1 (unsupervised transformation)
--> 553 return self.fit(X, **fit_params).transform(X)
554 else:
555 # fit method of arity 2 (supervised transformation)
<ipython-input-94-607115cdc09e> in transform(self, X, y)
13
14 if self.add_total_sa and self.add_total_baths:
---> 15 total_sa = X[:, total_bsmt_sa_ix] + X[:, second_flr_ix] + X[:, first_flr_ix]
16 total_bath = X[:, full_bath_ix] + X[:, half_bath_ix]/2
17 return np.c_[X, total_sa, total_bath]
~\Anaconda3\envs\ml_book\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2978 if self.columns.nlevels > 1:
2979 return self._getitem_multilevel(key)
-> 2980 indexer = self.columns.get_loc(key)
2981 if is_integer(indexer):
2982 indexer = [indexer]
~\Anaconda3\envs\ml_book\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2895 )
2896 try:
-> 2897 return self._engine.get_loc(key)
2898 except KeyError:
2899 return self._engine.get_loc(self._maybe_cast_indexer(key))
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
TypeError: '(slice(None, None, None), 8)' is an invalid key
Does anyone have an idea on what could be going wrong? Im really floored here. Thanks for the help
Figured out what the problem was.
The issue was coming from how I was calling the indices of the columns in the CombinedAttributesAdder()
. the solution was to change the calculations to the following:
if self.add_total_sa and self.add_total_baths:
total_sa = X.iloc[:, total_bsmt_sa_ix] + X.iloc[:, second_flr_ix] + X.iloc[:, first_flr_ix]
total_bath = X.iloc[:, full_bath_ix] + X.iloc[:, half_bath_ix]/2
return np.c_[X, total_sa, total_bath]
elif self.add_total_sa:
total_sa = X.iloc[:, total_bsmt_sa_ix] + X.iloc[:, second_flr_ix] + X.iloc[:, first_flr_ix]
return np.c_[X, total_sa]
elif self.add_total_baths:
total_bath = X.iloc[:, full_bath_ix] + X.iloc[:, half_bath_ix]/2
return np.c_[X, total_bath]
else:
pass
all I did was add X.iloc
in each of the calculations