For the housing data set, I am trying to use DataFrameMapper() from sklearn_pandas to apply polynomial features to selected columns.
My code:
from sklearn.preprocessing import PolynomialFeatures
from sklearn_pandas import DataFrameMapper
mapper = DataFrameMapper([
('houseAge_income', PolynomialFeatures(2)),
('median_income', PolynomialFeatures(2)),
(['latitude', 'housing_median_age', 'total_rooms', 'population', 'median_house_value',
'ocean_proximity']], None)
])
poly_feature = mapper.fit_transform(housing)
I was getting this error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-44-30679ae791ae> in <module>
11
12 # fit
---> 13 poly_feature = mapper.fit_transform(df)
e:\Anaconda3\lib\site-packages\sklearn_pandas\dataframe_mapper.py in fit_transform(self, X, y)
397 y the target vector relative to X, optional
398 """
--> 399 return self._transform(X, y, True)
e:\Anaconda3\lib\site-packages\sklearn_pandas\dataframe_mapper.py in _transform(self, X, y, do_fit)
308 with add_column_names_to_exception(columns):
309 if do_fit and hasattr(transformers, 'fit_transform'):
--> 310 Xt = _call_fit(transformers.fit_transform, Xt, y)
311 else:
312 if do_fit:
e:\Anaconda3\lib\site-packages\sklearn_pandas\pipeline.py in _call_fit(fit_method, X, y, **kwargs)
22 """
23 try:
---> 24 return fit_method(X, y, **kwargs)
25 except TypeError:
26 # fit takes only one argument
e:\Anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
688 if y is None:
689 # fit method of arity 1 (unsupervised transformation)
--> 690 return self.fit(X, **fit_params).transform(X)
691 else:
692 # fit method of arity 2 (supervised transformation)
e:\Anaconda3\lib\site-packages\sklearn\preprocessing\_data.py in fit(self, X, y)
1510 self : instance
1511 """
-> 1512 n_samples, n_features = self._validate_data(
1513 X, accept_sparse=True).shape
1514 combinations = self._combinations(n_features, self.degree,
e:\Anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
418 f"requires y to be passed, but the target y is None."
419 )
--> 420 X = check_array(X, **check_params)
421 out = X
422 else:
e:\Anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
e:\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
617 # If input is 1D raise error
618 if array.ndim == 1:
--> 619 raise ValueError(
620 "Expected 2D array, got 1D array instead:\narray={}.\n"
621 "Reshape your data either using array.reshape(-1, 1) if "
ValueError: houseAge_income: Expected 2D array, got 1D array instead:
array=[341.3332 174.3294 377.3848 ... 28.9 33.6096 38.2176].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
When I tried using
houseAge_income.reshape(-1, 1)
inside DataFrameMapper(), I got another issue:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2645 try:
-> 2646 return self._engine.get_loc(key)
2647 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'houseAge_income.reshape(-1, 1)'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
5 frames
/usr/local/lib/python3.6/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2646 return self._engine.get_loc(key)
2647 except KeyError:
-> 2648 return self._engine.get_loc(self._maybe_cast_indexer(key))
2649 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
2650 if indexer.ndim > 1 or indexer.size > 1:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'houseAge_income.reshape(-1, 1)'
Can anyone tell me what I am missing?
I understand there is something wrong with the shape, but I am unable to figure out what. The existing question "Shape error when using PolynomialFeatures" doesn't help.
Note: houseAge_income is an interaction term created by
housing['houseAge_income'] = housing['housing_median_age']*housing['median_income']
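The difference between specifying the column selector as 'column' (as a simple string) and ['column'] (as a list with one element) is the shape of the array that is passed to the transformer. In the first case, a one-dimensional array will be passed, while in the second case it will be a 2-dimensional array with one column, i.e. a column vector.
PolynomialFeatures requires a 2D array, which is why selecting 'houseAge_income' as a plain string raises "Expected 2D array, got 1D array instead". Pass every column selector as a list, since there's a list of non-transformed columns to keep.
import pandas as pd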
from sklearn.preprocessing import PolynomialFeatures
from sklearn_pandas import DataFrameMapper
# load data
df = pd.read_csv('https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv')
# create houseAge_income
df['houseAge_income'] = df.housing_median_age.mul(df.median_income)
# configure mapper with all columns passed as lists
mapper = DataFrameMapper([(['houseAge_income'], PolynomialFeatures(2)),
(['median_income'], PolynomialFeatures(2)),
(['latitude', 'housing_median_age', 'total_rooms', 'population', 'median_house_value', 'ocean_proximity'], None)])
# fit
poly_feature = mapper.fit_transform(df)
# display(pd.DataFrame(poly_feature).head())
0 1 2 3 4 5 6 7 8 9 10 11
0 1 341.33 1.1651e+05 1 8.3252 69.309 37.88 41 880 322 4.526e+05 NEAR BAY
1 1 174.33 30391 1 8.3014 68.913 37.86 21 7099 2401 3.585e+05 NEAR BAY
2 1 377.38 1.4242e+05 1 7.2574 52.67 37.85 52 1467 496 3.521e+05 NEAR BAY
3 1 293.44 86108 1 5.6431 31.845 37.85 52 1274 558 3.413e+05 NEAR BAY
4 1 200 40001 1 3.8462 14.793 37.85 52 1627 565 3.422e+05 NEAR BAY