Search code examples
python numpy scikit-learn reshape sklearn-pandas

Error using DataFrameMapper() with PolynomialFeatures() in sklearn


For the housing data set, I am trying to use DataFrameMapper() from sklearn_pandas to apply polynomial features to selected columns.

My code:

 from sklearn.preprocessing import PolynomialFeatures
 from sklearn_pandas import DataFrameMapper

 # The two polynomial columns are selected with plain strings (not lists);
 # this is the form that raises the "Expected 2D array, got 1D array" error
 # reported below. The remaining columns are passed through untransformed.
 mapper = DataFrameMapper([
     ('houseAge_income', PolynomialFeatures(2)),
     ('median_income', PolynomialFeatures(2)),
     (['latitude', 'housing_median_age', 'total_rooms', 'population',
       'median_house_value', 'ocean_proximity'], None),
 ])

 poly_feature = mapper.fit_transform(housing)

I was getting this error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-44-30679ae791ae> in <module>
     11 
     12 # fit
---> 13 poly_feature = mapper.fit_transform(df)

e:\Anaconda3\lib\site-packages\sklearn_pandas\dataframe_mapper.py in fit_transform(self, X, y)
    397         y       the target vector relative to X, optional
    398         """
--> 399         return self._transform(X, y, True)

e:\Anaconda3\lib\site-packages\sklearn_pandas\dataframe_mapper.py in _transform(self, X, y, do_fit)
    308                 with add_column_names_to_exception(columns):
    309                     if do_fit and hasattr(transformers, 'fit_transform'):
--> 310                         Xt = _call_fit(transformers.fit_transform, Xt, y)
    311                     else:
    312                         if do_fit:

e:\Anaconda3\lib\site-packages\sklearn_pandas\pipeline.py in _call_fit(fit_method, X, y, **kwargs)
     22     """
     23     try:
---> 24         return fit_method(X, y, **kwargs)
     25     except TypeError:
     26         # fit takes only one argument

e:\Anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
    688         if y is None:
    689             # fit method of arity 1 (unsupervised transformation)
--> 690             return self.fit(X, **fit_params).transform(X)
    691         else:
    692             # fit method of arity 2 (supervised transformation)

e:\Anaconda3\lib\site-packages\sklearn\preprocessing\_data.py in fit(self, X, y)
   1510         self : instance
   1511         """
-> 1512         n_samples, n_features = self._validate_data(
   1513             X, accept_sparse=True).shape
   1514         combinations = self._combinations(n_features, self.degree,

e:\Anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
    418                     f"requires y to be passed, but the target y is None."
    419                 )
--> 420             X = check_array(X, **check_params)
    421             out = X
    422         else:

e:\Anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     70                           FutureWarning)
     71         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72         return f(**kwargs)
     73     return inner_f
     74 

e:\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
    617             # If input is 1D raise error
    618             if array.ndim == 1:
--> 619                 raise ValueError(
    620                     "Expected 2D array, got 1D array instead:\narray={}.\n"
    621                     "Reshape your data either using array.reshape(-1, 1) if "

ValueError: houseAge_income: Expected 2D array, got 1D array instead:
array=[341.3332 174.3294 377.3848 ...  28.9     33.6096  38.2176].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

When I tried using

houseAge_income.reshape(-1, 1)

inside DataFrameMapper(), I got another issue:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2645             try:
-> 2646                 return self._engine.get_loc(key)
   2647             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'houseAge_income.reshape(-1, 1)'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
5 frames
/usr/local/lib/python3.6/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2646                 return self._engine.get_loc(key)
   2647             except KeyError:
-> 2648                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2649         indexer = self.get_indexer([key], method=method, tolerance=tolerance)
   2650         if indexer.ndim > 1 or indexer.size > 1:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()


pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'houseAge_income.reshape(-1, 1)'

Can anyone tell me what I am missing?

I understand there's something wrong with the shape, but I am unable to figure out what. "Shape error when using PolynomialFeatures" doesn't help.

Note: houseAge_income is an interaction term created by:

housing['houseAge_income'] = housing['housing_median_age']*housing['median_income']

Solution

    • From the sklearn-pandas documentation:
      • The difference between specifying the column selector as 'column' (as a simple string) and ['column'] (as a list with one element) is the shape of the array that is passed to the transformer. In the first case, a one dimensional array will be passed, while in the second case it will be a 2-dimensional array with one column, i.e. a column vector.
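    • Pass all of the columns with the same type of column selector.
      • In this case, a list: PolynomialFeatures requires a 2-dimensional input (hence the "Expected 2D array, got 1D array" error), and the non-transformed columns to keep are already given as a list.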
    import pandas as pd
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn_pandas import DataFrameMapper
    
    # Read the California housing CSV straight from the handson-ml2 repository.
    housing_csv = 'https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv'
    df = pd.read_csv(housing_csv)

    # Interaction term: housing age multiplied by median income.
    df['houseAge_income'] = df['housing_median_age'] * df['median_income']

    # Columns expanded with degree-2 polynomial features. Each is wrapped in a
    # one-element list so the transformer receives a 2-D column vector.
    poly_columns = ('houseAge_income', 'median_income')
    # Columns carried over untouched (transformer is None).
    passthrough_columns = ['latitude', 'housing_median_age', 'total_rooms', 'population', 'median_house_value', 'ocean_proximity']

    # One fresh PolynomialFeatures instance per polynomial column, followed by
    # the pass-through group, in the same order as the resulting output columns.
    feature_defs = [([column], PolynomialFeatures(2)) for column in poly_columns]
    feature_defs.append((passthrough_columns, None))
    mapper = DataFrameMapper(feature_defs)

    # Fit the mapper and transform the frame in one step.
    poly_feature = mapper.fit_transform(df)
    
    # display(pd.DataFrame(poly_feature).head())
      0       1           2  3       4       5      6   7     8     9          10        11
    0  1  341.33  1.1651e+05  1  8.3252  69.309  37.88  41   880   322  4.526e+05  NEAR BAY
    1  1  174.33       30391  1  8.3014  68.913  37.86  21  7099  2401  3.585e+05  NEAR BAY
    2  1  377.38  1.4242e+05  1  7.2574   52.67  37.85  52  1467   496  3.521e+05  NEAR BAY
    3  1  293.44       86108  1  5.6431  31.845  37.85  52  1274   558  3.413e+05  NEAR BAY
    4  1     200       40001  1  3.8462  14.793  37.85  52  1627   565  3.422e+05  NEAR BAY