Search code examples
pythonpandasone-hot-encoding

Pandas df.get_dummies() returns "ValueError: could not convert string to float"


I am trying to one-hot encode several categorical columns using Pandas' df.get_dummies() and it is returning an error that I don't understand. The error says ValueError: could not convert string to float: 'Warm Cool'. What might be causing this issue and how can I successfully one-hot encode all of the columns with dtype == object?

My dataset comes from the DC_Properties.CSV file found here.

My code and the error message:

"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
Import packages section
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
Read data section
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
df = pd.read_csv('DC_Properties.csv', index_col=0)

"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
Preprocess data section
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# remove rows without sales prices
df = df[df.PRICE.notnull()]

# create month sold column
df['MONTHSOLD'] = [i[:i.find('/')] if type(i) == str else i for i in df.SALEDATE]

# create year sold column
df['YEARSOLD'] = [i[-4:] if type(i) == str else i for i in df.SALEDATE]

# join GBA and Living GBA
df['GBA'] = df['GBA'].fillna(df['LIVING_GBA'])

# remove unused columns
unused_cols = ['SALEDATE',
               'GIS_LAST_MOD_DTTM', 
               'CMPLX_NUM', 
               'LIVING_GBA', 
               'FULLADDRESS', 
               'CITY', 
               'STATE', 
               'NATIONALGRID',
               'ASSESSMENT_SUBNBHD',
               'CENSUS_TRACT',
               'CENSUS_BLOCK',
               'X',
               'Y']
df = df.drop(unused_cols, axis=1)

# one-hot encode categorical variables
pd.get_dummies(df, dummy_na=True)


# standardize the data 
scaler = StandardScaler()
dataset = scaler.fit_transform(df)

# specify x and y variables
x = dataset[:,-y_idx]
y = dataset[:,'PRICE']

# split data into a train and test set
np.random.seed(123)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-81-62c3931b3dfa> in <module>
     33 # standardize the data
     34 scaler = StandardScaler()
---> 35 dataset = scaler.fit_transform(df)
     36 
     37 # specify x and y variables

~\Anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
    551         if y is None:
    552             # fit method of arity 1 (unsupervised transformation)
--> 553             return self.fit(X, **fit_params).transform(X)
    554         else:
    555             # fit method of arity 2 (supervised transformation)

~\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py in fit(self, X, y)
    637         # Reset internal state before fitting
    638         self._reset()
--> 639         return self.partial_fit(X, y)
    640 
    641     def partial_fit(self, X, y=None):

~\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py in partial_fit(self, X, y)
    661         X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
    662                         estimator=self, dtype=FLOAT_DTYPES,
--> 663                         force_all_finite='allow-nan')
    664 
    665         # Even in the case of `with_mean=False`, we update the mean anyway

~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    494             try:
    495                 warnings.simplefilter('error', ComplexWarning)
--> 496                 array = np.asarray(array, dtype=dtype, order=order)
    497             except ComplexWarning:
    498                 raise ValueError("Complex data not supported\n"

~\Anaconda3\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order)
     83 
     84     """
---> 85     return array(a, dtype, copy=False, order=order)
     86 
     87 

ValueError: could not convert string to float: 'Warm Cool'

Solution

  • It's actually the StandardScaler that throws an error because it encounters strings.

    The reason is that you are using pd.dummies, but you never assign the returned dataframe.

    # one-hot encode categorical variables
    pd.get_dummies(df, dummy_na=True) # <------ is lost
    

    To fix it change it to:

    # one-hot encode categorical variables
    df = pd.get_dummies(df, dummy_na=True)