python dataframe scikit-learn categorical-data one-hot-encoding

IndexError: arrays used as indices must be of integer (or boolean) type Error during OneHotEncoding

I have a dataframe containing categorical variables to which I'd like to apply OneHotEncoder. The problem goes away if I use LabelEncoder before OneHotEncoder, but that does not make sense to me, since in recent versions OneHotEncoder accepts strings for categorical variables directly.

Example dataframe you can test the code on :

# Toy frame with three rows: five string-valued columns (col1-col5), one
# integer column (col6) and a 0/1 "target" column. Each inner dict maps a
# row label (0, 1, 2) to that row's value.
data = pd.DataFrame({'col1': {0: 'ab321', 1: 'ab568', 2: 'mkld78'},
 'col2': {0: 'Red', 1: 'Blue', 2: 'Green'},
 'col3': {0: 'First', 1: 'Second', 2: 'Third'},
 'col4': {0: 'Wisconsin', 1: 'California', 2: 'Portland'},
 'col5': {0: 'a', 1: 'f', 2: 'g'},
 'col6': {0: 1, 1: 2, 2: 3},
 'target': {0: 0, 1: 0, 2: 1}})

Here's what I tried:

I have tried selecting the columns both by index position and by name to solve the error:

#Index
# OneHotEncoding

from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd

#Load data
train = pd.read_csv("data_train.csv")
test = pd.read_csv("data_test.csv")

# Split the training frame into the feature columns (X) and the "target"
# column (y).
X= train.drop(["target"], axis = 1)
y= train["target"]
# Filter categorical columns
# categorical_columns lists column *names*; categorical_indexes holds the
# integer positions of the columns of X whose dtype is object.
categorical_columns = ["col1","col2","col3","col4","col5"]
categorical_indexes = np.where(X.dtypes == 'object')[0]

# OHE
# NOTE(review): categorical_features receives column-name strings here. The
# traceback in this post shows scikit-learn using that value to index a
# boolean array, which is where the IndexError is raised.
ohe = OneHotEncoder(categorical_features = categorical_columns)
# reshape data
# Each object-dtype column is selected by position, reshaped to a
# single-column 2-D array and passed to fit_transform on its own, so the
# encoder is re-fitted on every iteration.
for index in categorical_indexes:
    X.iloc[:,index] = ohe.fit_transform(X.iloc[:,index].values.reshape(-1,1))

#Column Names

# OneHotEncoding

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

train = pd.read_csv("data_train.csv")
test = pd.read_csv("data_test.csv")

# Split the training frame into the feature columns (X) and the "target"
# column (y).
X= train.drop(["target"], axis = 1)
y= train["target"]

# Filter categorical columns
# categorical_columns lists column *names*; categorical_indexes (the integer
# positions of the object-dtype columns) is computed but not used below.
categorical_columns = ["col1","col2","col3","col4","col5"]
categorical_indexes = np.where(X.dtypes == 'object')[0]

# OHE
# NOTE(review): as in the index-based attempt, categorical_features receives
# column-name strings rather than integer positions or a boolean mask.
ohe = OneHotEncoder(categorical_features = categorical_columns)
# reshape data
# Each listed column is selected by name, reshaped to a single-column 2-D
# array and passed to fit_transform on its own, so the encoder is re-fitted
# on every iteration.
for column in categorical_columns:
    X[column] = ohe.fit_transform(X[column].values.reshape(-1,1))

Error Traceback:

IndexError                                Traceback (most recent call last)
<ipython-input-86-17c86bf649e2> in <module>
     11 # reshape data
     12 for index in categorical_indexes:
---> 13     X.iloc[:,index] = ohe.fit_transform(X.iloc[:,index].values.reshape(-1,1))
     14 

c:\users\m\appdata\local\programs\python\python37\lib\site-packages\sklearn\preprocessing\_encoders.py in fit_transform(self, X, y)
    622         self._validate_keywords()
    623 
--> 624         self._handle_deprecations(X)
    625 
    626         if self._legacy_mode:

c:\users\m\appdata\local\programs\python\python37\lib\site-packages\sklearn\preprocessing\_encoders.py in _handle_deprecations(self, X)
    453                 n_features = X.shape[1]
    454                 sel = np.zeros(n_features, dtype=bool)
--> 455                 sel[np.asarray(self.categorical_features)] = True
    456                 if sum(sel) == 0:
    457                     self.categories_ = []

IndexError: arrays used as indices must be of integer (or boolean) type

Solution

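You are misunderstanding how `OneHotEncoder` is meant to be used. The `IndexError` arises because `categorical_features` is used to index a boolean array (see `sel[np.asarray(self.categorical_features)] = True` in the traceback), so it must contain integer positions or a boolean mask, not column names. Beyond that, the encoder should be fitted once on the whole training set rather than column by column.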

Use this:

# OneHotEncoding

# Imports come first so that `pd` is defined before the example frame is
# built; otherwise the snippet raises NameError when run top to bottom.
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Toy frame with three rows: five string-valued columns (col1-col5), one
# integer column (col6) and a 0/1 "target" column.
data = pd.DataFrame({'col1': {0: 'ab321', 1: 'ab568', 2: 'mkld78'},
 'col2': {0: 'Red', 1: 'Blue', 2: 'Green'},
 'col3': {0: 'First', 1: 'Second', 2: 'Third'},
 'col4': {0: 'Wisconsin', 1: 'California', 2: 'Portland'},
 'col5': {0: 'a', 1: 'f', 2: 'g'},
 'col6': {0: 1, 1: 2, 2: 3},
 'target': {0: 0, 1: 0, 2: 1}})

# The first two rows act as the training set, the last row as the test set.
train = data.iloc[0:2,:]
test = data.iloc[2:,:]

# Separate the feature columns from the label column.
X = train.drop(["target"], axis=1)
y = train["target"]

# Names of the string-valued columns. They are not passed to the encoder
# below; pass X[categorical_columns] to fit_transform instead of X if only
# these columns should be encoded.
categorical_columns = ["col1","col2","col3","col4","col5"]

# OHE
# Fit one encoder on the whole feature frame in a single call instead of
# re-fitting it column by column. No categorical_features argument is needed.
ohe = OneHotEncoder()
X_ = ohe.fit_transform(X)

X_
# <2x12 sparse matrix of type '<type 'numpy.float64'>'
#  with 12 stored elements in Compressed Sparse Row format>
# 12 output columns = 6 feature columns x 2 distinct values each in the two
# training rows (the integer col6 is encoded as well).