I have a dataframe containing categorical variables to which I'd like to apply OneHotEncoder. Running LabelEncoder before OneHotEncoder solves my problem, but that does not make sense to me, since with the latest updates OneHotEncoder accepts strings for categorical variables.
Example dataframe you can test the code on:
# Small example frame: five string-valued feature columns, one integer
# column and a binary target, three rows each (row labels 0, 1, 2).
_example_values = {
    'col1': ['ab321', 'ab568', 'mkld78'],
    'col2': ['Red', 'Blue', 'Green'],
    'col3': ['First', 'Second', 'Third'],
    'col4': ['Wisconsin', 'California', 'Portland'],
    'col5': ['a', 'f', 'g'],
    'col6': [1, 2, 3],
    'target': [0, 0, 1],
}
# dict(enumerate(...)) rebuilds the {row_label: value} mapping for each
# column, so the frame keeps the explicit 0..2 row labels.
data = pd.DataFrame(
    {name: dict(enumerate(values)) for name, values in _example_values.items()}
)
Here's what I tried:
I have tried using both the index values and the names of the columns to solve the error:
#Index
# OneHotEncoding
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd
# Load data
# `test` is read here but not used anywhere in this snippet.
train = pd.read_csv("data_train.csv")
test = pd.read_csv("data_test.csv")
# Split the label column off from the feature columns.
X = train.drop(["target"], axis=1)
y = train["target"]
# Filter categorical columns
categorical_columns = ["col1", "col2", "col3", "col4", "col5"]
# Positional indexes of the feature columns whose dtype is object.
categorical_indexes = np.where(X.dtypes == 'object')[0]
# OHE
# NOTE: categorical_features is given the column *names* here. The traceback
# further down shows the encoder using this value to index a boolean mask
# (sel[np.asarray(self.categorical_features)] = True), which raises
# "arrays used as indices must be of integer (or boolean) type".
ohe = OneHotEncoder(categorical_features=categorical_columns)
# reshape data
# Each selected column is reshaped to (n_rows, 1) and encoded on its own;
# this is the statement the traceback points at.
for index in categorical_indexes:
    X.iloc[:, index] = ohe.fit_transform(X.iloc[:, index].values.reshape(-1, 1))
#Column Names
# OneHotEncoding
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
# Load data
# `test` is read here but not used anywhere in this snippet.
train = pd.read_csv("data_train.csv")
test = pd.read_csv("data_test.csv")
# Split the label column off from the feature columns.
X = train.drop(["target"], axis=1)
y = train["target"]
# Filter categorical columns
categorical_columns = ["col1", "col2", "col3", "col4", "col5"]
# Computed as in the index-based attempt, but not used by the loop below.
categorical_indexes = np.where(X.dtypes == 'object')[0]
# OHE
# NOTE: categorical_features is given the column *names*, exactly as in the
# index-based attempt, so this variant presumably fails with the same
# IndexError (the pasted traceback is from the index-based attempt).
ohe = OneHotEncoder(categorical_features=categorical_columns)
# reshape data
# Each named column is reshaped to (n_rows, 1) and encoded on its own.
for column in categorical_columns:
    X[column] = ohe.fit_transform(X[column].values.reshape(-1, 1))
Error Traceback:
IndexError Traceback (most recent call last)
<ipython-input-86-17c86bf649e2> in <module>
11 # reshape data
12 for index in categorical_indexes:
---> 13 X.iloc[:,index] = ohe.fit_transform(X.iloc[:,index].values.reshape(-1,1))
14
c:\users\m\appdata\local\programs\python\python37\lib\site-packages\sklearn\preprocessing\_encoders.py in fit_transform(self, X, y)
622 self._validate_keywords()
623
--> 624 self._handle_deprecations(X)
625
626 if self._legacy_mode:
c:\users\m\appdata\local\programs\python\python37\lib\site-packages\sklearn\preprocessing\_encoders.py in _handle_deprecations(self, X)
453 n_features = X.shape[1]
454 sel = np.zeros(n_features, dtype=bool)
--> 455 sel[np.asarray(self.categorical_features)] = True
456 if sum(sel) == 0:
457 self.categories_ = []
IndexError: arrays used as indices must be of integer (or boolean) type
You are missing the concept behind OneHotEncoder. The way to use it is to fit it on the whole training set.
Use this:
# OneHotEncoding
# Imports come first so that `pd` is defined before the example frame is built.
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Example frame from the question: five string columns, one integer column
# and a binary target, three rows.
data = pd.DataFrame({'col1': {0: 'ab321', 1: 'ab568', 2: 'mkld78'},
                     'col2': {0: 'Red', 1: 'Blue', 2: 'Green'},
                     'col3': {0: 'First', 1: 'Second', 2: 'Third'},
                     'col4': {0: 'Wisconsin', 1: 'California', 2: 'Portland'},
                     'col5': {0: 'a', 1: 'f', 2: 'g'},
                     'col6': {0: 1, 1: 2, 2: 3},
                     'target': {0: 0, 1: 0, 2: 1}})

# The first two rows serve as the training set, the remaining row as the test set.
train = data.iloc[0:2, :]
test = data.iloc[2:, :]
# Split the label column off from the feature columns.
X = train.drop(["target"], axis=1)
y = train["target"]

# Filter categorical columns
# Both names are carried over from the question; the encoder call below does
# not use either of them.
categorical_columns = ["col1", "col2", "col3", "col4", "col5"]
categorical_indexes = np.where(X.dtypes == 'object')[0]

# OHE
# Fit a single encoder on the whole training frame in one call instead of
# looping over the columns one at a time.
ohe = OneHotEncoder()
X_ = ohe.fit_transform(X)
# All six feature columns (including the integer col6) go through the
# encoder; each holds two distinct values in the two training rows, which
# gives the 12 output columns reported below.
X_
# <2x12 sparse matrix of type '<type 'numpy.float64'>'
# with 12 stored elements in Compressed Sparse Row format>