I have a list of variables whose values are encoded in a way that throws Pandas off. For example, I have a column named "Alley", and one of its possible values is `NA`, which stands for "No Alley". However, Pandas interprets this as `NaN`. To get around this problem, I am replacing all `NaN` values with an arbitrary symbol like `XX`. These variables don't actually have null/missing values; their values are just being misinterpreted by Pandas. I am gathering them in a list:
# Columns in which pandas reads the legitimate "NA" category as NaN.
na_data = [
    'Alley',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PoolQC', 'Fence', 'MiscFeature',
]
Then I replace each `NaN` reading with `XX`:
# Replace the NaN that pandas produced for the "NA" category with a placeholder,
# one column at a time. The loop body must be indented to be valid Python.
for i in na_data:
    df[i] = df[i].fillna('XX')
This was the old error I was getting:
Traceback (most recent call last):
File "C:\Users\security\AppData\Roaming\Python\Python37\site-packages\pandas\core\indexes\base.py", line 2657, in get_loc
return self._engine.get_loc(key)
File "pandas\_libs\index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 129, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index_class_helper.pxi", line 91, in pandas._libs.index.Int64Engine._check_type
KeyError: 'Alley'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:/Users/security/Downloads/AP/Boston-Kaggle/Model.py", line 67, in <module>
print(feature_encoding(train, categorical_columns))
File "C:/Users/security/Downloads/AP/Boston-Kaggle/Model.py", line 50, in feature_encoding
df[i] = df[i].fillna('XX')
File "C:\Users\security\AppData\Roaming\Python\Python37\site-packages\pandas\core\frame.py", line 2927, in __getitem__
indexer = self.columns.get_loc(key)
File "C:\Users\security\AppData\Roaming\Python\Python37\site-packages\pandas\core\indexes\base.py", line 2659, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\_libs\index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 129, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index_class_helper.pxi", line 91, in pandas._libs.index.Int64Engine._check_type
KeyError: 'Alley'
The variable `Alley` definitely exists in the dataset! I copy/pasted the name from the dataset just for good measure.
This is my entire code (updated):
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
# Load the training and test sets as DataFrames straight from the GitHub repository.
train = pd.read_csv("https://raw.githubusercontent.com/oo92/Boston-Kaggle/master/train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/oo92/Boston-Kaggle/master/test.csv")
# Columns that get one-hot encoded further down.
categorical_columns = [
    'MSSubClass', 'MSZoning', 'LotShape', 'LandContour',
    'LotConfig', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'Foundation', 'Heating',
    'Electrical', 'Functional', 'GarageType', 'PavedDrive',
    'Fence', 'MiscFeature', 'SaleType', 'SaleCondition',
    'Street', 'CentralAir', 'Utilities', 'ExterQual',
    'LandSlope', 'ExterCond', 'HeatingQC', 'KitchenQual',
]

# Presumably the ordinal (quality / condition) ratings; not referenced by the
# rest of the visible script.
ranked_columns = [
    'Utilities', 'LandSlope', 'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
    'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'FireplaceQu',
    'GarageQual', 'GarageCond', 'PoolQC', 'OverallQual',
    'OverallCond',
]

# Numeric measurements; not referenced by the rest of the visible script.
# NOTE(review): '2ndFlrSf', 'Bedroom' and 'Kitchen' may not match the CSV
# headers exactly -- verify against train.columns before using this list.
numerical_columns = [
    'LotArea', 'LotFrontage', 'YearBuilt', 'YearRemodAdd',
    'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF', '1stFlrSF', '2ndFlrSf', 'LowQualFinSF',
    'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
    'HalfBath', 'Bedroom', 'Kitchen', 'TotRmsAbvGrd',
    'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold',
    'YrSold',
]

# Columns in which pandas reads the legitimate "NA" category as NaN.
na_data = [
    'Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType',
    'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC',
    'Fence', 'MiscFeature',
]
# In these columns NaN stands for the "NA" category, so swap it for a
# placeholder string. The loop body must be indented to be valid Python.
for i in na_data:
    train[i] = train[i].fillna('XX')

# Replaced the NaN values of LotFrontage and MasVnrArea with the mean of their column
train['LotFrontage'] = train['LotFrontage'].fillna(train['LotFrontage'].mean())
train['MasVnrArea'] = train['MasVnrArea'].fillna(train['MasVnrArea'].mean())
# Columns to encode: the categorical ones plus the columns whose NaN's were filled with 'XX'.
# NOTE(review): 'GarageType', 'Fence' and 'MiscFeature' appear in both lists, so they
# are selected (and therefore encoded) twice here.
concatenated_list = categorical_columns + na_data
# take one-hot encoding
OHE_sdf = pd.get_dummies(train[concatenated_list])
# drop the old categorical column from original df
# NOTE(review): only categorical_columns is dropped. The na_data columns that are not
# in that list (e.g. 'Alley', 'BsmtQual') stay in train still holding 'XX' strings,
# which matches the "could not convert string to float: 'XX'" error raised later.
# Dropping concatenated_list instead looks like the intended behaviour -- confirm.
train.drop(columns = categorical_columns, axis = 1, inplace = True)
# attach one-hot encoded columns to original data frame
train = pd.concat([train, OHE_sdf], axis = 1, ignore_index = False)
# Hold out 30% of the rows for testing, with a fixed seed.
# NOTE(review): the feature matrix is the whole of train, so it still contains the
# 'SalePrice' column that is also passed as the target.
x_train, x_test, y_train, y_test = train_test_split(train, train['SalePrice'], test_size = 0.3, random_state = 42)
# NOTE(review): 300 * "mean" is Python string repetition ("meanmeanmean..."), not a
# scaled threshold. A scaling factor is presumably meant to go inside the string,
# e.g. "1.25*mean" -- confirm the intended factor.
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100), threshold = 300 * "mean")
sel.fit(x_train, y_train)
# The result of this call is discarded; the mask is fetched again on the next line.
sel.get_support()
# Column labels of the features kept by the selector.
selected_feat = x_train.columns[sel.get_support()]
# NOTE(review): selected_feat is an Index rather than a function, so the trailing ()
# will presumably raise a TypeError -- confirm.
print(selected_feat())
This is the new error:
Traceback (most recent call last):
File "/home/onur/Documents/Boston-Kaggle/Model.py", line 49, in <module>
sel.fit(x_train, y_train)
File "/opt/anaconda/envs/lib/python3.7/site-packages/sklearn/feature_selection/from_model.py", line 196, in fit
self.estimator_.fit(X, y, **fit_params)
File "/opt/anaconda/envs/lib/python3.7/site-packages/sklearn/ensemble/forest.py", line 249, in fit
X = check_array(X, accept_sparse="csc", dtype=DTYPE)
File "/opt/anaconda/envs/lib/python3.7/site-packages/sklearn/utils/validation.py", line 496, in check_array
array = np.asarray(array, dtype=dtype, order=order)
File "/opt/anaconda/envs/lib/python3.7/site-packages/numpy/core/_asarray.py", line 85, in asarray
return array(a, dtype, copy=False, order=order)
ValueError: could not convert string to float: 'XX'
You're concatenating the data on the wrong axis:
# Call as originally posted in the question
df = pd.concat([df, OHE_sdf], axis = 1, ignore_index = True)
# Should be
df = pd.concat([df, OHE_sdf], axis = 0, ignore_index = True)
# NOTE(review): axis = 0 stacks the two frames as rows rather than placing the encoded
# columns side by side. The Int64Engine KeyError in the traceback suggests the column
# labels had been replaced by integers, which is what ignore_index = True does along
# axis = 1; keeping axis = 1 with ignore_index = False may be the actual fix -- confirm.
However, this will cause another error to be thrown, because you have already one-hot encoded some of the columns listed in `na_data`. For instance, `GarageType` has been encoded into multiple columns, one for each potential value, so it no longer exists and its NaN values can't be replaced.
Edit:
I've updated several parts of the question code to ensure that it runs in its entirety.
First, we need to import all the libraries we will be using (note the addition of numpy):
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
import numpy as np
Second, we need to get the data from the source:
# Load the training and test sets as DataFrames straight from the GitHub repository.
train = pd.read_csv("https://raw.githubusercontent.com/oo92/Boston-Kaggle/master/train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/oo92/Boston-Kaggle/master/test.csv")
Now we will remove all the NaN's from the data set
# Count the NaN's in each column (a Series indexed by column name)
nanCounts = train.isna().sum()
# Total number of NaN's, printed to check that this step is doing something
nanTotal = nanCounts.sum()
print('NaN\'s found: ', nanTotal)
# Names of the columns that contain at least one NaN. Selecting by label avoids
# positional integer look-ups on a label-indexed Series, which recent pandas
# versions deprecate.
nanCols = nanCounts[nanCounts > 0].index.tolist()
# Fill every column known to contain NaN's according to its dtype
for col in nanCols:
    if train[col].dtype == 'float64':
        # Floating point column: replace the NaN's with the mean of the column
        train[col] = train[col].fillna(train[col].mean())
    elif train[col].dtype == 'object':
        # Text (object) column: replace the NaN's with the placeholder XX
        train[col] = train[col].fillna('XX')
# Recount and reprint the total number of NaN's
nanTotal = train.isna().sum().sum()
print('NaN\'s after removal: ', nanTotal)
Now that there are no NaN's in the dataset it is possible to assemble a list of the categorical data
# Collect every column stored with the object dtype (text). Pairing each column
# name with its dtype avoids positional indexing into the label-indexed dtypes
# Series, which recent pandas versions deprecate.
categorical = [col for col, dtype in train.dtypes.items() if dtype == 'object']
# Print out the list of categorical features
print('Categorical columns are: \n', categorical)
Now the code is very similar to the original with a few minor changes due to variable changes
# take one-hot encoding
OHE_sdf = pd.get_dummies(train[categorical])
# drop the old categorical column from original df
train.drop(columns = categorical, inplace = True)
# attach one-hot encoded columns to original data frame
train = pd.concat([train, OHE_sdf], axis = 1, ignore_index = False)
print('splitting dataset')
# Keep the target out of the feature matrix: if SalePrice stays among the
# features, the selector is ranking the answer itself.
features = train.drop(columns = ['SalePrice'])
x_train, x_test, y_train, y_test = train_test_split(features, train['SalePrice'], test_size = 0.3, random_state = 42)
print('Selecting features')
# Note that here i changed the threshold so that it would actually show some features to use
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100), threshold = '1.25*mean')
sel.fit(x_train, y_train)
# Also just straight up save the boolean array it will be quicker and i prefer the formatting this way
selected = sel.get_support()
# Print the boolean array of selected features
print(selected)
# Print the finally selected features. The mask has one entry per column of
# x_train, so it must be applied to x_train.columns (train also has SalePrice).
print(x_train.columns[selected])
All together it looks like
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
import numpy as np
# Load the training and test sets as DataFrames straight from the GitHub repository.
train = pd.read_csv("https://raw.githubusercontent.com/oo92/Boston-Kaggle/master/train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/oo92/Boston-Kaggle/master/test.csv")

# Count the NaN's per column and in total.
nanCounts = train.isna().sum()
nanTotal = nanCounts.sum()
print('NaN\'s found: ', nanTotal)

# Names of the columns that contain at least one NaN. Selecting by label avoids
# positional integer look-ups on a label-indexed Series, which recent pandas
# versions deprecate.
nanCols = nanCounts[nanCounts > 0].index.tolist()
for col in nanCols:
    if train[col].dtype == 'float64':
        # Floating point column: replace the NaN's with the mean of the column
        train[col] = train[col].fillna(train[col].mean())
    elif train[col].dtype == 'object':
        # Text (object) column: replace the NaN's with the placeholder XX
        train[col] = train[col].fillna('XX')

nanTotal = train.isna().sum().sum()
print('NaN\'s after removal: ', nanTotal)

# Every column stored with the object dtype is treated as categorical.
categorical = [col for col, dtype in train.dtypes.items() if dtype == 'object']
print('Categorical columns are: \n', categorical)

# take one-hot encoding
OHE_sdf = pd.get_dummies(train[categorical])
# drop the old categorical column from original df
train.drop(columns = categorical, inplace = True)
# attach one-hot encoded columns to original data frame
train = pd.concat([train, OHE_sdf], axis = 1, ignore_index = False)

print('splitting dataset')
# Keep the target out of the feature matrix: if SalePrice stays among the
# features, the selector is ranking the answer itself.
features = train.drop(columns = ['SalePrice'])
x_train, x_test, y_train, y_test = train_test_split(features, train['SalePrice'], test_size = 0.3, random_state = 42)

print('Selecting features')
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100), threshold = '1.25*mean')
sel.fit(x_train, y_train)
# Boolean mask with one entry per column of x_train.
selected = sel.get_support()
print(selected)
# Apply the mask to x_train.columns, since train also contains SalePrice.
print(x_train.columns[selected])