Search code examples
machine-learning python pandas kaggle

Dataset throwing KeyError while looping through a list of variables


I have a list of variables with values encoded in a way which throws Pandas off. For example: I have a column named "Alley" and it has a list of values, one of which is NA, which stands for "No Alley". However, Pandas interprets this as NaN. To get around this problem, I am encoding all NaN values with an arbitrary symbol like XX. These variables don't actually have null/missing values. These are just variables whose values are being misinterpreted by Pandas. I am gathering them in a list:

# Columns in which the category "NA" is a legitimate value that pandas loads as NaN
na_data = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu',
           'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']

And replacing each NaN reading with XX:

# Give the NaN entries of each listed column the explicit placeholder label 'XX'
for i in na_data:
    df[i] = df[i].fillna('XX')

This was the old error I was getting:

Traceback (most recent call last):
  File "C:\Users\security\AppData\Roaming\Python\Python37\site-packages\pandas\core\indexes\base.py", line 2657, in get_loc
    return self._engine.get_loc(key)
  File "pandas\_libs\index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\index.pyx", line 129, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\index_class_helper.pxi", line 91, in pandas._libs.index.Int64Engine._check_type
KeyError: 'Alley'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:/Users/security/Downloads/AP/Boston-Kaggle/Model.py", line 67, in <module>
    print(feature_encoding(train, categorical_columns))
  File "C:/Users/security/Downloads/AP/Boston-Kaggle/Model.py", line 50, in feature_encoding
    df[i] = df[i].fillna('XX')
  File "C:\Users\security\AppData\Roaming\Python\Python37\site-packages\pandas\core\frame.py", line 2927, in __getitem__
    indexer = self.columns.get_loc(key)
  File "C:\Users\security\AppData\Roaming\Python\Python37\site-packages\pandas\core\indexes\base.py", line 2659, in get_loc
    return self._engine.get_loc(self._maybe_cast_indexer(key))
  File "pandas\_libs\index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\index.pyx", line 129, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\index_class_helper.pxi", line 91, in pandas._libs.index.Int64Engine._check_type
KeyError: 'Alley'

The variable Alley definitely exists in the dataset! I copy/pasted the name from the dataset just for good measure.

This is my entire code (updated):

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

# Load the train and test CSVs from GitHub (requires network access).
# test is not referenced again in this listing.
train = pd.read_csv("https://raw.githubusercontent.com/oo92/Boston-Kaggle/master/train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/oo92/Boston-Kaggle/master/test.csv")

# Columns that are one-hot encoded further down and then dropped from train
categorical_columns = ['MSSubClass', 'MSZoning', 'LotShape', 'LandContour', 'LotConfig', 'Neighborhood', 'Condition1',
                       'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
                       'Foundation', 'Heating', 'Electrical', 'Functional', 'GarageType', 'PavedDrive', 'Fence',
                       'MiscFeature', 'SaleType', 'SaleCondition', 'Street', 'CentralAir', 'Utilities', 'ExterQual',
                       'LandSlope', 'ExterCond', 'HeatingQC', 'KitchenQual']

# Not referenced again in this listing
ranked_columns = ['Utilities', 'LandSlope', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
                  'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond',
                  'PoolQC', 'OverallQual', 'OverallCond']

# Not referenced again in this listing.
# NOTE(review): '2ndFlrSf', 'Bedroom' and 'Kitchen' may not match the CSV header exactly -
# confirm the spelling before indexing train with this list.
numerical_columns = ['LotArea', 'LotFrontage', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
                     'BsmtUnfSF','TotalBsmtSF', '1stFlrSF', '2ndFlrSf', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
                     'BsmtHalfBath', 'FullBath', 'HalfBath', 'Bedroom', 'Kitchen', 'TotRmsAbvGrd', 'Fireplaces',
                     'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
                     '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']

# Columns whose NaN entries are replaced with the placeholder 'XX' below
na_data = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu',
           'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']

# Replace the NaN entries of every na_data column with the placeholder label 'XX'
for i in na_data:
    train[i] = train[i].fillna('XX')

#Replaced the NaN values of LotFrontage and MasVnrArea with the mean of their column
train['LotFrontage'] = train['LotFrontage'].fillna(train['LotFrontage'].mean())
train['MasVnrArea'] = train['MasVnrArea'].fillna(train['MasVnrArea'].mean())

# NOTE(review): 'GarageType', 'Fence' and 'MiscFeature' appear in both lists, so
# train[concatenated_list] selects those columns twice.
concatenated_list = categorical_columns + na_data

# take one-hot encoding
OHE_sdf = pd.get_dummies(train[concatenated_list])

# drop the old categorical column from original df
# NOTE(review): only categorical_columns are dropped here. The na_data columns that are not
# also in categorical_columns (e.g. 'Alley', 'BsmtQual') stay in train and still hold the
# string 'XX', which is consistent with the "could not convert string to float: 'XX'"
# error reported below.
train.drop(columns = categorical_columns, axis = 1, inplace = True)

# attach one-hot encoded columns to original data frame
train = pd.concat([train, OHE_sdf], axis = 1, ignore_index = False)

# NOTE(review): train still contains 'SalePrice', so the target is also passed in as a feature.
x_train, x_test, y_train, y_test = train_test_split(train, train['SalePrice'], test_size = 0.3, random_state = 42)

# NOTE(review): 300 * "mean" is Python string repetition ("meanmean..."), not a scaled
# threshold; the answer below passes a single string such as '1.25*mean' instead.
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100), threshold = 300 * "mean")
sel.fit(x_train, y_train)
sel.get_support()

# Keep only the column labels that the selector's boolean mask marks as selected
selected_feat = x_train.columns[sel.get_support()]

# NOTE(review): selected_feat is the result of indexing x_train.columns, and the trailing ()
# tries to call it - presumably print(selected_feat) was intended.
print(selected_feat())

This is the new error:

Traceback (most recent call last):
  File "/home/onur/Documents/Boston-Kaggle/Model.py", line 49, in <module>
    sel.fit(x_train, y_train)
  File "/opt/anaconda/envs/lib/python3.7/site-packages/sklearn/feature_selection/from_model.py", line 196, in fit
    self.estimator_.fit(X, y, **fit_params)
  File "/opt/anaconda/envs/lib/python3.7/site-packages/sklearn/ensemble/forest.py", line 249, in fit
    X = check_array(X, accept_sparse="csc", dtype=DTYPE)
  File "/opt/anaconda/envs/lib/python3.7/site-packages/sklearn/utils/validation.py", line 496, in check_array
    array = np.asarray(array, dtype=dtype, order=order)
  File "/opt/anaconda/envs/lib/python3.7/site-packages/numpy/core/_asarray.py", line 85, in asarray
    return array(a, dtype, copy=False, order=order)
ValueError: could not convert string to float: 'XX'

Solution

  • You're concatenating the data on the wrong axis

    # NOTE(review): ignore_index = True relabels the concatenation axis as 0..n-1. With axis = 1
    # that axis is the columns, which would explain the Int64Engine KeyError for 'Alley' in the
    # traceback above - confirm against the original question code.
    df = pd.concat([df, OHE_sdf], axis = 1, ignore_index = True)
    # Should be
    # NOTE(review): axis = 0 appends OHE_sdf underneath df as extra rows; the complete listing
    # later in this answer uses axis = 1 with ignore_index = False instead.
    df = pd.concat([df, OHE_sdf], axis = 0, ignore_index = True)
    

    However, this will cause another error to be thrown: you have already one-hot encoded some of the columns listed in na_data. For instance, GarageType has been encoded into multiple columns, one for each possible value, so the original column no longer exists and its NaN values can't be replaced.

    Edit:

    I've updated several parts of the question code to ensure that it runs in its entirety.

    Firstly we need to import all the libraries we will be using, note the addition of numpy

    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import SelectFromModel
    from sklearn.model_selection import train_test_split
    import numpy as np
    

    secondly we need to get the data from the source

    # Download the training and test CSVs from GitHub (requires network access)
    train = pd.read_csv("https://raw.githubusercontent.com/oo92/Boston-Kaggle/master/train.csv")
    # test is loaded here but none of the snippets below read it
    test = pd.read_csv("https://raw.githubusercontent.com/oo92/Boston-Kaggle/master/test.csv")
    

    Now we will remove all the NaN's from the data set

    # Count the NaN's in every column; the result is a Series indexed by column name
    nanCounts = train.isna().sum()
    # Total number of NaN's, printed so the effect of the clean-up below is visible
    nanTotal = nanCounts.sum()
    print('NaN\'s found: ', nanTotal)
    
    # Names of the columns that contain at least one NaN. Selecting with a boolean mask avoids
    # integer-position lookups such as nanCounts[i], which pandas deprecates on a Series whose
    # index holds labels rather than integers
    nanCols = nanCounts[nanCounts > 0].index.tolist()
    
    # Fill each affected column according to the kind of data it holds
    for col in nanCols:
        if pd.api.types.is_numeric_dtype(train[col]):
            # Numeric column: replace the missing entries with the mean of the column
            train[col] = train[col].fillna(train[col].mean())
        else:
            # Text column: 'XX' becomes an explicit placeholder category
            train[col] = train[col].fillna('XX')
    
    # Recount on the cleaned frame to confirm that no NaN's are left
    nanTotal = train.isna().sum().sum()
    print('NaN\'s after removal: ', nanTotal)
    

    Now that there are no NaN's in the dataset it is possible to assemble a list of the categorical data

    # Collect the names of all object-dtype (text) columns; these are one-hot encoded next.
    # select_dtypes does this in one call and avoids positional lookups such as train.dtypes[i],
    # which pandas deprecates on a Series whose index holds labels rather than integers
    categorical = train.select_dtypes(include = 'object').columns.tolist()
    # Print out the list of categorical features
    print('Categorical columns are: \n', categorical)
    

    Now the code is very similar to the original with a few minor changes due to variable changes

    # take one-hot encoding
    OHE_sdf = pd.get_dummies(train[categorical])
    
    # drop the old categorical columns from original df (columns= already names the axis)
    train.drop(columns = categorical, inplace = True)
    
    # attach one-hot encoded columns to original data frame
    train = pd.concat([train, OHE_sdf], axis = 1, ignore_index = False)
    
    print('splitting dataset')
    # Keep the target out of the feature matrix; otherwise SalePrice itself is offered to the
    # model as a feature and distorts the importances used for selection
    features = train.drop(columns = 'SalePrice')
    x_train, x_test, y_train, y_test = train_test_split(features, train['SalePrice'], test_size = 0.3, random_state = 42)
    
    print('Selecting features')
    # Note that here i changed the threshold so that it would actually show some features to use
    # NOTE(review): SalePrice looks like a continuous target, so RandomForestRegressor may be the
    # better estimator here - confirm before relying on the selected features
    sel = SelectFromModel(RandomForestClassifier(n_estimators = 100), threshold = '1.25*mean')
    sel.fit(x_train, y_train)
    # Also just straight up save the boolean array it will be quicker and i prefer the formatting this way
    selected = sel.get_support()
    
    # Print the boolean array of selected features
    print(selected)
    # Print the finally selected features; the mask has one entry per column of x_train, so it is
    # applied to x_train.columns rather than to train.columns (which still includes SalePrice)
    print(x_train.columns[selected])
    

    All together it looks like

    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import SelectFromModel
    from sklearn.model_selection import train_test_split
    import numpy as np
    
    # Download the training and test CSVs from GitHub (requires network access);
    # test is loaded but not used by the rest of this script
    train = pd.read_csv("https://raw.githubusercontent.com/oo92/Boston-Kaggle/master/train.csv")
    test = pd.read_csv("https://raw.githubusercontent.com/oo92/Boston-Kaggle/master/test.csv")
    
    # Count the NaN's per column (Series indexed by column name) and in total
    nanCounts = train.isna().sum()
    nanTotal = nanCounts.sum()
    print('NaN\'s found: ', nanTotal)
    
    # Columns with at least one NaN, selected by boolean mask instead of the deprecated
    # integer-position lookup nanCounts[i]
    nanCols = nanCounts[nanCounts > 0].index.tolist()
    
    for col in nanCols:
        if pd.api.types.is_numeric_dtype(train[col]):
            # Numeric column: replace the missing entries with the mean of the column
            train[col] = train[col].fillna(train[col].mean())
        else:
            # Text column: 'XX' becomes an explicit placeholder category
            train[col] = train[col].fillna('XX')
    
    # Recount on the cleaned frame to confirm that no NaN's are left
    nanTotal = train.isna().sum().sum()
    
    print('NaN\'s after removal: ', nanTotal)
    
    # Names of all object-dtype (text) columns; these are one-hot encoded below
    categorical = train.select_dtypes(include = 'object').columns.tolist()
    
    print('Categorical columns are: \n', categorical)
    
    # take one-hot encoding
    OHE_sdf = pd.get_dummies(train[categorical])
    
    # drop the old categorical columns from original df (columns= already names the axis)
    train.drop(columns = categorical, inplace = True)
    
    # attach one-hot encoded columns to original data frame
    train = pd.concat([train, OHE_sdf], axis = 1, ignore_index = False)
    
    print('splitting dataset')
    # Keep the target out of the feature matrix; otherwise SalePrice itself is offered to the
    # model as a feature and distorts the importances used for selection
    features = train.drop(columns = 'SalePrice')
    x_train, x_test, y_train, y_test = train_test_split(features, train['SalePrice'], test_size = 0.3, random_state = 42)
    
    print('Selecting features')
    # NOTE(review): SalePrice looks like a continuous target, so RandomForestRegressor may be the
    # better estimator here - confirm before relying on the selected features
    sel = SelectFromModel(RandomForestClassifier(n_estimators = 100), threshold = '1.25*mean')
    sel.fit(x_train, y_train)
    selected = sel.get_support()
    
    print(selected)
    # The mask has one entry per column of x_train, so apply it to x_train.columns rather than
    # to train.columns (which still includes SalePrice)
    print(x_train.columns[selected])