While performing Step 5 of Exercise: Categorical Variables on Kaggle Learn, I got the ValueError: Input contains NaN, infinity or a value too large for dtype('float32')
during the predict phase with the test set.
Full jupyter notebook available here. Full code used displayed at the end of the post.
The code aims to prepare a submission dataset for the "Housing Prices Competition for Kaggle Learn Users".
The problem is to pre-process the X_test
dataset that contains the test set. At first I've used the SimpleImputer
with a most_frequent
strategy. Then performed a one hot encoding for categorical variables of the dataset.
I found that betwen the X_train
(and X_valid
) datasets and the X_test
, a few features have different datatypes. Specifically columns ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea']
are of int64
type in the training data (X_train
and X_valid
) while they are of 'float64' in the test data (X_test
). I guess that the problem may be here but I'm unable to solve it. Tried by casting the values with the following chunk
# normalize datatypes columns
#for colName in ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea']:
# OH_X_train[colName] = OH_X_train[colName].astype('float64')
# OH_X_valid[colName] = OH_X_train[colName].astype('float64')
but it didn't work. Any suggestions?
#### DATASETS LOAD ####
import pandas as pd
from sklearn.model_selection import train_test_split
# Read the data
X = pd.read_csv('../input/train.csv', index_col='Id')
X_test = pd.read_csv('../input/test.csv', index_col='Id')
# Remove rows with missing target, separate target from predictors
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice
X.drop(['SalePrice'], axis=1, inplace=True)
# To keep things simple, we'll drop columns with missing values
cols_with_missing = [col for col in X.columns if X[col].isnull().any()]
X.drop(cols_with_missing, axis=1, inplace=True)
X_test.drop(cols_with_missing, axis=1, inplace=True)
# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
train_size=0.8, test_size=0.2,
random_state=0)
#### IMPUTATION OF MISSING VALUES FOR X_TEST ####
from sklearn.impute import SimpleImputer
# All categorical columns
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]
# Columns that will be one-hot encoded
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]
# Fill in the lines below: imputation
my_imputer = SimpleImputer(strategy='most_frequent')
imputed_X_test = pd.DataFrame(my_imputer.fit_transform(X_test))
# Fill in the lines below: imputation removed column names; put them back
imputed_X_test.columns = X_test.columns
#### ONEHOT ENCODING FOR DATA #####
from sklearn.preprocessing import OneHotEncoder
# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(imputed_X_test[low_cardinality_cols]))
# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index
OH_cols_test.index = X_test.index
# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)
num_X_test = X_test.drop(object_cols, axis=1)
# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)
##### BUILD MODEL AND CREATE SUBMISSION ####
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# normalize datatypes columns
#for colName in ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea']:
# OH_X_train[colName] = OH_X_train[colName].astype('float64')
# OH_X_valid[colName] = OH_X_train[colName].astype('float64')
# Build model
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(OH_X_train, y_train)
preds_test = model.predict(OH_X_test)
# Save test predictions to file
#output = pd.DataFrame({'Id': OH_X_test.index,
# 'SalePrice': preds_test})
#output.to_csv('submission.csv', index=False)
And here the full error log:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-2-2d85be0f6b26> in <module>
74 model = RandomForestRegressor(n_estimators=100, random_state=0)
75 model.fit(OH_X_train, y_train)
---> 76 preds_test = model.predict(OH_X_test)
77
78 # Save test predictions to file
/opt/conda/lib/python3.6/site-packages/sklearn/ensemble/forest.py in predict(self, X)
691 check_is_fitted(self, 'estimators_')
692 # Check data
--> 693 X = self._validate_X_predict(X)
694
695 # Assign chunk of trees to jobs
/opt/conda/lib/python3.6/site-packages/sklearn/ensemble/forest.py in _validate_X_predict(self, X)
357 "call `fit` before exploiting the model.")
358
--> 359 return self.estimators_[0]._validate_X_predict(X, check_input=True)
360
361 @property
/opt/conda/lib/python3.6/site-packages/sklearn/tree/tree.py in _validate_X_predict(self, X, check_input)
389 """Validate X whenever one tries to predict, apply, predict_proba"""
390 if check_input:
--> 391 X = check_array(X, dtype=DTYPE, accept_sparse="csr")
392 if issparse(X) and (X.indices.dtype != np.intc or
393 X.indptr.dtype != np.intc):
/opt/conda/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
540 if force_all_finite:
541 _assert_all_finite(array,
--> 542 allow_nan=force_all_finite == 'allow-nan')
543
544 if ensure_min_samples > 0:
/opt/conda/lib/python3.6/site-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan)
54 not allow_nan and not np.isfinite(X).all()):
55 type_err = 'infinity' if allow_nan else 'NaN, infinity'
---> 56 raise ValueError(msg_err.format(type_err, X.dtype))
57 # for object dtype data, we only check for NaNs (GH-13254)
58 elif X.dtype == np.dtype('object') and not allow_nan:
ValueError: Input contains NaN, infinity or a value too large for dtype('float32').
The problem is caused, as stated by the error message, from NaN
values in the OH_X_test
. Those values are introduced in the concat
statement since the indices of the dataframes are mixed up.
I've therefore added 3 fixes in the code below: look at the ###FIX
tag.
#### DATASETS LOAD ####
import pandas as pd
from sklearn.model_selection import train_test_split
# Read the data
X = pd.read_csv('../input/train.csv', index_col='Id')
X_test = pd.read_csv('../input/test.csv', index_col='Id')
# Remove rows with missing target, separate target from predictors
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice
X.drop(['SalePrice'], axis=1, inplace=True)
# To keep things simple, we'll drop columns with missing values
cols_with_missing = [col for col in X.columns if X[col].isnull().any()]
X.drop(cols_with_missing, axis=1, inplace=True)
X_test.drop(cols_with_missing, axis=1, inplace=True)
# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
train_size=0.8, test_size=0.2,
random_state=0)
#### IMPUTATION OF MISSING VALUES FOR X_TEST ####
from sklearn.impute import SimpleImputer
# All categorical columns
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]
# Columns that will be one-hot encoded
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]
# Fill in the lines below: imputation
my_imputer = SimpleImputer(strategy='most_frequent')
imputed_X_test = pd.DataFrame(my_imputer.fit_transform(X_test))
# Fill in the lines below: imputation removed column names; put them back
imputed_X_test.columns = X_test.columns
imputed_X_test.index = X_test.index ###FIX
#### ONEHOT ENCODING FOR DATA #####
from sklearn.preprocessing import OneHotEncoder
# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(imputed_X_test[low_cardinality_cols]))
# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index
OH_cols_test.index = imputed_X_test.index ####FIX
# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)
num_X_test = imputed_X_test.drop(object_cols, axis=1) ####FIX
# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)
##### BUILD MODEL AND CREATE SUBMISSION ####
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# normalize datatypes columns
#for colName in ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea']:
# OH_X_train[colName] = OH_X_train[colName].astype('float64')
# OH_X_valid[colName] = OH_X_train[colName].astype('float64')
# Build model
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(OH_X_train, y_train)
preds_test = model.predict(OH_X_test)
# Save test predictions to file
output = pd.DataFrame({'Id': OH_X_test.index,
'SalePrice': preds_test})
output.to_csv('submission.csv', index=False)