Search code examples
python · linear-regression · statsmodels

statsmodels.api returning MissingDataError: exog contains inf or nans when trying to fit multivariate regression


I am trying to fit a multivariate linear regression model with statsmodels.api, but I get the error `MissingDataError: exog contains inf or nans`. I have checked for NaN and inf values and found none. How is this possible? Why am I getting this error?

CODE

import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np

df = pd.read_csv('clean_df.csv')
x_multi = df.drop('price', axis=1) #feature variables.
x_multi_cons = sm.add_constant(x_multi) # add a column of constants (the intercept term).

I checked all the exog variables for na values and found none.

x_multi_cons.isna().sum()

const                       0
crime_rate                  0
resid_area                  0
air_qual                    0
room_num                    0
age                         0
teachers                    0
poor_prop                   0
n_hos_beds                  8
n_hot_rooms                 0
rainfall                    0
parks                       0
avg_dist                    0
airport_YES                 0
waterbody_Lake              0
waterbody_Lake and River    0
waterbody_River             0
dtype: int64

I also checked the exog variables for inf values and found none.

np.isinf(x_multi_cons).sum()
const                       0
crime_rate                  0
resid_area                  0
air_qual                    0
room_num                    0
age                         0
teachers                    0
poor_prop                   0
n_hos_beds                  0
n_hot_rooms                 0
rainfall                    0
parks                       0
avg_dist                    0
airport_YES                 0
waterbody_Lake              0
waterbody_Lake and River    0
waterbody_River             0
dtype: int64

Here I am fitting the model.

 y_multi = df['price'] # Dependent variable.
 lm_multi = sm.OLS(y_multi, x_multi_cons).fit() 

But I am still getting the Error: "MissingDataError: exog contains inf or nans". How is this possible?

ERROR: 
MissingDataError                          Traceback (most recent call last)
<ipython-input-67-ca6d2e9ba2c0> in <module>
----> 1 lm_multi = sm.OLS(y_multi, x_multi_cons).fit()

~/anaconda3/envs/python3/lib/python3.6/site-packages/statsmodels/regression/linear_model.py in __init__(self, endog, exog, missing, hasconst, **kwargs)
    871                  **kwargs):
    872         super(OLS, self).__init__(endog, exog, missing=missing,
--> 873                                   hasconst=hasconst, **kwargs)
    874         if "weights" in self._init_keys:
    875             self._init_keys.remove("weights")

~/anaconda3/envs/python3/lib/python3.6/site-packages/statsmodels/regression/linear_model.py in __init__(self, endog, exog, weights, missing, hasconst, **kwargs)
    702             weights = weights.squeeze()
    703         super(WLS, self).__init__(endog, exog, missing=missing,
--> 704                                   weights=weights, hasconst=hasconst, **kwargs)
    705         nobs = self.exog.shape[0]
    706         weights = self.weights

~/anaconda3/envs/python3/lib/python3.6/site-packages/statsmodels/regression/linear_model.py in __init__(self, endog, exog, **kwargs)
    188     """
    189     def __init__(self, endog, exog, **kwargs):
--> 190         super(RegressionModel, self).__init__(endog, exog, **kwargs)
    191         self._data_attr.extend(['pinv_wexog', 'weights'])
    192 

~/anaconda3/envs/python3/lib/python3.6/site-packages/statsmodels/base/model.py in __init__(self, endog, exog, **kwargs)
    235 
    236     def __init__(self, endog, exog=None, **kwargs):
--> 237         super(LikelihoodModel, self).__init__(endog, exog, **kwargs)
    238         self.initialize()
    239 

~/anaconda3/envs/python3/lib/python3.6/site-packages/statsmodels/base/model.py in __init__(self, endog, exog, **kwargs)
     76         hasconst = kwargs.pop('hasconst', None)
     77         self.data = self._handle_data(endog, exog, missing, hasconst,
---> 78                                       **kwargs)
     79         self.k_constant = self.data.k_constant
     80         self.exog = self.data.exog

~/anaconda3/envs/python3/lib/python3.6/site-packages/statsmodels/base/model.py in _handle_data(self, endog, exog, missing, hasconst, **kwargs)
     99 
    100     def _handle_data(self, endog, exog, missing, hasconst, **kwargs):
--> 101         data = handle_data(endog, exog, missing, hasconst, **kwargs)
    102         # kwargs arrays could have changed, easier to just attach here
    103         for key in kwargs:

~/anaconda3/envs/python3/lib/python3.6/site-packages/statsmodels/base/data.py in handle_data(endog, exog, missing, hasconst, **kwargs)
    671     klass = handle_data_class_factory(endog, exog)
    672     return klass(endog, exog=exog, missing=missing, hasconst=hasconst,
--> 673                  **kwargs)

~/anaconda3/envs/python3/lib/python3.6/site-packages/statsmodels/base/data.py in __init__(self, endog, exog, missing, hasconst, **kwargs)
     85         self.const_idx = None
     86         self.k_constant = 0
---> 87         self._handle_constant(hasconst)
     88         self._check_integrity()
     89         self._cache = {}

~/anaconda3/envs/python3/lib/python3.6/site-packages/statsmodels/base/data.py in _handle_constant(self, hasconst)
    131             exog_max = np.max(self.exog, axis=0)
    132             if not np.isfinite(exog_max).all():
--> 133                 raise MissingDataError('exog contains inf or nans')
    134             exog_min = np.min(self.exog, axis=0)
    135             const_idx = np.where(exog_max == exog_min)[0].squeeze()

MissingDataError: exog contains inf or nans

Solution

  • I am not sure how you concluded that there are no NaN values. Look at the output of your own check:

    x_multi_cons.isna().sum()
    
    [...]
    n_hos_beds                  8
    [...]
    

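    This means there are 8 missing values in n_hos_beds, which is exactly what the error is complaining about. If it doesn't hurt your model, just drop the rows containing NaNs at the start, right after reading the CSV and before building x_multi and y_multi, so that both are derived from the cleaned frame: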

    df = df.dropna()