Missing Data
All of the models can handle missing data. For performance reasons, the default is not to do any checking for missing data. If, however, you would like missing data to be handled internally, you can do so by using the missing keyword argument. The default is to do nothing:
In [1]: import statsmodels.api as sm
In [2]: data = sm.datasets.longley.load()
In [3]: data.exog = sm.add_constant(data.exog)
# add in some missing data
In [4]: missing_idx = np.array([False] * len(data.endog))
In [5]: missing_idx[[4, 10, 15]] = True
In [6]: data.endog[missing_idx] = np.nan
In [7]: ols_model = sm.OLS(data.endog, data.exog)
In [8]: ols_fit = ols_model.fit()
In [9]: print(ols_fit.params)
[ nan nan nan nan nan nan nan]
This silently fails and all of the model parameters are NaN, which is probably not what you expected. If you are not sure whether or not you have missing data, you can use missing='raise'. This will raise a MissingDataError during model instantiation if missing data is present, so that you know something is wrong in your input data.
In [10]: ols_model = sm.OLS(data.endog, data.exog, missing='raise')
---------------------------------------------------------------------------
MissingDataError Traceback (most recent call last)
<ipython-input-10-6b74d5399bc3> in <module>()
----> 1 ols_model = sm.OLS(data.endog, data.exog, missing='raise')
/build/statsmodels-0.9.0/.pybuild/pythonX.Y_3.5/build/statsmodels/regression/linear_model.py in __init__(self, endog, exog, missing, hasconst, **kwargs)
815 **kwargs):
816 super(OLS, self).__init__(endog, exog, missing=missing,
--> 817 hasconst=hasconst, **kwargs)
818 if "weights" in self._init_keys:
819 self._init_keys.remove("weights")
/build/statsmodels-0.9.0/.pybuild/pythonX.Y_3.5/build/statsmodels/regression/linear_model.py in __init__(self, endog, exog, weights, missing, hasconst, **kwargs)
661 weights = weights.squeeze()
662 super(WLS, self).__init__(endog, exog, missing=missing,
--> 663 weights=weights, hasconst=hasconst, **kwargs)
664 nobs = self.exog.shape[0]
665 weights = self.weights
/build/statsmodels-0.9.0/.pybuild/pythonX.Y_3.5/build/statsmodels/regression/linear_model.py in __init__(self, endog, exog, **kwargs)
177 """
178 def __init__(self, endog, exog, **kwargs):
--> 179 super(RegressionModel, self).__init__(endog, exog, **kwargs)
180 self._data_attr.extend(['pinv_wexog', 'wendog', 'wexog', 'weights'])
181
/build/statsmodels-0.9.0/.pybuild/pythonX.Y_3.5/build/statsmodels/base/model.py in __init__(self, endog, exog, **kwargs)
210
211 def __init__(self, endog, exog=None, **kwargs):
--> 212 super(LikelihoodModel, self).__init__(endog, exog, **kwargs)
213 self.initialize()
214
/build/statsmodels-0.9.0/.pybuild/pythonX.Y_3.5/build/statsmodels/base/model.py in __init__(self, endog, exog, **kwargs)
62 hasconst = kwargs.pop('hasconst', None)
63 self.data = self._handle_data(endog, exog, missing, hasconst,
---> 64 **kwargs)
65 self.k_constant = self.data.k_constant
66 self.exog = self.data.exog
/build/statsmodels-0.9.0/.pybuild/pythonX.Y_3.5/build/statsmodels/base/model.py in _handle_data(self, endog, exog, missing, hasconst, **kwargs)
85
86 def _handle_data(self, endog, exog, missing, hasconst, **kwargs):
---> 87 data = handle_data(endog, exog, missing, hasconst, **kwargs)
88 # kwargs arrays could have changed, easier to just attach here
89 for key in kwargs:
/build/statsmodels-0.9.0/.pybuild/pythonX.Y_3.5/build/statsmodels/base/data.py in handle_data(endog, exog, missing, hasconst, **kwargs)
631 klass = handle_data_class_factory(endog, exog)
632 return klass(endog, exog=exog, missing=missing, hasconst=hasconst,
--> 633 **kwargs)
/build/statsmodels-0.9.0/.pybuild/pythonX.Y_3.5/build/statsmodels/base/data.py in __init__(self, endog, exog, missing, hasconst, **kwargs)
63 if missing != 'none':
64 arrays, nan_idx = self.handle_missing(endog, exog, missing,
---> 65 **kwargs)
66 self.missing_row_idx = nan_idx
67 self.__dict__.update(arrays) # attach all the data arrays
/build/statsmodels-0.9.0/.pybuild/pythonX.Y_3.5/build/statsmodels/base/data.py in handle_missing(cls, endog, exog, missing, **kwargs)
279
280 elif missing == 'raise':
--> 281 raise MissingDataError("NaNs were encountered in the data")
282
283 elif missing == 'drop':
MissingDataError: NaNs were encountered in the data
If you want statsmodels to handle the missing data by dropping the observations, use missing='drop'.
In [11]: ols_model = sm.OLS(data.endog, data.exog, missing='drop')
We are considering adding a configuration framework so that you can set the option with a global setting.