Search code examples
Tags: python, scikit-learn, statsmodels

RFE from scikit-learn feature_selection with NegativeBinomial from statsmodels as estimator


I'm trying to use RFE from scikit-learn with a statsmodels NegativeBinomial GLM as the estimator.

So I created my own class:

from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.base import BaseEstimator
import statsmodels.api as sm

class MyEstimator(BaseEstimator):
    # Wrapper around a statsmodels formula-API GLM, written so that it can be
    # passed to scikit-learn's RFE as the estimator.
    def __init__(self, formula_, data_, family_):
        # The model is built once, at construction time, from the formula and
        # the data frame supplied here.
        # NOTE(review): `formula` does not match the parameter name `formula_`;
        # presumably a typo — confirm.
        self.model = sm.formula.glm(formula, data=data_, family=family_)

    def fit(self, **kwargs):
        # Accepts keyword arguments only; no positional X / y are taken.
        # NOTE(review): the object returned by self.model.fit() is discarded,
        # and `params` is then read from the model itself — confirm that the
        # model object actually exposes `params`.
        self.model.fit()
        self.coef_ = self.model.params.values

    def predict(self, X):
        # Returns the model's prediction for X converted to a NumPy array.
        # NOTE(review): `np` is not imported in this snippet.
        result = self.model.predict(X)    
        return np.array(result)

# Synthetic data from make_friedman1: 50 samples, 10 features.
X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)


# Only the first three feature columns are placed in the frame, next to the target.
# NOTE(review): `pd` is not imported in this snippet.
dataset = pd.DataFrame({'X1':X[:,0], 'X2':X[:,1], 'X3':X[:,2], 'y':y})

estimator = MyEstimator("y ~ X1 + X2 + X3", dataset, sm.families.NegativeBinomial())

# NOTE(review): n_features_to_select=5 is requested although the formula names
# only three predictors — confirm the intent.
selector = RFE(estimator, n_features_to_select=5, step=1)
# fit() is called with no X / y here; presumably this is the call that raises
# the reported TypeError.
selector = selector.fit()

But I get this error:

TypeError: fit() missing 2 required positional arguments: 'X' and 'y'

Does anyone have an idea?


Solution

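  • RFE calls fit(X, y) on the estimator with array data (and selector.fit itself must be given X and y), so an estimator built from a fixed formula and data frame cannot be refitted on feature subsets. You can modify your code to take the endog and exog variables in fit, instead of using the formula API: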

    import numpy as np
    import pandas as pd
    from sklearn.datasets import make_friedman1
    from sklearn.feature_selection import RFE
    from sklearn.base import BaseEstimator
    import statsmodels.api as sm
    
    class MyEstimator(BaseEstimator):
        """Scikit-learn style wrapper around a statsmodels GLM.

        The GLM is rebuilt on every call to ``fit`` from the arrays passed in,
        which is what lets RFE refit it on successively smaller feature sets.

        Parameters
        ----------
        family_ : statsmodels family instance
            Passed unchanged as ``family=`` to ``sm.GLM``.

        Attributes
        ----------
        model : the ``sm.GLM`` instance built by the most recent ``fit``.
        results_ : the object returned by ``model.fit()``.
        coef_ : ndarray of the fitted parameters, one per column of ``exog``.
        """

        def __init__(self, family_):
            # Stored under the same name as the constructor argument so that
            # BaseEstimator.get_params / clone can read it back.
            self.family_ = family_

        def fit(self, exog, endog):
            """Fit the GLM of ``endog`` on ``exog`` and return ``self``.

            ``exog`` is used as given; no intercept column is added here.
            """
            self.model = sm.GLM(endog, exog, family=self.family_)
            # Keep the results object: predictions need the fitted parameters,
            # which live on the results rather than on the model.
            self.results_ = self.model.fit()
            self.coef_ = np.asarray(self.results_.params)
            return self

        def predict(self, X):
            """Return the fitted model's predictions for ``X`` as an ndarray.

            Must be called after ``fit``.
            """
            # GLM.predict takes ``params`` as its first argument, so calling it
            # with X alone would treat X as the coefficient vector. Predict
            # through the results object, which supplies the fitted params.
            return np.asarray(self.results_.predict(X))
    
    # Build a synthetic regression problem with make_friedman1.
    X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)

    # Wrap a negative-binomial GLM so that RFE can drive it.
    estimator = MyEstimator(family_=sm.families.NegativeBinomial())

    # Construct the selector and fit it in one expression; the target is
    # passed as a single-column 2-D array.
    selector = RFE(
        estimator=estimator,
        n_features_to_select=5,
        step=1,
    ).fit(X, y.reshape(-1, 1))
    print(selector.ranking_)
    # [1 1 3 1 1 5 1 6 4 2]