Search code examples
pythonapistatsmodelsquantile-regression

python statsmodels: Difference in output "formula.api" vs. ""regression.quantile_regression"


For the modul statsmodels using python, I would please like to know how differences in calling the same procedures using statsmodels.formula.api versus statsmodels.regression.quantile_regression come about. In particular, I obtain differences in parameter estimates.

A minimum working example is attached.

#%% Moduls;
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.regression.quantile_regression import QuantReg


#%% Load in sample data;
data = sm.datasets.engel.load_pandas().data

#%% smf-Version;
model1 = smf.quantreg(formula='foodexp ~ income', data=data, missing="drop")
result1 = model1.fit(q=0.5, vcov='robust', kernel='epa', bandwidth='hsheather', max_iter=1000, p_tol=1e-06)

#%% QuantReg-Version;
model2 = QuantReg \
    (
        data['foodexp'].values,
        exog            =           sm.tools.tools.add_constant(data['income']).values,
        missing         =           'drop'
    )
result2 = model2.fit \
    (
        q              =           0.5, vcov='robust', kernel='epa', bandwidth='hsheather', max_iter=1000, p_tol=1e-06
    )

#%% Compare Results;
print(result1.params[0])
print(result2.params[0])
print('Difference times 10^9:       ' + str(abs(10**9*(result1.params[0]-result2.params[0]))))

EDIT:

I need to edit my question; the workaround proposed by below, for which I am still very grateful, does not work in the applied setting; reason: I do not have only 1 regressor. Please find the modified version attached.

#%% Moduls;
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.regression.quantile_regression import QuantReg


#%% Load in sample data;
data = sm.datasets.engel.load_pandas().data
data['income2'] = data['income']**2

#%% smf-Version;
model1 = smf.quantreg(formula='foodexp ~ income + income2', data=data, missing="drop")
result1 = model1.fit(q=0.5, vcov='robust', kernel='epa', bandwidth='hsheather', max_iter=1000, p_tol=1e-06)

#%% QuantReg-Version;
model2 = QuantReg \
    (
        data['foodexp'].values,
        exog            =           sm.tools.tools.add_constant(data[['income', 'income2']].values),
        missing         =           'drop'
    )
result2 = model2.fit \
    (
        q              =           0.5, vcov='robust', kernel='epa', bandwidth='hsheather', max_iter=1000, p_tol=1e-06
    )

#%% Compare Results;
print(result1.params[0])
print(result2.params[0])
print('Difference times 10^9:       ' + str(abs(10**9*(result1.params[0]-result2.params[0]))))

Solution

  • You need a small change in your code. That's making a big difference

    #%% QuantReg-Version;
    model2 = QuantReg ( data['foodexp'].values, exog = sm.tools.tools.add_constant(data['income'].values), missing = 'drop')
    

    As you are putting it outside is making a big difference in internal implementation.

    Final implementation

        #%% Moduls;
        import numpy as np
        import pandas as pd
        import statsmodels.api as sm
        import statsmodels.formula.api as smf
        from statsmodels.regression.quantile_regression import QuantReg
    
    
        #%% Load in sample data;
        data = sm.datasets.engel.load_pandas().data
    
        #%% smf-Version;
        model1 = smf.quantreg(formula='foodexp ~ income', data=data, missing="drop")
        result1 = model1.fit(q=0.5, vcov='robust', kernel='epa', bandwidth='hsheather', 
        max_iter=1000, p_tol=1e-06)
    
        #%% QuantReg-Version;
        model2 = QuantReg \
            (
                data['foodexp'].values,
                exog  =   sm.tools.tools.add_constant(data['income'].values),
                missing  = "drop"
            )
        result2 = model2.fit(q=0.5, vcov='robust', kernel='epa', bandwidth='hsheather', max_iter=1000, p_tol=1e-06)
    
        #%% Compare Results;
        print(result1.params[0])
        print(result2.params[0])
        print('Difference times 10^9:       ' + str(abs(10**9*(result1.params[0]-result2.params[0]))))
    

    Addition to my above code. I have copied exog from model 2 to model 1

        #%% Moduls;
    import statsmodels.api as sm
    import statsmodels.formula.api as smf
    from statsmodels.regression.quantile_regression import QuantReg
    
    
    #%% Load in sample data;
    data = sm.datasets.engel.load_pandas().data
    data['income2'] = data['income']**2
    
    model1 = smf.quantreg(formula='foodexp ~ income + income2', data=data, missing="drop")
    model2 = QuantReg (data['foodexp'].values, exog = sm.tools.tools.add_constant(data[['income', 'income2']].values), missing = 'drop')
    model1.exog = model2.exog 
    
    result1 = model1.fit(q=0.5, vcov='robust', kernel='epa', bandwidth='hsheather', max_iter=1000, p_tol=1e-06)
    result2 = model2.fit(q=0.5, vcov='robust', kernel='epa', bandwidth='hsheather', max_iter=1000, p_tol=1e-06)
    
    #%% Compare Results;
    print(result1.params[0])
    print(result2.params[0])
    print('Difference times 10^9:       ' + str(abs(10**9*(result1.params[0]-result2.params[0]))))
    

    And second approach:- I have copied exog from model 1 to model 2

    #%% Moduls;
    import statsmodels.api as sm
    import statsmodels.formula.api as smf
    from statsmodels.regression.quantile_regression import QuantReg
    
    
    #%% Load in sample data;
    data = sm.datasets.engel.load_pandas().data
    data['income2'] = data['income']**2
    
    model1 = smf.quantreg(formula='foodexp ~ income + income2', data=data, missing="drop")
    model2 = QuantReg (data['foodexp'].values, exog = sm.tools.tools.add_constant(data[['income', 'income2']].values), missing = 'drop')
    model2.exog = model1.exog 
    
    result1 = model1.fit(q=0.5, vcov='robust', kernel='epa', bandwidth='hsheather', max_iter=1000, p_tol=1e-06)
    result2 = model2.fit(q=0.5, vcov='robust', kernel='epa', bandwidth='hsheather', max_iter=1000, p_tol=1e-06)
    
    #%% Compare Results;
    print(result1.params[0])
    print(result2.params[0])
    print('Difference times 10^9:       ' + str(abs(10**9*(result1.params[0]-result2.params[0]))))
    

    If i keep both exog to same values, answers are equal. So there is clear difference in implementation for data conversion i stated previously.