Search code examples
pythongroup-bystatsmodels

How to use ols with groupby?


The following code is from "Python for Data Analysis",chp 11,group transforms and analysis. I show the version of each library as below.

# -*- coding: utf-8 -*-
""" Created on Sun Jun  4 13:33:47 2017
"Python for Data Analysis",chp 11,group transforms and analysis. 
"""
import numpy as np    # np.__version__'1.12.1'
import pandas as pd   # pd.__version__ '0.20.2'
import random; random.seed(a=0,version=2)
import statsmodels.api as sm  # statsmodels.__version__  '0.8.0'
import string

# generate tickers from random
N=1000
def rands(n):
    choices=string.ascii_uppercase
    return (''.join([random.choice(choices) for _ in range(n)]))
tickers=np.array([rands(5) for _ in range(N)])

# generate data for tickers
M=500
df=pd.DataFrame({'Momentum': np.random.randn(M)/200+0.03,
                 'Value':np.random.randn(M)/200+0.08,
                 'ShortInterest':np.random.randn(M)/200-0.02},
                index=tickers[:M])
# create industry
ind_names=np.array(['Financial','Tech'])

sampler=np.random.randint(low=0,high=len(ind_names),size=N, dtype='l')
industries=pd.Series(ind_names[sampler],index=tickers, 
                     name='industry')

#%% factor analysis
fac1,fac2,fac3=np.random.rand(3,1000)
ticker_subset=tickers.take(np.random.permutation(N)[:1000])

port=pd.Series(0.7*fac1-1.2*fac2+0.3*fac3+np.random.rand(1000),
               index=ticker_subset)
factors=pd.DataFrame({'f1':fac1,'f2':fac2,'f3':fac3},
                          index=ticker_subset)

by_ind=port.groupby(industries)

This part is from the book, while pd.ols has been depreciated.

#%% use pd.ols, which is depreciated.
# AttributeError: module 'pandas' has no attribute 'ols'
def beta_exposure(chuck,factors=None):
    return pd.ols(y=chuck, x=factors).beta
exposures_pd=by_ind.apply(beta_exposure,factors=factors)
print('\nexposures_pd\n',exposures_pd.unstack())

I would like to use sm.OLS, while I have trouble in selecting corresponding rows for x. How should I deal with it?

#%% use sm.OLS, which is not show in the book.
def exposure(chuck,factors):
    y=np.array(chuck).reshape(len(chuck),1)
#   The following code is wrong, as the rows number is not the corresponding rows as y
#   I use [:len(chuck)] just to keep x have same rows number as y.
    x=factors[['f1','f2','f3']][:len(chuck)]
    print(x[:5])
    print(x.shape)
    sx=sm.OLS(y,x).fit()
    print(sx.summary())
    return sm.OLS(y,x).fit()
exposures_sm=exposure(port, factors)

Solution

  • after several try, I think maybe I can do it by combine Series and DataFrame.

    factors_data['port']=port
    
    def group_ols(fts):
        results=[]
        for ind, ft in fts:
            y=ft.loc[:,'port']
            x=ft.loc[:,['f1','f2','f3']]
            result=sm.OLS(y,x).fit()
            results.append((ind,result.summary()))
        return results
    
    exposures_sm=group_ols(factors_data.groupby(industries))
    exposures_sm
    

    the result is like this.

    [('Financial', <class 'statsmodels.iolib.summary.Summary'>
      """
                                  OLS Regression Results                            
      ==============================================================================
      Dep. Variable:                   port   R-squared:                       0.746
      Model:                            OLS   Adj. R-squared:                  0.744
      Method:                 Least Squares   F-statistic:                     482.4
      Date:                Thu, 29 Jun 2017   Prob (F-statistic):          2.37e-146
      Time:                        17:13:34   Log-Likelihood:                -134.55
      No. Observations:                 497   AIC:                             275.1
      Df Residuals:                     494   BIC:                             287.7
      Df Model:                           3                                         
      Covariance Type:            nonrobust                                         
      ==============================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
      ------------------------------------------------------------------------------
      f1             1.0231      0.043     23.894      0.000       0.939       1.107
      f2            -0.9639      0.042    -23.146      0.000      -1.046      -0.882
      f3             0.6397      0.042     15.391      0.000       0.558       0.721
      ==============================================================================
      Omnibus:                       34.466   Durbin-Watson:                   1.916
      Prob(Omnibus):                  0.000   Jarque-Bera (JB):               12.724
      Skew:                          -0.063   Prob(JB):                      0.00173
      Kurtosis:                       2.226   Cond. No.                         3.24
      ==============================================================================
    
      Warnings:
      [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
      """), ('Tech', <class 'statsmodels.iolib.summary.Summary'>
      """
                                  OLS Regression Results                            
      ==============================================================================
      Dep. Variable:                   port   R-squared:                       0.738
      Model:                            OLS   Adj. R-squared:                  0.736
      Method:                 Least Squares   F-statistic:                     468.9
      Date:                Thu, 29 Jun 2017   Prob (F-statistic):          7.30e-145
      Time:                        17:13:34   Log-Likelihood:                -172.76
      No. Observations:                 503   AIC:                             351.5
      Df Residuals:                     500   BIC:                             364.2
      Df Model:                           3                                         
      Covariance Type:            nonrobust                                         
      ==============================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
      ------------------------------------------------------------------------------
      f1             1.0530      0.045     23.525      0.000       0.965       1.141
      f2            -0.8811      0.045    -19.742      0.000      -0.969      -0.793
      f3             0.5762      0.046     12.538      0.000       0.486       0.667
      ==============================================================================
      Omnibus:                       45.191   Durbin-Watson:                   2.013
      Prob(Omnibus):                  0.000   Jarque-Bera (JB):               15.547
      Skew:                          -0.123   Prob(JB):                     0.000421
      Kurtosis:                       2.175   Cond. No.                         3.29
      ==============================================================================
    
      Warnings:
      [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
      """)]