Tags: python, linear-regression, statsmodels, variable-selection

Using AIC for variable selection and model evaluation in multiple regression


I am fairly new to R and Python. I would like to perform multiple regression using the Akaike Information Criterion (AIC) for variable selection and to evaluate my model.

I have written some code to select my variables using the F-statistic p-value. The dataset consists of housing price information.

I am planning to regress resale price (i.e., yvar) on the explanatory variables (i.e., xvar).

Instead of using minfpv to select my variables, I would like to use the AIC to select them.

I would also like to evaluate the resale price model using AIC instead of fpv.

# Multiple Regression Variable Selection
def mr(selection=False):
    import os
    os.chdir(r'C:\Users\Path')
    import pandas as pd
    h=pd.read_csv('Dataset.csv',index_col=0)
    #print(h.head(0)) # dataset's variable names

    yvar='resale_price'
    modeleq = yvar + ' ~'
    for xvar in ( # Insert new 'x variable' into a row, ending with ','
        'storey_range_lower',
        'storey_range_lower_rt',
        'storey_range_lower_sq',
        'storey_range_upper',
        'storey_range_upper_rt',
        'storey_range_upper_sq',
        'floor_area_sqm',
        'floor_area_sqm_rt',
        'floor_area_sqm_sq',
        'lease_commence_year',
        'lease_commence_year_rt',
        'lease_commence_year_sq',
        'transaction_month',
        'transaction_month_rt',
        'transaction_month_sq',
        'town',
        'flat_model',
        'flat_type',
        'no_of_rooms',
        'block_number',
        'block_number_rt',
        'block_number_sq',
        'postal_code',
        'postal_code_rt',
        'postal_code_sq',
        'postal_code_2digit',
        'postal_code_2digit_rt',
        'postal_code_2digit_sq',
        ):
        if modeleq[-1] == '~':
            modeleq = modeleq + ' ' + xvar
        else:
            modeleq = modeleq + ' + ' + xvar

    #import matplotlib.pyplot as pl
    #%matplotlib inline
    #import numpy as np

    import statsmodels.api as sm
    from statsmodels.formula.api import ols

    bmodeleq=modeleq
    if selection :
        print('Variable Selection using p-value & PR(>F):')
        minfpv = 1.0

        while True :
            #Specify C() for Categorical, else could be interpreted as numeric:
            #hout=ols('resale_price ~ floor_area_sqm + C(flat_type)', data=h).fit()
            hout=ols(modeleq, data=h).fit()
            if modeleq.find(' + ') == -1 :
                # 1 xvar left
                break

            #print(dir(hout)) gives all the attributes of .fit(), e.g. .fvalue & .f_pvalue
            fpv=hout.f_pvalue
            if fpv < minfpv :
                minfpv=fpv
                bmodeleq=modeleq
            print('\nF-statistic =',hout.fvalue,'       PR(>F) =',fpv)

            prf = sm.stats.anova_lm(hout, typ=3)['PR(>F)']
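            # typ=3 ANOVA lists the Intercept row first (and Residual last),
            # so prf[1:] skips the Intercept when searching for the largest PR(>F).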
            maxp=max(prf[1:])

            #print('\n',dict(prf))

            xdrop = prf[maxp==prf].axes[0][0] # 1st element of row-label .axes[0]
            #if xdrop.find('Intercept') != -1 :
            #    break

            # xdrop removed from model equation:
            if (modeleq.find('~ ' + xdrop + ' + ') != -1): 
                modeleq = modeleq.replace('~ ' + xdrop + ' + ','~ ') 
            elif (modeleq.find('+ ' + xdrop + ' + ') != -1): 
                modeleq = modeleq.replace('+ ' + xdrop + ' + ','+ ')
            else:
                modeleq = modeleq.replace(' + ' + xdrop,'') 
            #print('Model equation:',modeleq,'\n')

            print('Variable to drop:',xdrop,'       p-value =',prf[xdrop])
            #print('\nVariable left:\n'+str(prf[maxp!=prf][:-1]),'\n')

        print('\nF-statistic =',hout.fvalue,'       PR(>F) =',hout.f_pvalue)
        print('Variable left:\n'+str(prf[maxp!=prf][:-1]),'\n')
        #input("found intercept")
        print('Best model equation:',bmodeleq)
        print('Minimum PR(>F) =',minfpv,'\n')

    hout=ols(bmodeleq, data=h).fit()

    print(sm.stats.anova_lm(hout, typ=1))
    #print(anova) # Anova table with 'Treatment' broken up
    hsum=hout.summary()

    print('\n',hsum)

    last=3 #number of bottom p-values to display with more precision
    #p-values are not in general the same as PR(>F) from ANOVA
    print("\nLast",last,"x-coefficients' p-values:")
    nxvar=len(hout.pvalues)
    for i in range(last,0,-1):
        print('    ',hout.pvalues.index[nxvar-i],'    ',hout.pvalues.iloc[nxvar-i])

    # Output Coefficient table:
    #from IPython.core.display import HTML
    #HTML(hout.summary().tables[1].as_html()) #.tables[] from 0 to 3

mr(True) # do Variable Selection
#mr() # do multiple regression once

I appreciate any help I can get. Thank you in advance!


Solution

  • In your code, this comment points to the answer:

    #print(dir(hout)) gives all the attributes of .fit(), e.g. .fvalue & .f_pvalue
    

    hout has an aic attribute that you can access as hout.aic.

    The straightforward answer is to use hout.aic instead of hout.f_pvalue on the line where you currently set fpv=hout.f_pvalue.

    However, you will need to re-specify the initial check value minfpv, since 1.0 would be far too small a starting point for AIC in this case. That is the minfpv = 1.0 line.

    Try it out and see what the initial minfpv should be; a minimal sketch of an AIC-driven loop is included after this answer for reference.

    Neo :)
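
For reference, below is a minimal sketch of what an AIC-driven version of the selection loop could look like (one possible arrangement, not the only one). It assumes h (the DataFrame) and the full modeleq string have already been built as in the question's code, and it initialises the running best value with float('inf'), since AIC has no fixed upper bound; the rule for choosing which variable to drop (largest PR(>F) from the type-3 ANOVA) is kept from the original loop.

# Minimal sketch (not the question's code verbatim): backward elimination
# driven by AIC rather than PR(>F). Assumes `h` (the DataFrame) and
# `modeleq` (the full formula string) have been built as in mr() above.
import statsmodels.api as sm
from statsmodels.formula.api import ols

best_aic = float('inf')      # AIC has no fixed upper bound, so start at +infinity
best_modeleq = modeleq

while True:
    hout = ols(modeleq, data=h).fit()

    # Keep the model with the smallest AIC seen so far.
    if hout.aic < best_aic:
        best_aic = hout.aic
        best_modeleq = modeleq
    print('AIC =', hout.aic, '    formula:', modeleq)

    if modeleq.find(' + ') == -1:    # only one x variable left
        break

    # As in the original loop, drop the term with the largest PR(>F)
    # from the type-3 ANOVA table (excluding the Intercept and Residual rows).
    prf = sm.stats.anova_lm(hout, typ=3)['PR(>F)']
    xdrop = prf.iloc[1:-1].idxmax()

    if modeleq.find('~ ' + xdrop + ' + ') != -1:
        modeleq = modeleq.replace('~ ' + xdrop + ' + ', '~ ')
    elif modeleq.find('+ ' + xdrop + ' + ') != -1:
        modeleq = modeleq.replace('+ ' + xdrop + ' + ', '+ ')
    else:
        modeleq = modeleq.replace(' + ' + xdrop, '')

print('Best model by AIC:', best_modeleq)
print('Minimum AIC =', best_aic)

Besides switching the comparison to hout.aic (smaller is better) and the initial sentinel value, this sketch also records the final one-variable model before breaking; the refit-and-drop mechanics are otherwise unchanged from the question's code.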