Search code examples
pythontheanopymc3

Prediction using Bayesian logistic regression using pymc3


I am trying to perform Bayesian logistic regression using pymc3, but I am facing an issue in using the model to perform prediction.

Data:

My dataset is of the housing loan default data, with sample data as follows:

BAD LOAN MORTDUE VALUE  REASON  JOB     YOJ DEROG   DELINQ  CLAGE     NINQ  CLNO    DEBTINC
1   1700 0548    40320  HomeImp Other   9   0       0       101.466002  1   8      37.113614
1   1800 28502   43034  HomeImp Other   11  0       0       88.766030   0   8      36.884894
0   2300 102370  120953 HomeImp Office  2   0       0       90.992533   0   13     31.588503

Problem:

I want to perform prediction on the test dataset, one way to do this is using the shared variable approach:

X_shared = theano.shared(X_train)

with pm.Model() as logistic_model_pred:
    pm.glm.GLM(x=X_shared,
               y=y_train, 
               labels=labels,
               family=pm.glm.families.Binomial())


X_shared.set_value(X_test)
ppc = pm.sample_ppc(pred_trace,
                model=logistic_model_pred,
                samples=100)

However, using the above code (theano shared variable) is causing the following issue:

Error message:

AsTensorError: ('Variable type field must be a TensorType.', <Generic>, <theano.gof.type.Generic object at 0x00000216ABB16730>)

Possible Solution:

Using the following code does solve the issue but I do not know how to use the same model for test data.

with pm.Model() as logistic_model_pred:
    pm.glm.GLM.from_formula('BAD ~ DELINQ + DEROG + DEBTINC + NINQ + CLNO + VALUE + MORTDUE + YOJ + LOAN + CLAGE + JOB',
                            data=pd.concat([y_train.reset_index(drop=True), X_train], axis=1),
                            family=pm.glm.families.Binomial())
    pred_trace = pm.sample(tune=1500,
                      draws=1000,
                         chains=4,
                         cores=1,
                      init='adapt_diag')

Full code:

%matplotlib inline
from pathlib import Path
import pickle
from collections import OrderedDict
import pandas as pd
import numpy as np
from scipy import stats
import multiprocessing
import arviz as az

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import (roc_curve, roc_auc_score, confusion_matrix, accuracy_score, f1_score, 
                             precision_recall_curve, balanced_accuracy_score) 
from mlxtend.plotting import plot_confusion_matrix

import theano
import pymc3 as pm
from pymc3.variational.callbacks import CheckParametersConvergence
import statsmodels.formula.api as smf

import arviz as az
import matplotlib.pyplot as plt
import matplotlib.cm as cm

import seaborn as sns
from IPython.display import HTML

import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")


# intialise data of lists. 
data = {'BAD':[1,1,0,1,0,0,0,1,1,0,0,1,0,0,1,0,1],
        'LOAN':[1700,1800,2300,2400,2400,2900,2900,2900,2900,
3000,3200,3300,3600,3600,3700,3800,3900],
        'MORTDUE':[30548,28502,102370,34863,98449,103949,104373,7750,61962,104570,
74864,130518,100693,52337,17857,51180,29896],
        'VALUE':[40320,43034,120953,47471,117195,112505,120702,67996,70915,121729,
87266,164317,114743,63989,21144,63459,45960],
        'REASON':['HomeImp','HomeImp','HomeImp','HomeImp','HomeImp',
'HomeImp','HomeImp','HomeImp',
                  'DebtCon','HomeImp','HomeImp','DebtCon','HomeImp','HomeImp',
'HomeImp','HomeImp','HomeImp'],
        'JOB':['Other','Other','Office','Mgr','Office','Office','Office',
'Other','Mgr','Office','ProfExe',
               'Other','Office','Office','Other','Office','Other'],
        'YOJ':[9,11,2,12,4,1,2,16,2,2,7,9,6,20,5,20,11],
        'DEROG':[0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0],
        'DELINQ':[0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0],
        'CLAGE':[101.4660019,88.76602988,90.99253347,70.49108003,
93.81177486,96.10232967,101.5402975,
                 122.2046628,282.8016592,85.8843719,250.6312692,
192.289149,88.47045214,204.2724988,
                 129.7173231,203.7515336,146.1232422],
        'NINQ':[1,0,0,1,0,0,0,2,3,0,0,0,0,0,1,0,0],
        'CLNO':[8,8,13,21,13,13,13,8,37,14,12,33,14,20,9,20,14],
        'DEBTINC':[37.11361356,36.88489409,31.58850318,38.26360073,
29.68182705,30.05113629,29.91585903,
                   36.211348,49.20639579,32.05978327,42.90999735,
35.73055919,29.39354338,20.47091551,
                   26.63434752,20.06704205,24.47888119]
       } 
  
# Create DataFrame 
data = pd.DataFrame(data) 

# datatype defining
data[['BAD', 'LOAN', 'MORTDUE', 'VALUE', 'YOJ', 'DEROG', 'DELINQ', 
'CLAGE', 'NINQ', 'CLNO', 'DEBTINC']] = data[['BAD', 'LOAN', 'MORTDUE', 
'VALUE', 'YOJ', 'DEROG', 'DELINQ', 'CLAGE', 'NINQ', 'CLNO', 'DEBTINC' ]].apply(pd.to_numeric) 
data[['REASON', 'JOB']] = data[['REASON', 'JOB']].apply(lambda x: x.astype('category'))
print(data.dtypes) 
data.dropna(axis=0, how='any',inplace=True)    

# test train split
X = data.drop('BAD', axis=1)
y = data.BAD
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12345)
labels = X_train.columns


# model training (error cause)
X_shared = theano.shared(X_train)

with pm.Model() as logistic_model_pred:
    pm.glm.GLM(x=X_shared,
               y=y_train, 
               labels=labels,
               family=pm.glm.families.Binomial())


# Prediction on test data

X_shared = theano.shared(X_test)

ppc = pm.sample_ppc(pred_trace,
                    model=logistic_model_pred,
                    samples=100)


# AUC
np.mean(ppc['y'], axis=0).shape
y_score = np.mean(ppc['y'], axis=0)
roc_auc_score(y_score=np.mean(ppc['y'], axis=0), 
              y_true=y_test)

pred_scores = dict(y_true=y_test, y_score=y_score)
cols = ['False Positive Rate', 'True Positive Rate', 'threshold']
roc = pd.DataFrame(dict(zip(cols, roc_curve(**pred_scores))))

precision, recall, ts = precision_recall_curve(y_true=y_test, probas_pred=y_score)
pr_curve = pd.DataFrame({'Precision': precision, 'Recall': recall})

f1 = pd.Series({t: f1_score(y_true=y_test, y_pred=y_score>t) for t in ts})
best_threshold = f1.idxmax()




# Alternative solution

with pm.Model() as logistic_model_pred:
    pm.glm.GLM.from_formula('BAD ~ DELINQ + DEROG + DEBTINC + NINQ + 
CLNO + VALUE + MORTDUE + YOJ + LOAN + CLAGE + JOB',
                            data=pd.concat([y_train.reset_index(drop=True), X_train], axis=1),
                            family=pm.glm.families.Binomial())
    pred_trace = pm.sample(tune=1500,
                      draws=1000,
                         chains=4,
                         cores=1,
                      init='adapt_diag')

Solution

  • If you replace the code between your # model training (error cause) and # AUC comments with the below you should be able to run it and start getting some results:

    # model training (error cause)
    X_train2 = X_train[X_train.columns[0:3]].values
    scaler = preprocessing.StandardScaler()
    scaler.fit(X_train2)
    X_train2 = scaler.transform(X_train2)
    
    X_shared = theano.shared(X_train2) #theano.shared(X_train)
    
    with pm.Model() as logistic_model_pred:
        pm.glm.GLM(x=X_shared,
                   y=y_train.values, 
                   labels=labels[0:3],
                   family=pm.glm.families.Binomial())
        trace = pm.sample()
    
    
    # Prediction on test data
    X_test2 = scaler.transform(X_test[X_train.columns[0:3]].values)
    
    #X_shared = theano.shared(X_test2)
    X_shared.set_value(X_test2)
    
    ppc = pm.sample_ppc(trace,
                        model=logistic_model_pred,
                        samples=100)
    # AUC
    

    I've made the following changes:

    • I've changed variable that goes into theano.shared to a numpy array.
    • This means that the string columns in X_train need to be transformed using one hot encoding or something similar. I haven't done that therefore I only used the first 3 columns that happens to be numerical
    • I've also used the standard scaler to rescale the inputs before running pymc3.
    • Lastly for the posterior predictions I changed the values of the shared variable using .set_value

    The sampling gives many divergences but I think the above provides you with the setup.