python-3.x pytorch logistic-regression bayesian pyro

How to run a proper Bayesian Logistic Regression

I'm trying to run a bayesian logistic regression on the wine dataset provided from the sklearn package. As variables, I decided to use alcohol, color_intensity, flavanoids, hue and magnesium where alcohol is my response variable and the rest the predictors. To do so, I'm using pyro and torch packages:

import pyro
import torch
import pyro.distributions as dist
import pyro.optim as optim
from pyro.infer import SVI, Trace_ELBO
import pandas as pd
import numpy as np
from pyro.infer import Predictive
import torch.distributions.constraints as constraints
from sklearn import datasets
pyro.set_rng_seed(0)

#loading data and prepearing dataframe
 wine = datasets.load_wine()
 data = pd.DataFrame(columns = wine['feature_names'], data=wine['data'] )

#choosiing variables: response and predictors
 variables = data[['alcohol', 'color_intensity', 'flavanoids', 'hue', 'magnesium']]

#standardization
 variables = (variables-variables.min())/(variables.max()-variables.min())

#tensorizing
 alcohol = torch.tensor(variables['alcohol'].values, dtype=torch.float)
 predictors = torch.stack([torch.tensor(variables[column].values, dtype=torch.float)
               for column in ['alcohol', 'color_intensity', 'flavanoids', 'hue', 'magnesium']], 1)

#splitting data
 k = int(0.8 * len(variables))
 x_train, y_train = predictors[:k], alcohol[:k]
 x_test, y_test = predictors[k:], alcohol[k:]

#modelling

def model_alcohol(predictors, alcohol):
    n_observations, n_predictors = predictors.shape

 #weights
    w = pyro.sample('w', dist.Normal(torch.zeros(n_predictors), torch.ones(n_predictors)))
    epsilon = pyro.sample('epsilon', dist.Normal(0.,1.))

#non-linearity
    y_hat = torch.sigmoid((w*predictors).sum(dim=1) + epsilon)
    sigma = pyro.sample("sigma", dist.Uniform(0.,3.))
    with pyro.plate('alcohol', len(alcohol)):
       y=pyro.sample('y', dist.Normal(y_hat, sigma), obs=alcohol)

def guide_alcohol(predictors, alcohol=None):
    n_observations, n_predictors = predictors.shape

    w_loc = pyro.param('w_loc', torch.rand(n_predictors))
    w_scale = pyro.param('w_scale', torch.rand(n_predictors), constraint=constraints.positive)
    w = pyro.sample('w', dist.Normal(w_loc, w_scale))

    epsilon_loc = pyro.param('b_loc', torch.rand(1))
    epsilon_scale = pyro.param('b_scale', torch.rand(1), constraint=constraints.positive)
    epsilon = pyro.sample('epsilon', dist.Normal(epsilon_loc, epsilon_scale))

    sigma_loc = pyro.param('sigma_loc', torch.rand(n_predictors))
    sigma_scale = pyro.param('sigma_scale', torch.rand(n_predictors), 
                   constraint=constraints.positive)
    sigma = pyro.sample('sigma', dist.Normal(sigma_loc, sigma_scale))


alcohol_svi = SVI(model=model_alcohol, guide=guide_alcohol, optim=optim.ClippedAdam({'lr' : 0.0002}),
                loss=Trace_ELBO())

losses = []
for step in range(10000):
   loss = alcohol_svi.step(x_train, y_train)/len(x_train)
   losses.append(loss)

As I have to use Stochastic Variational Inference, I have defined both the model and the guide. My problem is now at matching tensor sizes, as I now I get the error:

RuntimeError: The size of tensor a (142) must match the size of tensor b (5) at non-singleton 
dimension 0
Trace Shapes:      
Param Sites:      
Sample Sites:      
   w dist   5 |
    value   5 |
 epsilon dist |
    value   1 |
 sigma dist   |
    value   5 |
 alcohol dist |
    value 142 |

I'm kinda new to the idea of modelling on my own, so clearly there are mistakes around the code (hopefully not on the theory behind it). Still, I see I should adjust dimension on the guide maybe? I'm not entirely sure on how to honestly.

Solution

Your main problem is that w is not declared as a single event (.to_event(1)), and your variance (sigma) should have the same dim as your observations (()). The model and guide below fix this; I suggest you look at auto-generated guides in Pyro, and a different prior on sigma.

def model_alcohol(predictors, alcohol):
    n_observations, n_predictors = predictors.shape

    # weights
    # w is a single event
    w = pyro.sample('w', dist.Normal(torch.zeros(n_predictors), torch.ones(n_predictors)).to_event(1))
    epsilon = pyro.sample('epsilon', dist.Normal(0., 1.))

    # non-linearity
    y_hat = torch.sigmoid(predictors @ w + epsilon)  # (predictors * weight).sum(1) == predictors @ w
    sigma = pyro.sample("sigma", dist.Uniform(0., 3.))
    with pyro.plate('alcohol', len(alcohol)):
        pyro.sample('y', dist.Normal(y_hat, sigma), obs=alcohol)


def guide_alcohol(predictors, alcohol=None):
    n_observations, n_predictors = predictors.shape

    w_loc = pyro.param('w_loc', torch.rand(n_predictors))
    w_scale = pyro.param('w_scale', torch.rand(n_predictors), constraint=constraints.positive)
    pyro.sample('w', dist.Normal(w_loc, w_scale).to_event(1))

    epsilon_loc = pyro.param('b_loc', torch.rand(1))
    epsilon_scale = pyro.param('b_scale', torch.rand(1), constraint=constraints.positive)
    epsilon = pyro.sample('epsilon', dist.Normal(epsilon_loc, epsilon_scale))

    sigma_loc = pyro.param('sigma_loc', torch.rand(1))
    sigma_scale = pyro.param('sigma_scale', torch.rand(1),
                             constraint=constraints.positive)
    pyro.sample('sigma', dist.HalfNormal(sigma_loc, sigma_scale))  # MUST BE POSITIVE