Tags: python, bayesian, pymc

Blackbox likelihood example


I'm trying to understand how to use a black box likelihood function in pymc. Basically, this is explained here. I have tried implementing it on my own with a very simple Python model (a double logistic function) and no gradient. In addition to the code from the linked example, which sets up the Theano wrapper around the log-likelihood function (a sketch of that wrapper is included after the listing for reference), I have the following code:

# Define the model
def my_model(p, t, m_m, m_M):
    """An simple double logistic model"""
    return m_m + (m_M - m_m) * (
        1.0 / (1 + np.exp(-p[0] * (t - p[1] * 365)))
        + 1.0 / (1 + np.exp(p[2] * (t - p[3] * 365)))
        - 1
    )

# The loglikelihood function
def log_lklhood(theta, t, data, sigma):
    """Normal log-likelihoood"""
    y_pred = my_model(theta, t, data.max(), data.min())
    retval = -(0.5 / sigma ** 2) * np.sum((data - y_pred) ** 2)
    return retval

## Experimental data

data = np.array([0.104, 0.133, 0.131, 0.329, 0.669, 0.822, 0.847, 0.807, 0.638, 0.486, 0.162, 0.125])
t = np.array([1, 32, 61, 92, 122, 153, 183, 214, 245, 275, 306, 336])
sigma = 0.02 # Uncertainty in the observations
# Straight outta internet...
logl = LogLike(log_lklhood, data, t, sigma)
ndraws = 500  # number of draws from the distribution
nburn = 100  # number of "burn-in points" (which we'll discard)

# use PyMC3 to sample from the log-likelihood
with pm.Model():
    theta1 = pm.Normal("theta_1", mu=0, sigma=1, testval=0.1)
    theta3 = pm.Normal("theta_3", mu=0, sigma=1, testval=0.1)
    theta2 = pm.Uniform("theta_2", lower=0.01, upper=0.5, testval=120.0 / 365)
    theta4 = pm.Uniform("theta_4", lower=0.51, upper=0.99, testval=250.0 / 365)

    # convert theta1...theta4 to a tensor vector
    theta = tt.as_tensor_variable([theta1, theta2, theta3, theta4])
    # use a DensityDist (use a lambda function to "call" the Op)
    pm.DensityDist("likelihood", lambda v: logl(v), observed={"v": theta})
    # Use simple metropolis
    step = pm.Metropolis()
    trace = pm.sample(ndraws, tune=nburn, step=step, discard_tuned_samples=True)
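
For reference, the LogLike wrapper used above is essentially the one from the linked tutorial. A minimal sketch of it (no gradient, with a call signature matching log_lklhood above) looks roughly like this:

class LogLike(tt.Op):
    """Wrap a plain-Python log-likelihood so Theano can call it."""

    itypes = [tt.dvector]  # the parameter vector theta
    otypes = [tt.dscalar]  # the scalar log-likelihood value

    def __init__(self, loglike, data, t, sigma):
        self.likelihood = loglike
        self.data = data
        self.t = t
        self.sigma = sigma

    def perform(self, node, inputs, outputs):
        (theta,) = inputs  # unpack the parameter vector
        logp = self.likelihood(theta, self.t, self.data, self.sigma)
        outputs[0][0] = np.array(logp)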

This goes through the sampling but then fails with a rather cryptic error:

Multiprocess sampling (2 chains in 2 jobs)
CompoundStep
>Metropolis: [theta_4]
>Metropolis: [theta_2]
>Metropolis: [theta_3]
>Metropolis: [theta_1]

100.00% [1010/1010 00:01<00:00 Sampling 2 chains, 0 divergences]

Sampling 2 chains for 5 tune and 500 draw iterations (10 + 1_000 draws total) took 2 seconds.

---------------------------------------------------------------------------
MissingInputError                         Traceback (most recent call last)
<ipython-input-14-3e5ab1ff0971> in <module>
     17     pm.DensityDist('likelihood', lambda v: logl(v), observed={'v': theta})
     18     step = pm.Metropolis()
---> 19     trace = pm.sample(ndraws, tune=nburn, step=step, discard_tuned_samples=True)

[...]


MissingInputError: Input 0 of the graph (indices start from 0), used to compute sigmoid(theta_4_interval__), was not provided and not given a value. Use the Theano flag exception_verbosity='high', for more information on this error.

I'm very new to pymc, so I don't really know what this error means or what I am doing wrong. There are a number of SO questions on this that suggest the problem is calling a Python function with a Theano array, but I don't see where that is happening in my code.


Solution

  • So it turns out that there's an issue with the blackbox likelihood example: don't use pm.DensityDist, but rather pm.Potential (see this arviz issue). With that change the example works correctly, even using scipy.optimize.approx_fprime to approximate the gradient of the log-likelihood:

    import numpy as np
    import pymc3 as pm
    import theano.tensor as tt
    import arviz as az
    import scipy.optimize # Can't be bothered calculating derivatives by hand
    
    ## Experimental data
    
    data = np.array([0.104, 0.133, 0.131, 0.329, 0.669, 0.822, 0.847, 0.807, 0.638, 0.486, 0.162, 0.125])
    t = np.array([1, 32, 61, 92, 122, 153, 183, 214, 245, 275, 306, 336])
    
    ## LogLikelihood and gradient of the LogLikelihood functions
    def log_likelihood(theta, data, t):
        # When called from a Theano Op's perform(), the parameters arrive as a
        # list containing the parameter array, so unwrap it first
        if isinstance(theta, list):
            theta = theta[0]
        (
            theta1,
            theta2,
            theta3,
            theta4,
            sigma,
        ) = theta
    
        m_m = data.min()
        m_M = data.max()
        y_pred = m_m + (m_M - m_m) * (
            1.0 / (1 + np.exp(-theta1 * (t - theta2)))
            + 1.0 / (1 + np.exp(theta3 * (t - theta4)))
            - 1
        )
    
        logp = -len(data) * np.log(np.sqrt(2.0 * np.pi) * sigma)
        logp += -np.sum((data - y_pred) ** 2.0) / (2.0 * sigma ** 2.0)
        return logp
    
    
    def der_log_likelihood(theta, data, t):
        def lnlike(values):
            return log_likelihood(values, data, t)
    
        eps = np.sqrt(np.finfo(float).eps)
        grads = scipy.optimize.approx_fprime(theta[0], lnlike, eps * np.ones(len(theta)))
        return grads
    
    ## Wrapper classes to Theano-ize the log-likelihood and its gradient
    class Loglike(tt.Op):
        itypes = [tt.dvector]
        otypes = [tt.dscalar]
    
        def __init__(self, data, t):
            self.data = data
            self.t = t
            self.loglike_grad = LoglikeGrad(self.data, self.t)
    
        def perform(self, node, inputs, outputs):
            logp = log_likelihood(inputs, self.data, self.t)
            outputs[0][0] = np.array(logp)
    
        def grad(self, inputs, grad_outputs):
            (theta,) = inputs
            grads = self.loglike_grad(theta)
            return [grad_outputs[0] * grads]
    
    
    class LoglikeGrad(tt.Op):
        itypes = [tt.dvector]
        otypes = [tt.dvector]
    
        def __init__(self, data, t):
            self.der_likelihood = der_log_likelihood
            self.data = data
            self.t = t
    
        def perform(self, node, inputs, outputs):
            (theta,) = inputs
            grads = self.der_likelihood(inputs, self.data, self.t)
            outputs[0][0] = grads
    
    ## Sample!
    loglike = Loglike(data, t)
    with pm.Model() as model:
        theta1 = pm.Normal("theta_1", mu=0, sigma=1, testval=0.1)
        theta3 = pm.Normal("theta_3", mu=0, sigma=1, testval=0.1)
        theta2 = pm.DiscreteUniform("theta_2", lower=1, upper=180, testval=120)
        theta4 = pm.DiscreteUniform("theta_4", lower=181, upper=365, testval=250)
        sigma = pm.HalfNormal("sigma", sigma=0.6, testval=0.01)
        # convert parameters to a tensor vector
        theta = tt.as_tensor_variable([theta1, theta2, theta3, theta4, sigma])
        # Do not use a DensityDist!
        pm.Potential("like", loglike(theta))
    
    with model:
        trace = pm.sample(step=pm.NUTS())
        print(pm.summary(trace).to_string())
    

    Happily, the above code results in

    Multiprocess sampling (2 chains in 2 jobs)
    CompoundStep
    >NUTS: [sigma, theta_3, theta_1]
    >CompoundStep
    >>Metropolis: [theta_4]
    >>Metropolis: [theta_2]
    
    100.00% [4000/4000 00:16<00:00 Sampling 2 chains, 0 divergences]
    
    Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 17 seconds.
    The number of effective samples is smaller than 25% for some parameters.
    
                mean     sd   hdi_3%  hdi_97%  mcse_mean  mcse_sd  ess_mean  ess_sd  ess_bulk  ess_tail  r_hat
    theta_1    0.073  0.019    0.049    0.095      0.001    0.001     258.0   230.0     641.0     305.0   1.01
    theta_3    0.052  0.010    0.037    0.070      0.000    0.000     404.0   361.0     681.0     379.0   1.01
    theta_2  104.616  3.004  100.000  110.000      0.178    0.126     285.0   283.0     307.0     312.0   1.01
    theta_4  270.178  3.139  264.000  275.000      0.147    0.104     455.0   455.0     454.0     606.0   1.01
    sigma      0.040  0.013    0.020    0.063      0.001    0.001     324.0   298.0     410.0     422.0   1.01
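
    As a quick sanity check (not part of the answer above), you can plot the posterior-mean fit against the observations and look at the trace with ArviZ. A minimal sketch, assuming the trace, model, data and t defined in the code above:

    import matplotlib.pyplot as plt

    # Posterior means as a rough point estimate (names refer to the model above)
    th1, th2, th3, th4 = (
        trace[name].mean() for name in ["theta_1", "theta_2", "theta_3", "theta_4"]
    )
    m_m, m_M = data.min(), data.max()
    t_grid = np.linspace(1, 365, 365)
    y_fit = m_m + (m_M - m_m) * (
        1.0 / (1 + np.exp(-th1 * (t_grid - th2)))
        + 1.0 / (1 + np.exp(th3 * (t_grid - th4)))
        - 1
    )

    plt.plot(t, data, "o", label="observations")
    plt.plot(t_grid, y_fit, label="posterior-mean fit")
    plt.legend()
    plt.show()

    # Trace diagnostics with ArviZ (already imported as az)
    az.plot_trace(az.from_pymc3(trace, model=model))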