I have a table of counts of binary outcomes and I would like to fit a beta binomial distribution to estimate $\alpha$ and $\beta$ parameters, but I am getting errors when I try to fit/sample the model distribution the way I do for other cases:
import pymc3 as pm
import pandas as pd
df = pd.read_csv('~/data.csv', low_memory=False)
df = df[df.Clicks >= 0]
C0=df.C.values
I0=df.N.values
N0 = C0 + I0
with pm.Model() as model:
C=pm.constant(C0)
I=pm.constant(I0)
C1=pm.constant(C0 + 1)
I1=pm.constant(I0 + 1)
N=pm.constant(N0)
alpha = pm.Exponential('alpha', 1/(C0.sum()+1))
beta = pm.Exponential('beta', 1/(I0.sum()+1))
obs = pm.BetaBinomial('obs', alpha, beta, N, observed=C0)
with model:
advi_fit = pm.variational.advi(n=int(1e4))
trace1 = pm.variational.sample_vp(advi_fit, draws=int(1e4))
pm.traceplot(trace1[::10])
with model:
step = pm.NUTS()
#step = pm.Metropolis() # <== same problem
trace2 = pm.sample(int(1e3), step)
pm.traceplot(trace2[::10])
In both cases the sampling fails with:
MissingInputError: ("An input of the graph, used to compute Elemwise{neg,no_inplace}(P_logodds_), was not provided and not given a value.Use the Theano flag exception_verbosity='high',for more information on this error.", P_logodds
In the advi
case the full stack trace is:
MissingInputError Traceback (most recent call last)
<ipython-input-46-8947c7c798e5> in <module>()
----> 1 import codecs, os;__pyfile = codecs.open('''/tmp/py7996Jip''', encoding='''utf-8''');__code = __pyfile.read().encode('''utf-8''');__pyfile.close();os.remove('''/tmp/py7996Jip''');exec(compile(__code, '''/home/dmahler/Scripts/adops-bayes2.py''', 'exec'));
/home/dmahler/Scripts/adops-bayes2.py in <module>()
59 advi_fit = pm.variational.advi(n=int(J*6.4e4), learning_rate=1e-3/J, epsilon=1e-8, accurate_elbo=False)
60 #advi_fit = pm.variational.advi_minibatch(minibatch_RVs=[alpha, beta, p], minibatch_tensors=[C,I,N])
---> 61 trace = pm.variational.sample_vp(advi_fit, draws=int(2e4))
62
63 pm.traceplot(trace[::10])
/home/dmahler/Anaconda/lib/python2.7/site-packages/pymc3/variational/advi.pyc in sample_vp(vparams, draws, model, local_RVs, random_seed, hide_transformed)
317
318 varnames = [str(var) for var in model.unobserved_RVs]
--> 319 trace = NDArray(model=model, vars=vars_sampled)
320 trace.setup(draws=draws, chain=0)
321
/home/dmahler/Anaconda/lib/python2.7/site-packages/pymc3/backends/ndarray.pyc in __init__(self, name, model, vars)
21 """
22 def __init__(self, name=None, model=None, vars=None):
---> 23 super(NDArray, self).__init__(name, model, vars)
24 self.draw_idx = 0
25 self.draws = None
/home/dmahler/Anaconda/lib/python2.7/site-packages/pymc3/backends/base.pyc in __init__(self, name, model, vars)
34 self.vars = vars
35 self.varnames = [var.name for var in vars]
---> 36 self.fn = model.fastfn(vars)
37
38
/home/dmahler/Anaconda/lib/python2.7/site-packages/pymc3/model.pyc in fastfn(self, outs, mode, *args, **kwargs)
374 Compiled Theano function as point function.
375 """
--> 376 f = self.makefn(outs, mode, *args, **kwargs)
377 return FastPointFunc(f)
378
/home/dmahler/Anaconda/lib/python2.7/site-packages/pymc3/memoize.pyc in memoizer(*args, **kwargs)
12
13 if key not in cache:
---> 14 cache[key] = obj(*args, **kwargs)
15
16 return cache[key]
/home/dmahler/Anaconda/lib/python2.7/site-packages/pymc3/model.pyc in makefn(self, outs, mode, *args, **kwargs)
344 on_unused_input='ignore',
345 accept_inplace=True,
--> 346 mode=mode, *args, **kwargs)
347
348 def fn(self, outs, mode=None, *args, **kwargs):
/home/dmahler/Anaconda/lib/python2.7/site-packages/theano/compile/function.pyc in function(inputs, outputs, mode, updates, givens, no_default_updates, accept_inplace, name, rebuild_strict, allow_input_downcast, profile, on_unused_input)
318 on_unused_input=on_unused_input,
319 profile=profile,
--> 320 output_keys=output_keys)
321 # We need to add the flag check_aliased inputs if we have any mutable or
322 # borrowed used defined inputs
/home/dmahler/Anaconda/lib/python2.7/site-packages/theano/compile/pfunc.pyc in pfunc(params, outputs, mode, updates, givens, no_default_updates, accept_inplace, name, rebuild_strict, allow_input_downcast, profile, on_unused_input, output_keys)
477 accept_inplace=accept_inplace, name=name,
478 profile=profile, on_unused_input=on_unused_input,
--> 479 output_keys=output_keys)
480
481
/home/dmahler/Anaconda/lib/python2.7/site-packages/theano/compile/function_module.pyc in orig_function(inputs, outputs, mode, accept_inplace, name, profile, on_unused_input, output_keys)
1774 profile=profile,
1775 on_unused_input=on_unused_input,
-> 1776 output_keys=output_keys).create(
1777 defaults)
1778
/home/dmahler/Anaconda/lib/python2.7/site-packages/theano/compile/function_module.pyc in __init__(self, inputs, outputs, mode, accept_inplace, function_builder, profile, on_unused_input, fgraph, output_keys)
1426 # OUTPUT VARIABLES)
1427 fgraph, additional_outputs = std_fgraph(inputs, outputs,
-> 1428 accept_inplace)
1429 fgraph.profile = profile
1430 else:
/home/dmahler/Anaconda/lib/python2.7/site-packages/theano/compile/function_module.pyc in std_fgraph(input_specs, output_specs, accept_inplace)
175
176 fgraph = gof.fg.FunctionGraph(orig_inputs, orig_outputs,
--> 177 update_mapping=update_mapping)
178
179 for node in fgraph.apply_nodes:
/home/dmahler/Anaconda/lib/python2.7/site-packages/theano/gof/fg.pyc in __init__(self, inputs, outputs, features, clone, update_mapping)
169
170 for output in outputs:
--> 171 self.__import_r__(output, reason="init")
172 for i, output in enumerate(outputs):
173 output.clients.append(('output', i))
/home/dmahler/Anaconda/lib/python2.7/site-packages/theano/gof/fg.pyc in __import_r__(self, variable, reason)
358 # Imports the owners of the variables
359 if variable.owner and variable.owner not in self.apply_nodes:
--> 360 self.__import__(variable.owner, reason=reason)
361 if (variable.owner is None and
362 not isinstance(variable, graph.Constant) and
/home/dmahler/Anaconda/lib/python2.7/site-packages/theano/gof/fg.pyc in __import__(self, apply_node, check, reason)
472 "for more information on this error."
473 % str(node)),
--> 474 r)
475
476 for node in new_nodes:
MissingInputError: ("An input of the graph, used to compute Elemwise{neg,no_inplace}(P_logodds_), was not provided and not given a value.Use the Theano flag exception_verbosity='high',for more information on this error.", P_logodds_)
> /home/dmahler/Anaconda/lib/python2.7/site-packages/theano/gof/fg.py(474)__import__()
472 "for more information on this error."
473 % str(node)),
--> 474 r)
475
476 for node in new_nodes:
Before I was aware of pymc3.BetaBinomial
I was fitting trying to achieve the same result using separate Beta
and Binomial
instances:
with pm.Model() as model:
C=pm.constant(C0)
I=pm.constant(I0)
C1=pm.constant(C0 + 1)
I1=pm.constant(I0 + 1)
N=pm.constant(N0)
alpha = pm.Exponential('alpha', 1/(C0.sum()+1))
beta = pm.Exponential('beta', 1/(I0.sum()+1))
p = pm.Beta('P', alpha, beta, shape=K)
b = pm.Binomial('B', N, p, observed=C0)
This completes successfully but different methods produce rather different results. I thought this could be partly due to the extra level of indirection between the priors and the observations makes the search space larger. When I came across BetaBinomial
I figured it would make the search easier as well as being the right thing ^TM. Otherwise I believe the to models should be logically equivalent. Unfortunatelly I cannot figure out how to make batebinomial
work and I am unable to find any examples using BetaBinomial
on the internet.
BetaBinomial
modwel work?Your model should run, and you can write this
with pm.Model() as model:
alpha = pm.Exponential('alpha', 1/(C0.sum()+1))
beta = pm.Exponential('beta', 1/(I0.sum()+1))
obs = pm.BetaBinomial('obs', alpha, beta, N0, observed=C0)
That is, (C, I C1, I1) were defined in you model but not used. Anyway that was not the problem. The error you are getting is because PyMC3 is expecting a variable P
(like in the second model you have) but the variable is not defined. May be you are working with a Jupyter notebook and you delete/comment a theano variable. Try running the Notebook again.
Theoretically using a Beta and a Binomial or a BetaBinomial should give the same results. From a practical point of view. Sampling from the BetaBinomial should be faster than from a beta and a binomial, since part of the work has already being done analytically!
Assuming a correct sampling both models should provide equivalent results. To check both results are equivalent try increasing the number of samples (and avoid thinning). Also compare the results between and within models (the differences should be about the same). If you don't need to estimate the P
variable (the beta distribution), then use the BetaBinomial.