I've just started reading through the PyMC3
documentation (I'm much more comfortable with sklearn
) and came across the Rugby hierarchical model example:
# Imports and Rugby data setup -- model in next section
import numpy as np
import pandas as pd
import pymc3 as pm
import theano.tensor as tt
import matplotlib.pyplot as plt
import seaborn as sns
games = [
['Wales', 'Italy', 23, 15],
['France', 'England', 26, 24],
['Ireland', 'Scotland', 28, 6],
['Ireland', 'Wales', 26, 3],
['Scotland', 'England', 0, 20],
['France', 'Italy', 30, 10],
['Wales', 'France', 27, 6],
['Italy', 'Scotland', 20, 21],
['England', 'Ireland', 13, 10],
['Ireland', 'Italy', 46, 7],
['Scotland', 'France', 17, 19],
['England', 'Wales', 29, 18],
['Italy', 'England', 11, 52],
['Wales', 'Scotland', 51, 3],
['France', 'Ireland', 20, 22],
]
columns = ['home_team', 'away_team', 'home_score', 'away_score']
df = pd.DataFrame(games, columns=columns)
teams = df.home_team.unique()
teams = pd.DataFrame(teams, columns=['team'])
teams['i'] = teams.index
df = pd.merge(df, teams, left_on='home_team', right_on='team', how='left')
df = df.rename(columns = {'i': 'i_home'}).drop('team', 1)
df = pd.merge(df, teams, left_on='away_team', right_on='team', how='left')
df = df.rename(columns = {'i': 'i_away'}).drop('team', 1)
observed_home_goals = df.home_score.values
observed_away_goals = df.away_score.values
home_team = df.i_home.values
away_team = df.i_away.values
num_teams = len(df.i_home.drop_duplicates())
num_games = len(home_team)
g = df.groupby('i_away')
att_starting_points = np.log(g.away_score.mean())
g = df.groupby('i_home')
def_starting_points = -np.log(g.away_score.mean())
Here's the main PyMC3
model setup:
with pm.Model() as model:
# Global model parameters
home = pm.Normal('home', 0, tau=.0001)
tau_att = pm.Gamma('tau_att', .1, .1)
tau_def = pm.Gamma('tau_def', .1, .1)
intercept = pm.Normal('intercept', 0, tau=.0001)
# Team-specific model parameters
atts_star = pm.Normal('atts_star', mu=0, tau=tau_att, shape=num_teams)
defs_star = pm.Normal('defs_star', mu=0, tau=tau_def, shape=num_teams)
atts = pm.Deterministic('atts', atts_star - tt.mean(atts_star))
defs = pm.Deterministic('defs', defs_star - tt.mean(defs_star))
home_theta = tt.exp(intercept + home + atts[home_team] + defs[away_team])
away_theta = tt.exp(intercept + atts[away_team] + defs[home_team])
# Likelihood of observed data
home_points = pm.Poisson('home_points', mu=home_theta, observed=observed_home_goals)
away_points = pm.Poisson('away_points', mu=away_theta, observed=observed_away_goals)
start = pm.find_MAP()
step = pm.NUTS(state=start)
trace = pm.sample(20000, step, init=start)
I know how to plot the trace
:
pm.traceplot(trace[5000:])
And generate posterior predictive samples:
ppc = pm.sample_ppc(trace[5000:], samples=500, model=model)
What I'm unsure of: how do I ask questions of the model/posterior?
For example, I'm assuming the distribution of scores for Wales vs Italy
matchups would be:
# Wales vs Italy is the first matchup in our dataset
home_wales = ppc['home_points'][:, 0]
away_italy = ppc['away_points'][:, 0]
But what about matchups that aren't recorded in the original data?
Thanks for any help/insights.
I’m fairly certain I was able to figure this out after reading through the PyMC3
Hierarchical Partial Pooling example. Answering the questions in order:
Yes, that is what the distribution for Wales vs Italy
matchups would be (since it’s the first game in the observed data).
In order to predict Italy vs France
(since the two teams did not play against each other in our original dataset), we would need prediction thetas
.
Here’s the code to the updated model:
# Setup the model similarly to the previous one...
with pm.Model() as model:
# Global model parameters
home = pm.Normal('home', 0, tau=.0001)
tau_att = pm.Gamma('tau_att', .1, .1)
tau_def = pm.Gamma('tau_def', .1, .1)
intercept = pm.Normal('intercept', 0, tau=.0001)
# Team-specific model parameters
atts_star = pm.Normal('atts_star', mu=0, tau=tau_att, shape=num_teams)
defs_star = pm.Normal('defs_star', mu=0, tau=tau_def, shape=num_teams)
atts = pm.Deterministic('atts', atts_star - tt.mean(atts_star))
defs = pm.Deterministic('defs', defs_star - tt.mean(defs_star))
home_theta = tt.exp(intercept + home + atts[home_team] + defs[away_team])
away_theta = tt.exp(intercept + atts[away_team] + defs[home_team])
# Likelihood of observed data
home_points = pm.Poisson('home_points', mu=home_theta, observed=observed_home_goals)
away_points = pm.Poisson('away_points', mu=away_theta, observed=observed_away_goals)
# Now for predictions with no games played...
with model:
# IDs from `teams` DataFrame
italy, france = 4, 1
# New `thetas` for Italy vs France predictions
pred_home_theta = tt.exp(intercept + home + atts[italy] + defs[france])
pred_away_theta = tt.exp(intercept + atts[france] + defs[italy])
pred_home_points = pm.Poisson('pred_home_points', mu=pred_home_theta)
pred_away_points = pm.Poisson('pred_away_points', mu=pred_away_theta)
# Sample the final model
with model:
start = pm.find_MAP()
step = pm.NUTS(state=start)
trace = pm.sample(20000, step, init=start)
Once the trace
is completed, we can plot the predictions:
# Use 5,000 as MCMC burn in
pred = pd.DataFrame({
"italy": trace["pred_home_points"][5000:],
"france": trace["pred_away_points"][5000:],
})
# Plot the distributions
sns.kdeplot(pred.italy, shade=True, label="Italy")
sns.kdeplot(pred.france, shade=True, label="France")
plt.show()
How often does Italy win at home?
# 19% of the time
(pred.italy > pred.france).mean()
How often do both teams score under 15?
# 0.7% of the time
1.0 * len(pred[(pred.italy <= 15) & (pred.france <= 15)]) / len(pred)