Search code examples
pythonpymcpymc3

Porting PyMC2 code to PyMC3 - hierarchical model for sports analytics


I tried the following code but ran into problems. I think `.values` is the problem, but how do I encode this as a Theano object?

The following is my data source

home_team,away_team,home_score,away_score
Wales,Italy,23,15
France,England,26,24
Ireland,Scotland,28,6
Ireland,Wales,26,3
Scotland,England,0,20
France,Italy,30,10
Wales,France,27,6
Italy,Scotland,20,21
England,Ireland,13,10
Ireland,Italy,46,7
Scotland,France,17,19
England,Wales,29,18
Italy,England,11,52
Wales,Scotland,51,3
France,Ireland,20,22

Here is the PyMC2 code, which works:

data_file = DATA_DIR + 'results_2014.csv'

# Load the match results: one row per game with team names and scores.
df = pd.read_csv(data_file, sep=',')
# Or whatever it takes to get this into a data frame.

# Build a lookup table that gives every team an integer index `i`.
# NOTE: the table is built from the home teams only, so it assumes every team
# appears at least once as a home team (true for the data shown above).
teams = df.home_team.unique()
teams = pd.DataFrame(teams, columns=['team'])
teams['i'] = teams.index

# Attach the integer index of the home and of the away team to each game.
# `axis=1` is passed by keyword: the positional form `drop('team', 1)` is
# rejected by pandas 2.0 and later.
df = pd.merge(df, teams, left_on='home_team', right_on='team', how='left')
df = df.rename(columns={'i': 'i_home'}).drop('team', axis=1)
df = pd.merge(df, teams, left_on='away_team', right_on='team', how='left')
df = df.rename(columns={'i': 'i_away'}).drop('team', axis=1)

# Plain NumPy arrays used by the model: scores and team indices per game.
observed_home_goals = df.home_score.values
observed_away_goals = df.away_score.values
home_team = df.i_home.values
away_team = df.i_away.values
num_teams = len(df.i_home.drop_duplicates())
num_games = len(home_team)

# Starting values for the team effects, taken from the log of the mean away
# score: grouped by away team for attack, and (negated) grouped by home team
# for defence.
g = df.groupby('i_away')
att_starting_points = np.log(g.away_score.mean())
g = df.groupby('i_home')
def_starting_points = -np.log(g.away_score.mean())

# Hyperpriors. Both Normals are centred on 0 with precision (tau) 1e-4; the
# Gamma priors on the attack/defence precisions use alpha = beta = 0.1.
home = pymc.Normal('home', mu=0, tau=.0001, value=0)
tau_att = pymc.Gamma('tau_att', alpha=.1, beta=.1, value=10)
tau_def = pymc.Gamma('tau_def', alpha=.1, beta=.1, value=10)
intercept = pymc.Normal('intercept', mu=0, tau=.0001, value=0)

# Team-specific parameters: one attack and one defence effect per team, each
# started at the corresponding entry of att_starting_points /
# def_starting_points.
atts_star = pymc.Normal("atts_star", mu=0, tau=tau_att, size=num_teams,
                        value=att_starting_points.values)
defs_star = pymc.Normal("defs_star", mu=0, tau=tau_def, size=num_teams,
                        value=def_starting_points.values)

# trick to code the sum to zero constraint
@pymc.deterministic
def atts(atts_star=atts_star):
    """Return the attack effects shifted so that their mean is zero.

    Subtracting the mean of ``atts_star`` from every element makes the
    returned effects sum to zero. The subtraction already produces a new
    array, so no explicit copy of ``atts_star`` is needed.
    """
    return atts_star - np.mean(atts_star)

@pymc.deterministic
def defs(defs_star=defs_star):
    """Return the defence effects shifted so that their mean is zero.

    Subtracting the mean of ``defs_star`` from every element makes the
    returned effects sum to zero. The subtraction already produces a new
    array, so no explicit copy of ``defs_star`` is needed.
    """
    return defs_star - np.mean(defs_star)

@pymc.deterministic
def home_theta(home_team=home_team,
               away_team=away_team,
               home=home,
               atts=atts,
               defs=defs,
               intercept=intercept):
    """Return the Poisson rate of the home side's score for every game.

    The log-rate is the intercept plus the home effect, the attack effect of
    the home team and the defence effect of the away team; ``home_team`` and
    ``away_team`` are used as indices into ``atts`` and ``defs``.
    """
    log_rate = intercept + home + atts[home_team] + defs[away_team]
    return np.exp(log_rate)

@pymc.deterministic
def away_theta(home_team=home_team,
               away_team=away_team,
               home=home,
               atts=atts,
               defs=defs,
               intercept=intercept):
    """Return the Poisson rate of the away side's score for every game.

    The log-rate is the intercept plus the attack effect of the away team and
    the defence effect of the home team. ``home`` is accepted as a parameter
    but does not enter the expression.
    """
    log_rate = intercept + atts[away_team] + defs[home_team]
    return np.exp(log_rate)

# Likelihood: the observed scores are modelled as Poisson draws around the
# per-game rates computed by home_theta / away_theta.
home_points = pymc.Poisson('home_points', mu=home_theta,
                           value=observed_home_goals, observed=True)
away_points = pymc.Poisson('away_points', mu=away_theta,
                           value=observed_away_goals, observed=True)

# Collect every node of the model into a single sampler.
mcmc = pymc.MCMC([
    home, intercept, tau_att, tau_def,
    home_theta, away_theta,
    atts_star, defs_star, atts, defs,
    home_points, away_points,
])

# Fit the MAP estimate before sampling (presumably so that the chain starts
# near the posterior mode).
map_ = pymc.MAP(mcmc)
map_.fit()

# 200,000 iterations, the first 40,000 discarded, every 20th sample kept.
mcmc.sample(iter=200000, burn=40000, thin=20)

Here is my attempt at porting this to PyMC3 :) I have included the data-wrangling code as well; I defined my own data directory, etc.

data_file = DATA_DIR + 'results_2014.csv'

# Load the match results: one row per game with team names and scores.
df = pd.read_csv(data_file, sep=',')
# Or whatever it takes to get this into a data frame.

# Build a lookup table that gives every team an integer index `i`.
# NOTE: the table is built from the home teams only, so it assumes every team
# appears at least once as a home team (true for the data shown above).
teams = df.home_team.unique()
teams = pd.DataFrame(teams, columns=['team'])
teams['i'] = teams.index

# Attach the integer index of the home and of the away team to each game.
# `axis=1` is passed by keyword: the positional form `drop('team', 1)` is
# rejected by pandas 2.0 and later.
df = pd.merge(df, teams, left_on='home_team', right_on='team', how='left')
df = df.rename(columns={'i': 'i_home'}).drop('team', axis=1)
df = pd.merge(df, teams, left_on='away_team', right_on='team', how='left')
df = df.rename(columns={'i': 'i_away'}).drop('team', axis=1)

# Plain NumPy arrays used by the model: scores and team indices per game.
observed_home_goals = df.home_score.values
observed_away_goals = df.away_score.values
home_team = df.i_home.values
away_team = df.i_away.values
num_teams = len(df.i_home.drop_duplicates())
num_games = len(home_team)

# Starting values for the team effects, taken from the log of the mean away
# score: grouped by away team for attack, and (negated) grouped by home team
# for defence.
g = df.groupby('i_away')
att_starting_points = np.log(g.away_score.mean())
g = df.groupby('i_home')
def_starting_points = -np.log(g.away_score.mean())

import theano.tensor as T
import pymc3 as pm3
#hyperpriors


# The arrays that the PyMC2 model passes as starting values (`value=...`).
x = att_starting_points.values
y = def_starting_points.values
# NOTE(review): `pm` is not imported in this snippet (PyMC3 is imported as
# `pm3`), and `model` is rebound by the `with` statement on the next line.
model = pm.Model()
with pm3.Model() as model:
    home3 = pm3.Normal('home', 0, .0001)
    tau_att3 = pm3.Gamma('tau_att', .1, .1)
    tau_def3 = pm3.Gamma('tau_def', .1, .1)
    intercept3 = pm3.Normal('intercept', 0, .0001)
    #team-specific parameters
    # NOTE(review): passing `observed=` turns these into observed variables
    # (the traceback reports an 'ObservedRV'), whereas the PyMC2 model treats
    # atts_star/defs_star as unknown parameters of size num_teams and only
    # uses these arrays as starting values.
    atts_star3 = pm3.Normal("atts_star", 
                        mu=0, 
                        tau=tau_att3, 
                        observed=x)
    defs_star3 = pm3.Normal("defs_star", 
                        mu=0, 
                        tau=tau_def3,  
                        observed=y) 
    #Seems to be the error here. 
    # NOTE(review): np.mean is NumPy's mean applied to a PyMC3 variable; this
    # is the call that raises the TypeError shown in the traceback. Both
    # Deterministics are also given the same name 'regression', and `defs` is
    # never defined in this snippet.
    atts = pm3.Deterministic('regression', 
    atts_star3 - np.mean(atts_star3))
    home_theta3 = pm3.Deterministic('regression', 
    T.exp(intercept3 + atts[away_team] + defs[home_team]))
# NOTE(review): the next two lines repeat the two statements above with
# inconsistent indentation (the first one is dedented out of the `with` block).
atts = pm3.Deterministic('regression', atts_star3 - np.mean(atts_star3))
    home_theta3 = pm3.Deterministic('regression', T.exp(intercept3 +     atts[away_team] + defs[home_team]))
    # Unknown model parameters
    # NOTE(review): both likelihoods use home_theta3, while the PyMC2 model has
    # a separate away_theta for the away score; the home_theta3 expression
    # itself matches the PyMC2 away_theta (no `home` term).
    home_points3 = pm3.Poisson('home_points', mu=home_theta3, observed=observed_home_goals)
    away_points3 = pm3.Poisson('away_points', mu=home_theta3, observed=observed_away_goals)
    # Find a MAP estimate and use it as the starting point for NUTS sampling.
    start = pm3.find_MAP()
    step = pm3.NUTS(state=start)
    trace = pm3.sample(2000, step, start=start, progressbar=True)

    pm3.traceplot(trace)

I get an error along the lines of "values isn't a Theano object". I think it comes from the `.values` part above, but I'm confused about how to convert this into a Theano tensor. The tensors are confusing me :)

Here is the full error, for clarity; I have evidently misunderstood something about the PyMC3 syntax.

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-71-ce51c1a64412> in <module>()
     23 
     24     #Seems to be the error here.
---> 25     atts = pm3.Deterministic('regression', atts_star3 - np.mean(atts_star3))
     26     home_theta3 = pm3.Deterministic('regression', T.exp(intercept3 + atts[away_team] + defs[home_team]))
     27 

/Users/peadarcoyle/anaconda/lib/python3.4/site-packages/numpy/core/fromnumeric.py in mean(a, axis, dtype, out, keepdims)
   2733 
   2734     return _methods._mean(a, axis=axis, dtype=dtype,
-> 2735                             out=out, keepdims=keepdims)
   2736 
   2737 def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):

/Users/peadarcoyle/anaconda/lib/python3.4/site-packages/numpy/core/_methods.py in _mean(a, axis, dtype, out, keepdims)
     71         ret = ret.dtype.type(ret / rcount)
     72     else:
---> 73         ret = ret / rcount
     74 
     75     return ret

TypeError: unsupported operand type(s) for /: 'ObservedRV' and 'int'

Solution

  • Here is my translation of your PyMC2 model:

    # Aliases used by this snippet.
    import pymc3 as pm
    import theano.tensor as tt

    with pm.Model() as model:
        # global model parameters
        # The precision is passed as `tau=` so that it is read as a precision
        # in every PyMC3 release: in later releases the second positional
        # argument of Normal is the standard deviation, not tau.
        home        = pm.Normal('home',      mu=0, tau=.0001)
        tau_att     = pm.Gamma('tau_att',   alpha=.1, beta=.1)
        tau_def     = pm.Gamma('tau_def',   alpha=.1, beta=.1)
        intercept   = pm.Normal('intercept', mu=0, tau=.0001)
    
        # team-specific model parameters
        # These are free parameters of length num_teams (`shape=`), not
        # observed data.
        atts_star   = pm.Normal("atts_star", 
                               mu   =0,
                               tau  =tau_att, 
                               shape=num_teams)
        defs_star   = pm.Normal("defs_star", 
                               mu   =0,
                               tau  =tau_def,  
                               shape=num_teams)
    
        # Sum-to-zero constraint: centre the team effects on their mean, using
        # the Theano mean (tt.mean) because the inputs are symbolic variables.
        atts        = pm.Deterministic('atts', atts_star - tt.mean(atts_star))
        defs        = pm.Deterministic('defs', defs_star - tt.mean(defs_star))
        # Per-game Poisson rates; only the home side gets the `home` effect.
        home_theta  = tt.exp(intercept + home + atts[home_team] + defs[away_team])
        away_theta  = tt.exp(intercept + atts[away_team] + defs[home_team])
    
        # likelihood of observed data
        home_points = pm.Poisson('home_points', mu=home_theta, observed=observed_home_goals)
        away_points = pm.Poisson('away_points', mu=away_theta, observed=observed_away_goals)
    

    The big difference, as I see it, between PyMC2 and 3 model building is that the whole business of initial values in PyMC2 is not included in model building in PyMC3. It is pushed off into the model fitting portion of the code.

    Here is a notebook that puts this model in context with your data and some fitting code: http://nbviewer.ipython.org/gist/aflaxman/55e23195fe0a0b089103