I am having a lot of fun learning the ropes of modeling in Stan. Right now I'm wrestling with my model of a mixed between- and within-subjects factorial experimental design. There are different groups of subjects, and each subject indicates how much they expect each of three different beverages (water, decaf, and coffee) to reduce their caffeine withdrawal. The outcome variable, expectancy of withdrawal reduction, was measured on a Visual Analog Scale from 0 to 10, with 0 indicating no expectation of withdrawal reduction and 10 indicating a very high expectation of withdrawal reduction. I want to test whether there are between-group differences in the expected withdrawal-reduction potential of the three different beverages.
Here is the data
df <- data.frame(id = rep(1:46, each = 3),
group = c(3,3,3,1,1,1,3,3,3,1,1,1,3,3,3,1,1,1,3,3,3,2,2,2,1,1,1,3,3,3,3,3,3,2,2,2,3,3,3,1,1,1,2,2,2,3,3,3,2,2,2,2,2,2,3,3,3,1,1,1,2,2,2,3,3,3,2,2,2,3,3,3,3,3,3,2,2,2,3,3,3,3,3,3,1,1,1,3,3,3,3,3,3,1,1,1,2,2,2,2,2,2,1,1,1,2,2,2,2,2,2,1,1,1,1,1,1,2,2,2,2,2,2,1,1,1,1,1,1,3,3,3,1,1,1,3,3,3),
bevType = rep(c(3,2,1), times = 46),
score = c(2.9,1.0,0.0,9.5,5.0,4.5,9.0,3.0,5.0,5.0,0.0,3.0,9.5,2.0,3.0,8.5,0.0,6.0,5.2,3.0,4.0,8.4,7.0,2.0,10.0,0.0,3.0,7.3,1.0,1.8,8.5,2.0,9.0,10.0,5.0,10.0,8.3,2.0,5.0,6.0,0.0,5.0,6.0,0.0,5.0,10.0,0.0,5.0,6.8,1.0,4.8,8.0,1.0,4.0,7.0,4.0,6.0,6.5,1.0,3.1,9.0,1.0,0.0,6.0,0.0,2.0,9.5,4.0,6.0,8.0,1.0,3.8,0.4,0.0,7.0,7.0,0.0,3.0,9.0,2.0,5.0,9.5,2.0,7.0,7.9,5.0,4.9,8.0,1.0,1.0,9.3,5.0,7.9,6.5,2.0,3.0,8.0,2.0,6.0,10.0,0.0,5.0,6.0,0.0,5.0,6.8,0.1,7.0,8.0,3.0,9.1,8.2,0.0,7.9,8.2,5.0,0.0,9.2,1.0,3.1,9.1,3.0,0.6,5.7,2.0,5.1,7.0,0.0,7.4,8.0,1.0,1.5,9.1,4.0,4.3,8.5,8.0,5.0))
Now for the model. The model has a grand mean parameter a, a categorical predictor representing group deflections from the grand mean (bGroup), a term for deflections of the different beverage types from the grand mean (bBev), a term for each subject's intercept (bSubj), and a term for the group-by-beverage interaction (bGxB). I also estimated separate noise parameters for each beverage type. To allow posterior predictive checks I drew from the joint posterior using the generated quantities block and the normal_rng function.
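In equation form (my notation; this just restates the likelihood that appears in the Stan code below), the model for observation i is

$$\text{score}_i \sim \text{Normal}\left(a + \text{bGroup}_{g[i]} + \text{bBev}_{b[i]} + \text{bSubj}_{s[i]} + \text{bGxB}_{g[i],\,b[i]},\ \sigma_{b[i]}\right)$$

where g[i], b[i], and s[i] are the group, beverage type, and subject for observation i, and the noise term sigma is indexed by beverage type.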
### Step 1: Put data into list
dList <- list(N = 138,
nSubj = 46,
nGroup = 3,
nBev = 3,
sIndex = df$id,
gIndex = df$group,
bIndex = df$bevType,
score = df$score,
gMean = 4.718841,
gSD = 3.17)
### Step 2: Write the model
write("
data{
int<lower=1> N;
int<lower=1> nSubj;
int<lower=1> nGroup;
int<lower=1> nBev;
int<lower=1,upper=nSubj> sIndex[N];
int<lower=1,upper=nGroup> gIndex[N];
int<lower=1,upper=nBev> bIndex[N];
real score[N];
real gMean;
real gSD;
}
parameters{
real a;
vector[nSubj] bSubj;
vector[nGroup] bGroup;
vector[nBev] bBev;
vector[nBev] bGxB[nGroup]; // array of vectors for the group x beverage interaction
vector<lower=0>[nBev] sigma; // separate noise SD for each beverage type
real<lower=0> sigma_s;
real<lower=0> sigma_g;
real<lower=0> sigma_b;
real<lower=0> sigma_gb;
}
model{
vector[N] mu;
//hyper-priors
sigma_s ~ normal(0,10);
sigma_g ~ normal(0,10);
sigma_b ~ normal(0,10);
sigma_gb ~ normal(0,10);
//priors
sigma ~ cauchy(0,1);
a ~ normal(gMean, gSD);
bSubj ~ normal(0, sigma_s);
bGroup ~ normal(0,sigma_g);
bBev ~ normal(0,sigma_b);
for (i in 1:nGroup) { //hierarchical prior on interaction
bGxB[i] ~ normal(0, sigma_gb);
}
// likelihood
for (i in 1:N){
score[i] ~ normal(a + bGroup[gIndex[i]] + bBev[bIndex[i]] + bSubj[sIndex[i]] + bGxB[gIndex[i]][bIndex[i]], sigma[bIndex[i]]);
}
}
generated quantities{
real y_draw[N];
for (i in 1:N) {
y_draw[i] = normal_rng(a + bGroup[gIndex[i]] + bBev[bIndex[i]] + bSubj[sIndex[i]] + bGxB[gIndex[i]][bIndex[i]], sigma[bIndex[i]]);
}
}
", file = "temp.stan")
### Step 3: Generate the chains
mod <- stan(file = "temp.stan",
data = dList,
iter = 5000,
warmup = 3000,
cores = 1,
chains = 1)
### Step 4: Write a function to calculate the HPDI
Next we extract the draws from the joint posterior and generate estimates of the group means and the lower and upper 95% HPDI limits. First we need a function to calculate the HPDI:
HPDIFunct <- function (vector) {
  sortVec <- sort(vector)
  # number of draws that make up a 95% interval
  ninetyFiveVec <- ceiling(.95 * length(sortVec))
  # number of candidate intervals of that width
  fiveVec <- length(sortVec) - ninetyFiveVec
  # width of each candidate interval; the HPDI is the narrowest one
  diffVec <- sapply(1:fiveVec, function (i) sortVec[i + ninetyFiveVec] - sortVec[i])
  minVal <- sortVec[which.min(diffVec)]
  maxVal <- sortVec[which.min(diffVec) + ninetyFiveVec]
  return(list(sortVec, minVal, maxVal))
}
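As a quick sanity check of this helper (my addition, not part of the original workflow), it should roughly recover the central 95% interval for a symmetric sample:

set.seed(1)
x <- rnorm(4000)
hp <- HPDIFunct(x)
c(lower = hp[[2]], upper = hp[[3]]) # for a standard normal, roughly -1.96 and 1.96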
### Step 5: Posterior predictive checks
Now to extract the draws from the posterior:
y_draw <- data.frame(extract(mod, pars = "y_draw"))
And compute the mean and the lower and upper HPDI limits of these draws, to plot against the actual data.
df$drawMean <- apply(y_draw, 2, mean)
df$HPDI_Low <- apply(y_draw, 2, function(i) HPDIFunct(i)[[2]][1])
df$HPDI_Hi <- apply(y_draw, 2, function(i) HPDIFunct(i)[[3]][1])
### Step 6: Plot posterior draws against actual data
library(ggplot2)

ggplot(df, aes(x = factor(bevType), colour = factor(group))) +
geom_jitter(aes(y = score), shape = 1, position = position_dodge(width=0.9)) +
geom_point(aes(y = drawMean), position = position_dodge(width=0.9), stat = "summary", fun.y = "mean", shape = 3, size = 3, stroke = 2) +
geom_point(aes(y = HPDI_Low), position = position_dodge(width=0.9), stat = "summary", fun.y = "mean", shape = 1, size = 3, stroke = 1) +
geom_point(aes(y = HPDI_Hi), position = position_dodge(width=0.9), stat = "summary", fun.y = "mean", shape = 1, size = 3, stroke = 1) +
scale_colour_manual(name = "Experimental Group", labels = c("Group 1", "Group 2", "Group 3"), values = c("#616a6b", "#00AFBB", "#E7B800")) +
scale_x_discrete(labels = c("Water", "Decaf", "Coffee")) +
labs(x = "Beverage Type", y = "Expectancy of Withdrawal Alleviation") +
scale_y_continuous(breaks = seq(0,10,2)) +
theme(axis.text.x = element_text(size = 12),
axis.title.x = element_text(face = "bold"),
axis.title.y = element_text(face = "bold"),
axis.text.y = element_text(size = 12),
legend.title = element_text(size = 13, face = "bold"))
Looking at the graph, for Water expectancies the model seems to represent the centre (crosses) and spread (open circles) of the data quite well. But this breaks down for the Decaf and Coffee expectancies. For Decaf expectancies the lower HPDI limit is below the range of possible values (lower limit = 0) and the spread of the draws from the posterior (represented in each group by the open circles) is too large. For Coffee expectancies the upper HPDI limit is above the range of the data (upper limit = 10) and the spread is too large for the actual data.
So my question is:
How do I constrain the draws from the joint posterior to the actual range of the data?
Is there some sort of brute-force way to constrain the draws from the posterior in Stan? Or would a more adaptable estimation of differences in the variance across the three beverage conditions be more effective (in which case this would be more of a CV question than a SO question)?
To expand on @Bob Carpenter's answer, here are two ways you could approach the problem. (I've had cause to use both of these recently and struggled to get them up and running. This may be useful to other beginners like me.)
The first approach is ordered logistic regression. We assume that each subject has a "true" expectancy for each beverage on an arbitrary continuous scale, and we model it as a latent variable. If the subject's actual responses fall into K categories, we also model K - 1 cutpoints between those categories. The probability that the subject selects a given response category is the area under the logistic PDF between the relevant cutpoints.
The Stan model looks like this. The main difference is that the model fits an additional ordered vector of cutpoints and uses the ordered_logistic distribution. (I also changed the priors on the sigmas to half-Cauchy; it's their <lower=0> declarations that keep them positive. And I switched to a non-centered parameterization. But those changes are independent of the question at hand.) Edit: I also added inputs for new (hypothetical) observations about which we want to make predictions, plus a new generated quantity for those predictions.
data {
// the real data
int<lower=1> N;
int<lower=1> nSubj;
int<lower=1> nGroup;
int<lower=1> nBev;
int minResponse;
int maxResponse;
int<lower=1,upper=nSubj> sIndex[N];
int<lower=1,upper=nGroup> gIndex[N];
int<lower=1,upper=nBev> bIndex[N];
int<lower=minResponse,upper=maxResponse> score[N];
// hypothetical observations for new predictions
int<lower=1> nNewPred;
int<lower=0> nNewSubj;
int<lower=0> nNewGroup;
int<lower=0> nNewBev;
int<lower=1,upper=nSubj+nNewSubj> sNewIndex[nNewPred];
int<lower=1,upper=nGroup+nNewGroup> gNewIndex[nNewPred];
int<lower=1,upper=nBev+nNewBev> bNewIndex[nNewPred];
}
parameters {
real a;
vector[nSubj] bSubj;
vector[nGroup] bGroup;
vector[nBev] bBev;
vector[nBev] bGxB[nGroup];
real<lower=0> sigma_s;
real<lower=0> sigma_g;
real<lower=0> sigma_b;
real<lower=0> sigma_gb;
ordered[maxResponse - minResponse] cutpoints;
}
model {
// hyper-priors
sigma_s ~ cauchy(0, 1);
sigma_g ~ cauchy(0, 1);
sigma_b ~ cauchy(0, 1);
sigma_gb ~ cauchy(0, 1);
// priors
a ~ std_normal();
bSubj ~ std_normal();
bGroup ~ std_normal();
bBev ~ std_normal();
for (i in 1:nGroup) {
bGxB[i] ~ std_normal();
}
// likelihood
for(i in 1:N) {
score[i] ~ ordered_logistic(a +
(bGroup[gIndex[i]] * sigma_g) +
(bBev[bIndex[i]] * sigma_b) +
(bSubj[sIndex[i]] * sigma_s) +
(bGxB[gIndex[i]][bIndex[i]] * sigma_gb),
cutpoints);
}
}
generated quantities {
real y_draw[N];
real y_new_pred[nNewPred];
vector[nGroup+nNewGroup] bNewGroup;
vector[nBev+nNewBev] bNewBev;
vector[nSubj+nNewSubj] bNewSubj;
vector[nBev+nNewBev] bNewGxB[nGroup+nNewGroup];
// generate posterior predictions for the real data
for (i in 1:N) {
y_draw[i] = ordered_logistic_rng(a +
(bGroup[gIndex[i]] * sigma_g) +
(bBev[bIndex[i]] * sigma_b) +
(bSubj[sIndex[i]] * sigma_s) +
(bGxB[gIndex[i]][bIndex[i]] * sigma_gb),
cutpoints);
}
// generate predictions for the new observations
for (i in 1:(nGroup+nNewGroup)) {
if (i <= nGroup) { bNewGroup[i] = bGroup[i]; }
else { bNewGroup[i] = normal_rng(0, 1); }
}
for (i in 1:(nBev+nNewBev)) {
if (i <= nBev) { bNewBev[i] = bBev[i]; }
else { bNewBev[i] = normal_rng(0, 1); }
}
for (i in 1:(nSubj+nNewSubj)) {
if (i <= nSubj) { bNewSubj[i] = bSubj[i]; }
else { bNewSubj[i] = normal_rng(0, 1); }
}
// first index is group, second is beverage
for (i in 1:(nGroup+nNewGroup)) {
for (j in 1:(nBev+nNewBev)) {
if (i <= nGroup && j <= nBev) { bNewGxB[i][j] = bGxB[i][j]; }
else { bNewGxB[i][j] = normal_rng(0, 1); }
}
}
for (i in 1:nNewPred) {
y_new_pred[i] = ordered_logistic_rng(a +
(bNewGroup[gNewIndex[i]] * sigma_g) +
(bNewBev[bNewIndex[i]] * sigma_b) +
(bNewSubj[sNewIndex[i]] * sigma_s) +
(bNewGxB[gNewIndex[i]][bNewIndex[i]] * sigma_gb),
cutpoints);
}
}
It looks like responses in your dataset are recorded to the nearest tenth, so that gives us 101 possible categories between 0 and 10. To keep everything as Stan-friendly integers, we can multiply all the responses by 10. (I also added one to each response, because I had trouble fitting the model when one of the possible categories was zero.) Edit: Added new test data for a hypothetical "subject 47": one observation per group for each of decaf and coffee, the two beverages we'll compare below.
library(tidyverse) # dplyr, tidyr, and tibble are used below

new.pred.obs = expand.grid(group = 1:3, bevType = 2:3) %>%
mutate(id = max(df$id) + 1)
dList <- list(N = 138,
nSubj = 46,
nGroup = 3,
nBev = 3,
minResponse = 1,
maxResponse = 101,
sIndex = df$id,
gIndex = df$group,
bIndex = df$bevType,
score = (df$score * 10) + 1,
nNewPred = nrow(new.pred.obs),
nNewSubj = 1,
nNewGroup = 0,
nNewBev = 0,
sNewIndex = new.pred.obs$id,
gNewIndex = new.pred.obs$group,
bNewIndex = new.pred.obs$bevType)
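One small caveat about the rescaling (my addition): multiplying the 0-10 scores by 10 can leave floating-point noise, and Stan expects exact integers for int data, so it can be safer to round explicitly:

score.int <- as.integer(round(df$score * 10)) + 1
range(score.int) # should be 1 to 101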
After we extract y_draw, we can convert it back to the original scale:
y_draw <- (data.frame(extract(mod, pars = "y_draw")) - 1) / 10
Everything else is the same as before. Now the posterior predictions are correctly confined to [0, 10].
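A quick check of that claim (my addition):

range(y_draw) # all posterior predictive draws now fall between 0 and 10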
To draw inferences on the original scale about differences between beverages, we can use the predictions for our hypothetical data. For each sample, we have one predicted output for a new subject in each group/beverage combination. We can compare the "coffee" vs. "decaf" responses within each sample and group:
# Get predictions for hypothetical observations
new.preds.df = data.frame(rstan::extract(mod, pars = "y_new_pred")) %>%
rownames_to_column("sample") %>%
gather(obs, pred, -sample) %>%
mutate(obs = gsub("y_new_pred\\.", "", obs),
pred = (pred - 1) / 10) %>%
inner_join(new.pred.obs %>%
rownames_to_column("obs") %>%
mutate(bevType = paste("bev", bevType, sep = ""),
group = paste("Group", group)),
by = c("obs")) %>%
select(-obs) %>%
spread(bevType, pred) %>%
mutate(bevTypeDiff = bev3 - bev2)
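From here, one natural summary (my addition, using the new.preds.df built above) is the posterior mean and 95% interval of the coffee-minus-decaf difference within each group:

new.preds.df %>%
  group_by(group) %>%
  summarize(mean.diff = mean(bevTypeDiff),
            lower.95 = quantile(bevTypeDiff, 0.025),
            upper.95 = quantile(bevTypeDiff, 0.975))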
(Alternatively, we could have done this prediction for new observations in R, or in a separate Stan model; see here for examples of how this could be done.)
Once we get up to 101 response categories, calling these possibilities discrete categories seems a little strange. It feels more natural to say, as your original model tried to do, that we're capturing a continuous outcome that happens to be bounded between 0 and 10. Also, in ordered logistic regression, the response categories don't have to be regularly spaced with respect to the latent variable. (This is a feature, not a bug; for example, for Likert responses, there's no guarantee that the difference between "Strongly agree" and "Agree" is the same as the difference between "Agree" and "Neither agree nor disagree".) As a result, it's difficult to say anything about the "distance" a particular factor moves a response on the original scale (as opposed to the scale of the latent variable). But the cutpoints inferred by the model above are pretty regularly spaced, which again suggests that the outcome in your dataset is already reasonably scale-like:
# Get the sampled parameters
sampled.params.df = data.frame(as.array(mod)[,1,]) %>%
select(-matches("y_draw")) %>%
rownames_to_column("iteration")
# Plot selected cutpoints
sampled.params.df %>%
select(matches("cutpoints")) %>%
gather(cutpoint, value) %>%
mutate(cutpoint.num = as.numeric(gsub("^cutpoints\\.([0-9]+)\\.$", "\\1", cutpoint))) %>%
group_by(cutpoint.num) %>%
summarize(mean.value = mean(value),
lower.95 = quantile(value, 0.025),
lower.50 = quantile(value, 0.25),
upper.50 = quantile(value, 0.75),
upper.95 = quantile(value, .975)) %>%
ggplot(aes(x = cutpoint.num, y = mean.value)) +
geom_point(size = 3) +
geom_linerange(aes(ymin = lower.95, ymax = upper.95)) +
geom_linerange(aes(ymin = lower.50, ymax = upper.50), size = 2) +
scale_x_continuous("cutpoint", breaks = seq(0, 100, 10)) +
scale_y_continuous("") +
theme_bw()
(Thick and thin lines represent 50% and 95% intervals, respectively. I'm enjoying the little "jump" every 10 cutpoints, which suggests subjects treated, say, 5.9 vs. 6.0 as a larger difference than 5.8 vs. 5.9. But the effect seems to be quite mild. The scale also seems to stretch out a bit towards the high end, but again, it's not too drastic.)
The second approach: for a continuous outcome in a bounded interval, we can use the beta distribution; see here and here for further discussion.
For the beta distribution, we need two parameters, mu and phi: mu (the mean) must lie in (0, 1), and phi (the precision) must be positive. In this example, I let mu be unbounded on the linear-predictor scale and applied inv_logit before feeding it into the beta distribution; I constrained phi to be positive and gave it a half-Cauchy prior. But you could do it in any number of ways. I also coded a full set of mu parameters but only a single phi; again, you can experiment with other options.
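As a quick illustration of that parameterization (my addition): with shape parameters alpha = mu * phi and beta = (1 - mu) * phi, the Beta(alpha, beta) distribution has mean mu and variance mu * (1 - mu) / (1 + phi).

mu <- 0.7
phi <- 10
draws <- rbeta(1e5, mu * phi, (1 - mu) * phi)
c(mean = mean(draws), variance = var(draws), theoretical.variance = mu * (1 - mu) / (1 + phi))

With that in mind, the Stan model for this approach looks like this: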
data {
int<lower=1> N;
int<lower=1> nSubj;
int<lower=1> nGroup;
int<lower=1> nBev;
int<lower=1,upper=nSubj> sIndex[N];
int<lower=1,upper=nGroup> gIndex[N];
int<lower=1,upper=nBev> bIndex[N];
vector<lower=0,upper=1>[N] score;
}
parameters {
real a;
real<lower=0> a_phi;
vector[nSubj] bSubj;
vector[nGroup] bGroup;
vector[nBev] bBev;
vector[nBev] bGxB[nGroup];
real<lower=0> sigma_s;
real<lower=0> sigma_g;
real<lower=0> sigma_b;
real<lower=0> sigma_gb;
}
model {
vector[N] mu;
//hyper-priors
sigma_s ~ cauchy(0, 1);
sigma_g ~ cauchy(0, 1);
sigma_b ~ cauchy(0, 1);
sigma_gb ~ cauchy(0, 1);
//priors
a ~ std_normal();
a_phi ~ cauchy(0, 1);
bSubj ~ std_normal();
bGroup ~ std_normal();
bBev ~ std_normal();
for (i in 1:nGroup) {
bGxB[i] ~ std_normal();
}
// likelihood
for(i in 1:N) {
mu[i] = a +
(bGroup[gIndex[i]] * sigma_g) +
(bBev[bIndex[i]] * sigma_b) +
(bSubj[sIndex[i]] * sigma_s) +
(bGxB[gIndex[i]][bIndex[i]] * sigma_gb);
score[i] ~ beta(inv_logit(mu[i]) * a_phi,
(1 - inv_logit(mu[i])) * a_phi);
}
}
generated quantities {
real y_draw[N];
real temp_mu;
for (i in 1:N) {
temp_mu = a +
(bGroup[gIndex[i]] * sigma_g) +
(bBev[bIndex[i]] * sigma_b) +
(bSubj[sIndex[i]] * sigma_s) +
(bGxB[gIndex[i]][bIndex[i]] * sigma_gb);
y_draw[i] = beta_rng(inv_logit(temp_mu) * a_phi,
(1 - inv_logit(temp_mu)) * a_phi);
}
}
The beta distribution is supported on (0, 1), so we divide the observed scores by 10. (The model also fails if we give it rescaled scores of exactly 0 or 1, i.e. raw scores of 0 or 10, so I converted those to 0.01 and 0.99, respectively.)
dList.beta <- list(N = 138,
nSubj = 46,
nGroup = 3,
nBev = 3,
sIndex = df$id,
gIndex = df$group,
bIndex = df$bevType,
score = ifelse(df$score == 0, 0.01,
ifelse(df$score == 10, 0.99,
df$score / 10)))
Undo the transformation when extracting y_draw, and then the procedure is the same as before.
y_draw.beta <- data.frame(extract(mod.beta, pars = "y_draw")) * 10
Once again, the posterior draws are correctly bounded.