I am fitting a mixture model to estimate the average of a trait in each of 3 populations. I have a label switching issue and I am trying to compute the distance between the observed and expected numbers of individuals of each genotype in each population to relabel population clusters. Below is a reproducible example.
For some reasons, JAGS does not compute the square values for distance properly. The corresponding line in code below is: pow(DistNumPerClust[k,j], 2))
Hence, the output matrix results$mean$dist
is different from the matrix, results$mean$DistNumPerClust^2
, computed a posteriori.
Would anyone know a way to solve this?
library(R2jags)
library(runjags)
library(dirmult)
set.seed(123)
############################
## Simulation of the data ##
############################
npop=3
ngeno=2
freqbalance=1
nsamplesizeperpop <- 100
freqMLG <- t(rdirichlet(n=npop, alpha=rep(freqbalance, ngeno)))
samplesizegenoperpop <- sweep(freqMLG, 1, nsamplesizeperpop, "*")
## Compute membership (probability that a genotype comes from pop 1, 2 or 3)
## Genotype as rows and populations as columns
membership <- sweep(freqMLG, 1, rowSums(freqMLG), "/")
# Parameters for simulations
nind=90
N = npop*nind # nb of observations
clust <- rep(1:npop, each=N/npop)
geno <- c()
for (i in 1:N){
geno <- c(geno, sum(rmultinom(n=1, size=1, prob=freqMLG[, clust[i]])*1:ngeno))
}
numgeno <- as.numeric(table(geno))
## Multiply membership probabilities by sample size for each genotype
ExpNumPerClust <- sweep(membership, 1, numgeno, "*")
muOfClustsim <- c(1, 20, 50) # vector of population means
sigma <- 1.5 # residual sd
(tausim <- 1/(sigma*sigma)) # precision
# parameters are treated as data for the simulation step
data <- list(N=N, npop=npop, ngeno=ngeno, geno=geno, muOfClustsim=muOfClustsim, tausim=tausim, samplesizegenoperpop=samplesizegenoperpop)
## JAG model
txtstring <- "
data{
# Likelihood:
for (i in 1:N){
ysim[i] ~ dnorm(eta[i], tausim) # tau is precision (1 / variance)
eta[i] <- muOfClustsim[clust[i]]
clust[i] ~ dcat( pClust[geno[i], 1:npop] )
}
for (k in 1:ngeno){
pClust[k, 1:npop] ~ ddirch( samplesizegenoperpop[k,] )
}
}
model{
fake <- 0
}
"
# Simulate with jags
out <- run.jags(txtstring, data = data, monitor=c("ysim"), sample=1, n.chains=1, summarise=FALSE)
# reformat the outputs
ysim <- coda::as.mcmc(out)[1:N]
## Estimation model
bayes.mod <- function(){
# Likelihood:
for (i in 1:N){
ysim[i] ~ dnorm(eta[i], tau) # tau is precision (1 / variance)
eta[i] <- beta[clust[i]]
clust[i] ~ dcat( pClust[geno[i], 1:npop] )
}
for (k in 1:ngeno){
## pClust membership estimates
pClust[k, 1:npop] ~ ddirch( samplesizegenoperpop[k,] )
}
for (k in 1:ngeno){
for (j in 1:npop){
# problem of label switching: try to compute the distance between ObsNumPerClust and ExpNumPerClust (i.e. between observed and expected number of individuals of each genotype in each population)
ObsNumPerClust[k,j] <- pClust[k, j] * numgeno[k]
DistNumPerClust[k,j] <- ObsNumPerClust[k,j] - ExpNumPerClust[k,j]
dist[k,j] <- pow(DistNumPerClust[k,j], 2)
}
}
# Priors
beta ~ dmnorm(mu, sigma.inv)
mu ~ dmnorm(m, V)
sigma.inv ~ dwish(R, K)
tau ~ dgamma(0.01, 0.01)
# parameters transformations
sig <- sqrt(1/ tau)
}
m = rep(1, npop)
V = diag(rep(0.01, npop))
R = diag(rep(0.1, npop))
K = npop
## Input variables
sim.dat.jags<-list("ysim","N","npop", "ngeno", "geno","m","V","R", "K", "samplesizegenoperpop","numgeno","ExpNumPerClust")
## Variables to monitor
bayes.mod.params <- c("beta","tau","sig","DistNumPerClust","dist")
## Starting values
init1 <- list(beta = c(0, 100, 1000), tau = 1)
bayes.mod.inits <- list(init1)
## Run model
bayes.mod.fit<-jags(data = sim.dat.jags, inits = bayes.mod.inits, parameters.to.save = bayes.mod.params, n.chains=1, n.iter=101000, n.burnin=1000, n.thin=200, model.file = bayes.mod)
results <- print(bayes.mod.fit)
results$mean$dist
results$mean$DistNumPerClust^2
It seems that you expect that the mean of a transformed set of values will give the same result as transforming the mean of the same set of values. But this is not the case - for example:
values <- c(1,2,3,6,8,20)
mean(values)^2
mean(values^2)
Are not the same thing.
The equivalent is happening in your model - you calculate dist[k,j] as the square of DistNumPerClust[k,j] and then summarise to a mean of dist, and expect this to be the same as the square of the mean of DistNumPerClust[k,j]. Or in a simpler example:
library('runjags')
X <- 1:100
Y <- rnorm(length(X), 2*X + 10, 1)
model <- "model {
for(i in 1 : N){
Y[i] ~ dnorm(true.y[i], precision);
true.y[i] <- (m * X[i]) + c
}
m ~ dunif(-1000,1000)
c ~ dunif(-1000,1000)
precision ~ dexp(1)
p2 <- precision^2
}"
data <- list(X=X, Y=Y, N=length(X))
results <- run.jags(model=model, monitor=c("m", "c", "precision", "p2"),
data=data, n.chains=2)
results
More specifically, these should not be expected to be the same:
summary(results)['p2','Mean']
summary(results)['precision','Mean']^2
If you want to calculate the same thing you can extract the full chain of values as an MCMC object and do your transformation on these:
p <- combine.mcmc(results,vars='precision')
p2 <- combine.mcmc(results,vars='p2')
mean(p^2)
mean(p2)
mean(p)
mean(sqrt(p2))
Now everything is equivalent.
Matt