---
title: "metropolis"
author: "Alexander Keil"
date: "`r Sys.Date()`"
#output: rmarkdown::pdf_document 
output: rmarkdown::html_vignette  
vignette: >
  %\VignetteIndexEntry{The metropolis algorithm for fitting Bayesian GLMs}
  %\VignetteEngine{knitr::knitr}
  \usepackage[utf8]{inputenc}
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

## A guided walk through the Metropolis algorithmm
The following vignette contains code to accompany the paper "Markov unchained: a guided walk through the Metropolis algorithm." The code walks the reader through

- Reading in the data
- Finding maximum likelihood estimates of the log odds ratio and risk difference
- Simulating posterior distributions of the log odds ratio and risk difference using the following algorithms (described in the paper)
    - Random walk metropolis
    - Guided metropolis
    - Guided, adaptive metropolis
    - Guided, adaptive metropolis with normal priors
- Using the "metropolis" r package to carry out the same analysis to simulate a posterior distribution for a logistic model under uniform or normal priors
- *Note* each algorithm used here only runs for 10,000 iterations (to save computational time when installing package). The results will vary more than they would with the larger number of iterations used in the paper.

## The data
A case control study of leukemia (y=1) and residence around strong magnetic fields (x=1)
```{r data}
y = c(rep(1, 36), rep(0, 198)) # leukemia cases
x = c(rep(1, 3), rep(0, 33), rep(1, 5), rep(0, 193)) # exposure
```

## Helper functions
These functions will be used throughout
```{r Helper functions}
#helper functions
expit <- function(mu) 1/(1+exp(-mu))

loglik = function(y,x,beta){
  # calculate the log likelihood
  lli = dbinom(y, 1, expit(beta[1] + x*beta[2]), log=TRUE)
  sum(lli)
}

riskdifference = function(y,x,beta){
  # baseline odds (offset)
  # calculate a risk difference
  poprisk = 4.8/100000
  popodds = poprisk/(1-poprisk)
  studyodds = mean(y)/(1-mean(y))
  r1 = expit(log(popodds/studyodds) + beta[1] + beta[2])
  r0 = expit(log(popodds/studyodds) + beta[1])
  mean(r1-r0)
}
```



## Maximum likelihood estimates

```{r mle, echo=TRUE, results="hold"}
data = data.frame(leuk=y, magfield=x)
mod = glm(leuk ~ magfield, family=binomial(), data=data)
summary(mod)$coefficients

beta1 = summary(mod)$coefficients[2,1]
se1 = summary(mod)$coefficients[2,2]
cat("\n\nMaximum likelihood beta coefficient (95% CI)\n")
round(c(beta=beta1, ll=beta1+se1*qnorm(0.025), ul=beta1+se1*qnorm(0.975)), 2)

cat("\n\nMaximum likelihood odds ratio (95% CI)\n")
round(exp(c(beta=beta1, ll=beta1+se1*qnorm(0.025), ul=beta1+se1*qnorm(0.975))), 2)

cat("\n\nMaximum likelihood risk difference (multiplied by 1000) \n")
round(c(rd_1000=riskdifference(y,x,mod$coefficients)*1000), 2)

```



## Random walk metropolis

```{r random walk metropolis}
# initialize
M=10000
set.seed(91828)
beta_post = matrix(nrow=M, ncol=2)
colnames(beta_post) = c('beta0', 'beta1')
accept = numeric(M)
rd = numeric(M)
beta_post[1,] = c(2,-3)
rd[1] = riskdifference(y,x,beta_post[1,])
accept[1] = 1
for(i in 2:M){
  oldb = beta_post[i-1,]
  prop = rnorm(2, sd=0.2)
  newb = oldb+prop
  num = loglik(y,x,newb)
  den = loglik(y,x,oldb)
  acceptprob = exp(num-den)
  acc = (acceptprob > runif(1))
  if(acc){
    beta_post[i,] = newb 
    accept[i] = 1
  }else{
    beta_post[i,] = oldb 
    accept[i] = 0
  }
  rd[i] = 1000*riskdifference(y,x,beta_post[i,])
}

```

## Inspecting output

```{r Inspecting, fig.width =5, fig.height =6}
mean(accept)
summary(beta_post)
init = beta_post[1,]
postmean = apply(beta_post[-c(1:1000),], 2, mean)
cat("Posterior mean\n")
round(postmean, 2)

plot(beta_post, pch=19, col=rgb(0,0,0,0.05), xlab=expression(beta[0]), ylab=expression(beta[1]), xlim=c(-2.5,2.5), ylim=c(-4.5,4.5))
points(init[1], init[2], col="red", pch=19)
points(postmean[1], postmean[2], col="orange", pch=19)
legend("topright", col=c("red", "orange"), legend=c("Initial value", "Post. mean"), pch=19)

plot(beta_post[,2], type='l',  ylab=expression(beta[1]), xlab="Iteration", ylim=c(-4, 4))
plot(rd, type='l',  ylab="RD*1000", xlab="Iteration", ylim=c(-4, 4))

plot(density(beta_post[-c(1:1000),2]), xlab=expression(beta[1]), ylab="Density", main="")
plot(density(rd[-c(1:1000)]), xlab="RD*1000", ylab="Density", main="")
```



## Guided metropolis

```{r guided metropolis}
# initialize
M=10000
set.seed(91828)
beta_post_guide = matrix(nrow=M, ncol=2)
colnames(beta_post_guide) = c('beta0', 'beta1')
accept = numeric(M)
rd_guide = numeric(M)
beta_post_guide[1,] = c(2,-3)
rd_guide[1] = riskdifference(y,x,beta_post_guide[1,])
accept[1] = 1
dir = 1
for(i in 2:M){
  oldb = beta_post_guide[i-1,]
  prop = dir*abs(rnorm(2, sd=0.2))
  newb = oldb+prop
  num = loglik(y,x,newb)
  den = loglik(y,x,oldb)
  acceptprob = exp(num-den)
  acc = (acceptprob > runif(1))
  if(acc){
    beta_post_guide[i,] = newb 
    accept[i] = 1
  }else{
    beta_post_guide[i,] = oldb 
    accept[i] = 0
    dir = dir*-1
  }
  rd_guide[i] = 1000*riskdifference(y,x,beta_post_guide[i,])
}

postmean = apply(beta_post_guide[-c(1:1000),], 2, mean)
cat("Posterior mean, guided\n")
round(postmean, 2)

```

## Contrasting output with random walk

```{r Comparing guided, fig.width =8, fig.height =5}
col1 = rgb(0,0,0,.5)
col2 = rgb(1,0,0,.35)
par(mfcol=c(1,2))

#trace plots
plot(beta_post[1:200,2], type='l',  ylab=expression(beta[1]), xlab="Iteration", ylim=c(-4, 4), col=col1)
lines(beta_post_guide[1:200,2], col=col2)
legend("topright", lty=1, col=c(col1, col2), legend=c("Rand. walk", "Guided"))
plot(9800:10000, beta_post[9800:10000,2], type='l',  ylab=expression(beta[1]), xlab="Iteration", ylim=c(-4, 4), col=col1)
lines(9800:10000, beta_post_guide[9800:10000,2], col=col2)
legend("topright", lty=1, col=c(col1, col2), legend=c("Rand. walk", "Guided"))

# density plots
plot(density(beta_post_guide[-c(1:1000),2]), col=col2, xlab=expression(beta[1]), ylab="Density", main="")
lines(density(beta_post[-c(1:1000),2]), col=col1)
legend("bottomright", lty=1, col=c(col1, col2), legend=c("Rand. walk", "Guided"))


plot(density(rd_guide[-c(1:1000)]), xlab="RD*1000", ylab="Density", main="", col=col2)
lines(density(rd[-c(1:1000)]), col=col1)
legend("topright", lty=1, col=c(col1, col2), legend=c("Rand. walk", "Guided"))
par(mfcol=c(1,1))
```

## Guided, adaptive metropolis algorithm

```{r guided and adaptive metropolis}
# initialize
M=10000
burnin=1000
set.seed(91828)
beta_post_adaptguide = matrix(nrow=M+burnin, ncol=2)
colnames(beta_post_adaptguide) = c('beta0', 'beta1')
accept = numeric(M+burnin)
rd_adaptguide = numeric(M+burnin)
beta_post_adaptguide[1,] = c(2,-3)
rd_adaptguide[1] = riskdifference(y,x,beta_post[1,])
accept[1] = 1
prop.sigma = c(0.2, 0.2)
dir = 1
for(i in 2:(M+burnin)){
  if((i < burnin) & (i > 25)){
    prop.sigma = apply(beta_post_adaptguide[max(1, i-100):(i-1),], 2, sd)
  }
  oldb = beta_post_adaptguide[i-1,]
  prop = dir*abs(rnorm(2, sd=prop.sigma))
  newb = oldb+prop
  num = loglik(y,x,newb)
  den = loglik(y,x,oldb)
  acceptprob = exp(num-den)
  acc = (acceptprob > runif(1))
  if(acc){
    beta_post_adaptguide[i,] = newb 
    accept[i] = 1
  }else{
    beta_post_adaptguide[i,] = oldb 
    accept[i] = 0
    dir = dir*-1
  }
  rd_adaptguide[i] = 1000*riskdifference(y,x,beta_post_adaptguide[i,])
}
postmean = apply(beta_post_adaptguide[-c(1:1000),], 2, mean)
cat("Posterior mean, guided and adaptive\n")
round(postmean, 2)

```
## Contrasting output


```{r Comparing 2, fig.width =8, fig.height =5}
col1 = rgb(0,0,0,.5)
col2 = rgb(1,0,0,.35)
par(mfcol=c(1,2))

#trace plots
plot(beta_post[1:200,2], type='l',  ylab=expression(beta[1]), xlab="Iteration", ylim=c(-4, 4), col=col1)
lines(beta_post_adaptguide[1:200,2], col=col2)
legend("topright", lty=1, col=c(col1, col2), legend=c("Rand. walk", "Guided, adaptive"))
plot(9800:10000, beta_post[9800:10000,2], type='l',  ylab=expression(beta[1]), xlab="Iteration", ylim=c(-4, 4), col=col1)
lines(9800:10000, beta_post_adaptguide[9800:10000,2], col=col2)
legend("topright", lty=1, col=c(col1, col2), legend=c("Rand. walk", "Guided, adaptive"))

# density plots
plot(density(beta_post_adaptguide[-c(1:1000),2]), col=col2, xlab=expression(beta[1]), ylab="Density", main="")
lines(density(beta_post[-c(1:1000),2]), col=col1)
legend("bottomright", lty=1, col=c(col1, col2), legend=c("Rand. walk", "Guided, adaptive"))


plot(density(rd_adaptguide[-c(1:1000)]), xlab="RD*1000", ylab="Density", main="", col=col2)
lines(density(rd[-c(1:1000)]), col=col1)
legend("topright", lty=1, col=c(col1, col2), legend=c("Rand. walk", "Guided, adaptive"))
par(mfcol=c(1,1))
```


## Guided, adaptive metropolis algorithm using normal priors

```{r guided and adaptive metropolis using normal priors}
# initialize
M=10000
burnin=1000
set.seed(91828)
beta_post_adaptguide2 = matrix(nrow=M+burnin, ncol=2)
colnames(beta_post_adaptguide2) = c('beta0', 'beta1')
accept = numeric(M+burnin)
rd_adaptguide2 = numeric(M+burnin)
beta_post_adaptguide2[1,] = c(2,-3)
rd_adaptguide2[1] = riskdifference(y,x,beta_post[1,])
accept[1] = 1
prop.sigma = c(0.2, 0.2)
dir = 1
for(i in 2:(M+burnin)){
  if((i < burnin) & (i > 25)){
    prop.sigma = apply(beta_post_adaptguide2[max(1, i-100):(i-1),], 2, sd)
  }
  oldb = beta_post_adaptguide2[i-1,]
  prop = dir*abs(rnorm(2, sd=prop.sigma))
  newb = oldb+prop
  num = loglik(y,x,newb) + dnorm(newb[1], mean=0, sd=sqrt(100), log=TRUE) + dnorm(newb[2], mean=0, sd=sqrt(0.5), log=TRUE)
  den = loglik(y,x,oldb) + dnorm(oldb[1], mean=0, sd=sqrt(100), log=TRUE) + dnorm(oldb[2], mean=0, sd=sqrt(0.5), log=TRUE)
  acceptprob = exp(num-den)
  acc = (acceptprob > runif(1))
  if(acc){
    beta_post_adaptguide2[i,] = newb 
    accept[i] = 1
  }else{
    beta_post_adaptguide2[i,] = oldb 
    accept[i] = 0
    dir = dir*-1
  }
  rd_adaptguide2[i] = 1000*riskdifference(y,x,beta_post_adaptguide2[i,])
}
postmean = apply(beta_post_adaptguide2[-c(1:1000),], 2, mean)
cat("Posterior mean, guided and adaptive\n")
round(postmean, 2)

```

## Inspecting output

```{r Inspecting output with normal priors, fig.width =8, fig.height =5}
mean(accept)
init = beta_post_adaptguide[1,]
postmean = apply(beta_post_adaptguide[-c(1:1000),], 2, mean)
cat("Posterior mean, uniform priors\n")
round(postmean, 2)

init2 = beta_post_adaptguide2[1,]
postmean2 = apply(beta_post_adaptguide2[-c(1:1000),], 2, mean)
cat("Posterior mean, informative normal priors\n")
round(postmean2, 2)


par(mfcol=c(1,2))
plot(beta_post_adaptguide, pch=19, col=rgb(0,0,0,0.05), xlab=expression(beta[0]), ylab=expression(beta[1]), xlim=c(-2.5,2.5), ylim=c(-4.5,4.5), main="Uniform priors")
points(init[1], init[2], col="red", pch=19)
points(postmean[1], postmean[2], col="orange", pch=19)
legend("topright", col=c("red", "orange"), legend=c("Initial value", "Post. mean"), pch=19)

plot(beta_post_adaptguide2, pch=19, col=rgb(0,0,0,0.05), xlab=expression(beta[0]), ylab=expression(beta[1]), xlim=c(-2.5,2.5), ylim=c(-4.5,4.5), main="Informative priors")
points(init2[1], init2[2], col="red", pch=19)
points(postmean2[1], postmean2[2], col="orange", pch=19)
legend("topright", col=c("red", "orange"), legend=c("Initial value", "Post. mean"), pch=19)

par(mfcol=c(1,2))
plot(beta_post_adaptguide[,2], type='l',  ylab=expression(beta[1]), xlab="Iteration", ylim=c(-4, 4))
plot(beta_post_adaptguide2[,2], type='l',  ylab=expression(beta[1]), xlab="Iteration", ylim=c(-4, 4))

plot(density(beta_post_adaptguide[-c(1:1000),2]), xlab=expression(beta[1]), ylab="Density", main="", xlim=c(-4, 4))
plot(density(beta_post_adaptguide2[-c(1:1000), 2]), xlab=expression(beta[1]), ylab="Density", main="", xlim=c(-4, 4))
par(mfcol=c(2,1))

plot(rd_adaptguide, type='l',  ylab="RD*1000", xlab="Iteration", ylim=c(-.2, .5))
plot(rd_adaptguide2, type='l',  ylab="RD*1000", xlab="Iteration", ylim=c(-.2, .5))

plot(density(rd_adaptguide[-c(1:1000)]), xlab=expression(beta[1]), ylab="Density", main="", xlim=c(-.2, .5))
plot(density(rd_adaptguide2[-c(1:1000)]), xlab="RD*1000", ylab="Density", main="", xlim=c(-.2, .5))
par(mfcol=c(1,1))
```


## Using the R package
Given what you know, running the R package function `metropolis_glm` should 
be fairly straightforward. The following example calls in the case-control data used above
and compares a randome Walk metropolis algorithmn (with N(0, 0.05), N(0, 0.1) proposal distribution) with a guided, adaptive algorithm.

```{r Using the R package, fig.width =8, fig.height =5}
library(metropolis)
data("magfields", package="metropolis")

# random walk
res.rw = metropolis_glm(y ~ x, data=magfields, family=binomial(), iter=20000, burnin=3000, adapt=FALSE, guided=FALSE, block=TRUE, inits=c(2,-3), control = metropolis.control(prop.sigma.start = c(0.05, .1)))

summary(res.rw, keepburn = FALSE)

plot(res.rw, par = 1:2, keepburn=TRUE)

# guided, adaptive
res.ga = metropolis_glm(y ~ x, data=magfields, family=binomial(), iter=20000, burnin=3000, adapt=TRUE, guided=TRUE, block=FALSE, inits=c(2,-3))

summary(res.ga, keepburn = FALSE)

plot(res.ga, par = 1:2, keepburn=TRUE)

```

## Using the R package, smart initial values
Finally, we can use the "glm" option in initial values to initialize the chain with the MLE estimates.
This can eke out slightly more efficiency and allow reduced burnin.

```{r Initial values, fig.width =8, fig.height =5}
# guided, adaptive
res.ga.init = metropolis_glm(y ~ x, data=magfields, family=binomial(), iter=20000, burnin=1000, adapt=TRUE, guided=TRUE, block=FALSE, inits="glm")

summary(res.ga.init, keepburn = FALSE)

plot(res.ga.init, par = 1:2, keepburn=TRUE)

```

## Extending the logistic model results after samples are generated
Using the function, "risk difference" from above, we can use the output from our prior model to get
Bayesian estimates of the risk diffence. In general, this is a useful way to extend MCMC simulations
to new estimands that may not be directly parameterized in the model.

```{r Risk difference after glm_metropolis, fig.width =8, fig.height =5}
# guided, adaptive
beta = as.matrix(res.ga.init$parms[, c("b_0", "b_1")])

y = magfields$y
X = cbind(rep(1, dim(magfields)[1]), magfields$x)

1000*riskdifference(y,X,beta[1,])

# calculate risk difference for every value of model coefficients
rd.ga.init = apply(beta, 1, function(b) 1000*riskdifference(y,X,b))

par(mfcol=c(2,1))
 plot(density(rd.ga.init), xlab = "RD*1000", ylab="Kernel density", main="", xlim=c(-.2, 2.5))
 plot(rd.ga.init, type='l', xlab = "RD*1000", ylab="Iteration", ylim=c(-.2, 2.5))
par(mfcol=c(1,1))

# posterior mean, median, credible intervals
mean(rd.ga.init[-c(1:1000)])
quantile(rd.ga.init[-c(1:1000)], p = c(.5, .025, .975) )

```
