/ #Linear Congruential Method #Inverse CDF Method 

Computational Statistics

1 Libraries

library("ggplot2")
library("dplyr")
library("boot")
library("knitr")

2 Distributions

2.1 Relationship between distributions

2.2 Bernoulli Distribution

  • PDF of Bernoulli

\[ \begin{split} q = 1-p ~~if~k=0 \\ q = p~~if~k=1 \end{split} \] + p is the probability of success + q is the probability of failure, q=1-p

  • CDF of Bernoulli

\[ \begin{split} X(m,n) = \left\{\begin{array}{lr} 0, & \text{if } k<1\\ 1-p, &\text{if } 0\leq k< 1\\ 1, & \text{if } k\geq 1 \end{array}\right\} \end{split} \]

2.3 Beta Distribution

  • PDF of Beta

\[\Gamma(n) = (n-1)! \]

\[ f(x;\alpha,\beta) = \frac{\Gamma(\alpha+\beta)}{\Gamma(\alpha)\Gamma(\beta)} x^{\alpha-1} (1-x)^{\beta-1} \]

2.4 Exponential distribution

  • PDF of Exponential

    • lambda is the rate parameter

\[ \begin{split} f(x;\lambda) = \left\{\begin{array}{lr} \lambda e^{-\lambda \cdot x}, & \text{if } x\geq0\\ 0, &\text{if } x<0 \end{array}\right\} \end{split} \]

  • CDF of Exponential

\[ F(x;\lambda) = 1 - e^{-\lambda \cdot x}\]

2.5 Pareto Distribution

  • PDF of Pareto

    • xm is the (necessarily positive) minimum possible value of X
    • alpha is a positive parameter

\[ \begin{split} f(x) = \left\{\begin{array}{lr} \frac{\alpha x_m^\alpha}{x^\alpha+1}, & \text{if } x\geq x_m\\ 0, &\text{if } x<x_m \end{array}\right\} \end{split} \]

  • CDF of Pareto

\[ \begin{split} F(x) = 1 - (\frac{x_m}{x})^\alpha \end{split} \]

2.6 Uniform Distribution

  • PDF of Uniform

\[ \begin{split} f(x) = \left\{\begin{array}{lr} \frac{1}{b-a}, & \text{if } a\leq x\leq b,\\ 0 &\text{otherwise } \end{array}\right\} \end{split} \]

  • CDF of Uniform

\[ \begin{split} F(x) = \left\{\begin{array}{lr} 0, & \text{if } x < a\\ \frac{x-a}{b-a}, & \text{if } a\leq x \leq b \\ 1 &\text{otherwise } \\ \end{array}\right\} \end{split} \]

2.7 Normal Distribution

  • PDF of normal

    • mu is the mean or expectation of the distribution (and also its median and mode)
    • sigma is the standard deviation
    • sigma squares is the variance

\[ f(x|\mu,\sigma^2) = \frac{1}{\sqrt{2\pi\sigma^2}} e^{-\frac{(x-\mu)^2}{2\sigma^2}} \]

2.8 Extra suggestions

Run the following to get different distributions and their formula

help("Distributions")

3 Random Sampling from Uniform distribution

3.1 Sampling based probabilities proportional to the number of inhabitants of the city

data <- read.csv2("../../static/data/population.csv", header = TRUE)
get_city <- function(data){

data$Municipality <- as.character(data$Municipality)
data$prob <- data$Population/sum(data$Population)
#sorting the dataset
data <- data %>% arrange(prob)

data$cum_prob <- cumsum(data$prob)
data$lead_cum_prob <- lead(data$cum_prob, n=1)

# filling NA
data$lag_cum_prob[1] <- 0
data$lead_cum_prob[NROW(data)] <- 1

set.seed(12345)
num <- runif(1,0,1)

X <- ifelse(((num >= data$cum_prob) & (num <=  data$lead_cum_prob)), row.names(data), NA)
X <- na.omit(X)

data_name <- data[row.names(data) %in% X,]
return(data_name$Municipality)
}
data$Municipality <- as.character(data$Municipality)
# Select one city
get_city(data = data)
## [1] "Umeå"
# Remove one city
df <- data


df <- df[!df$Municipality %in% get_city(data=df),]

# apply function again
get_city(data = df)
## [1] "Lund"
# rempve this city
df <- df[!df$Municipality %in% get_city(data=df),]

# do this till 20 cities left
while(NROW(df) >20){
  df <- df[!df$Municipality %in% get_city(data=df),]
}

4 Numeric Precision

Using a very small number making the numeric precision upto required digits

options(digits = 22)
tol <- 1e-9
x1 <- 1/3
x2 <- 1/4

if(abs(x1-x2-1/12) <= tol){print("Subtraction is correct")
}else{print("Subtraction is wrong")}
## [1] "Subtraction is correct"

5 Random Number Generation

x <- rep(0,10000)
x[1] <- 1

a <- 7^5
c <- 0
m <- 2^31-1

for(i in 1:length(x)){
  x[i+1] <- (a*x[i] + c) %% m
}

u <- x/m

hist(u)

5.1 Implementing the Varience

my_rand_num <- rnorm(n=10000, mean=10^8, sd=1)

myvar_better <- function(x){
  n <- length(x)
  m <- mean(x)
answer <- (1/(n - 1)) * sum((x - m)^2)
return(answer)
}

Y <- vector(mode = "numeric", length = length(my_rand_num))
my_mat2 <- vector(mode = "numeric", length = length(my_rand_num)+1)

for(i in 2:length(my_rand_num)){
  options(digits = 22)
  X_i = my_rand_num[1:i]
  X_i = na.omit(X_i)
  if((myvar_better(X_i) - var(X_i)) <= 1e-15){
    Y = 0
  }else{
      Y = myvar_better(X_i) - var(X_i)
      }
  temp <- cbind(i, Y)
  my_mat2 <- rbind(temp, my_mat2)
}
## Warning in rbind(temp, my_mat2): number of columns of result is not a
## multiple of vector length (arg 2)
my_mat2 <- as.data.frame(my_mat2)

ggplot(my_mat2, aes(x=i, y=Y)) + geom_point() + ggtitle("Plot of Y vs. i")

6 Scaling to get better results

The numeric precision of storage is highest between ‘0’ and ‘1’

kappa(A)

Analysis: Following the ?kappa function in R, we get a very large number for the condition number. A large value for the condition number indicates that this matrix is close to being singular.

X_scale <- scale(X)
Y_scale <- scale(Y)

A_scale <- t(X_scale) %*% X_scale
b_scale <- t(X_scale) %*% Y_scale

B_hat_scale <- solve(A_scale, b_scale)

kappa(A_scale)

7 Split the data into train and test

data <- read.csv2("../../static/data/mortality_rate.csv")
data$LMR <- log(data$Rate)

n=NROW(data)
set.seed(123456)
id=sample(1:n, floor(n*0.5))
train = data[id,]
test = data[-id,]

8 Loess Model with brute force of finding in minimizing of a function

myMSE <- function(pars, lambda){

  X <- pars$X
  Y <- pars$Y
  Xtest <- pars$Xtest
  Ytest <- pars$Ytest
  
model <- loess(Y ~ X, enp.target=lambda)
predicted <- predict(model, Xtest)
n <- length(Ytest)

exp <- c()
for(i in 1:n){
  exp[i] <- (Ytest[i] - predicted[i])^2
}

answer_mse <- (1/n) * sum(exp)
     count <<- count + 1
return(answer_mse)
}
library(dplyr)

X <- train %>% select(c(Day)) %>% as.matrix() 
Y <- train %>% select(c(LMR)) %>% as.matrix() 

Xtest <- test %>% select(c(Day)) %>% as.matrix() 
Ytest <- test %>% select(c(LMR)) %>% as.matrix() 

pars <- list(X=X, Y=Y , Xtest=Xtest, Ytest=Ytest)

final <- NULL
count <- 0 
for(lambda in seq(from = 0.1, to = 40, by = 0.1)){
temp <- myMSE(pars, lambda=lambda)
temp <- cbind(temp, lambda)
final <- rbind(temp, final)
}

colnames(final) <- c("MSE", "lambda")
ggplot(data=data.frame(final), aes(x = lambda, y=MSE)) + geom_point() + 
  ggtitle("Plot of MSE vs. Lambda")

9 Optimize function to find the minimum

count <- 0
optimize(interval = c(0.1, 40), f = myMSE, pars=pars, tol=0.01, maximum = FALSE)
## $minimum
## [1] 10.693610651201773
## 
## $objective
## [1] 0.13214414192037774
cat("iternations: ", count)
## iternations:  18

10 Optim function to find the minimum

10.1 BFGS

count <- 0
optim(35, myMSE, pars=pars, method = c("BFGS"))
## $par
## [1] 35
## 
## $value
## [1] 0.17199961445847106
## 
## $counts
## function gradient 
##        1        1 
## 
## $convergence
## [1] 0
## 
## $message
## NULL
cat("iternations: ", count)
## iternations:  3

10.2 CG

load("../../static/data/data.RData")
my_log_likehood <- function(pars){
  x<-data
  mu <- pars[1] 
  sigma <- pars[2]
  n <- length(x)
answer <- n*0.5*log(2*pi*sigma^2) + (0.5/sigma^2)* sum((x-mu)^2)  

return(answer)
}

gradient <- function(pars){
    x<-data
  mu <- pars[1] 
  sigma <- pars[2]
  n <- length(x)
  grad_mu <- - (1/sigma^2)* sum(x-mu)
  grad_sig <- (n/sigma) - (1/sigma^3) * sum((x-mu)^2)
  return(c(grad_mu, grad_sig))
}

10.3 BFGS with and without gradient

run1 <- optim(c(0,1), fn = my_log_likehood, gr=NULL, method = c("BFGS"))
run2 <- optim(c(0,1), fn = my_log_likehood, gr=gradient, method = c("BFGS"))

10.4 CG with and without gradient

run3 <- optim(c(0,1), fn = my_log_likehood, gr=NULL, method = c("CG"))
run4 <- optim(c(0,1), fn = my_log_likehood, gr=gradient, method = c("CG"))

10.5 Showing all results in table

final <- NULL
final$algorithm <- c("BFGS", "CG", "BFGS+gradient", "CG+gradient")
final$parameters <- rbind(run1$par, run2$par, run3$par, run4$par)
final$counts <- rbind(run1$counts, run2$counts, run3$counts, run4$counts)
final$convergence <- rbind(run1$convergence, run2$convergence, run3$convergence, run4$convergence)
final$value <- rbind(run1$value, run2$value, run3$value, run4$value)


knitr::kable(as.data.frame(final), caption = "Table showing the summary from various optimization techniques")
Table 10.1: Table showing the summary from various optimization techniques
algorithm parameters.1 parameters.2 counts.function counts.gradient convergence value
BFGS 1.2755275244738007 2.0059769626279471 37 15 0 211.50694939149940
CG 1.2755275504025778 2.0059765494524084 39 15 0 211.50694939149349
BFGS+gradient 1.2755277195455159 2.0059765019515114 297 45 0 211.50694939149349
CG+gradient 1.2755275911253063 2.0059764724938893 53 17 0 211.50694939149332

11 Parabolic Interpolation using optim

11.1 Write a function that uses optim() and find values of (a0, a1, a2) for which g interpolates f at user provided points x0,x1,x2. Interpolate means f(x0) = ~g(x0), f(x1) = g(x1) and f(x2) = g(x2). optim() should minimize the squared error, i.e. find (a0,a1,a2) that make (f(x0)-g(x0))^2 + (f(x1)-g(x1))^2 + (f(x2)-g(x2))^2 minimum

# Actual function to estimate the minimum value of
actual <- function(x){
  result <- -x *(1-x)
return(result)  
}

# The estimation function whose parameters a0,a1,a2 are unknown
parabola <- function(par,x){
  a0 <- par[1]
  a1 <- par[2]
  a2 <- par[3]
  result <- a0+a1*x+a2*x^2
return(result)
}

# finding the difference between the functions for three given values (x0,x1,x2)
difference_function <- function(par,x){
  x0 <- par[4]
  x1 <- par[5]
  x2 <- par[6] 
  result <- sum((actual(x0)-parabola(par,x0))^2,(actual(x1)-parabola(par,x1))^2, (actual(x2)-parabola(par,x2))^2)
  return(result)
}

find_parameters <- function(){
temp <- optim(par=c(0,-1,1,0.1,0.8,0.9), fn=difference_function)
a0 <- temp$par[1]
a1 <- temp$par[2]
a2 <- temp$par[3]
return(list=c(a0=a0,a1=a1,a2=a2))
}

find_parameters()
## a0 a1 a2 
##  0 -1  1
# plotting to show the values
ggplot() + 
  geom_point(aes(x=seq(0,1,0.01), y=actual(x=seq(0,1,0.01)))) +
   geom_line(aes(x=seq(0,1,0.01), y=parabola(par=c(0,-1,1),x=seq(0,1,0.01)))) +
  ggtitle("plot of the first function vs. parabola")

12 Inverse CDF

The inverse transform method is a simple algorithm for generating random variables \(x\) from a continuous target distribution \(f(x)\) using random samples from a \(Unif(0,1)\) distribution.

Algorithm:

  1. For target probability density function (pdf) \(f(X)\), calculate the CDF, \(F(X)\)

  2. Set the CDF equal to \(U\), \(F(X) = U\), then solving for \(X\), obtaining \(F^{-1}(U) = X\)

  3. Generate \(n\) random variables from \(u \sim Unif(0,1)\)

  4. Plug in \(u\) observed values in \(F^{-1}(U = u)\) to obtain \(n\) values for which \(x \sim f(X)\)

12.1 Example: Pareto Distribution

For information on the Pareto distribution.

The \(Pareto(a,b)\) distribution has CDF \(F(X \leq x) = 1 - (\frac{b}{x})^a\) for \(x \geq b > 0, \ a > 0\)

  1. First set \(F(x) = U\), where \(U \sim Unif(0,1)\), then solve for \(X\) \[ \displaystyle \begin{aligned} 1 - \left( \frac{b}{x} \right)^2 & = U \\ \ \left(\frac{b}{x} \right)^a & = 1 - U \\ \ \frac{b}{x} & = (1 - U)^{1/a} \\ \ x & = b \times (1 - U)^{-1/a} \\ \ & = F_X^{-1}(U) \\ \end{aligned} \]
set.seed(123)
n = 1000
U =runif(n)
a = 3
b = 2
X = b*(1-U)^(-1/a)
pareto = function(x){(a*(b^a)/x^(a+1))}

summary(X)
##                Min.             1st Qu.              Median 
##  2.0003103289867905  2.2048580148641430  2.5031823508377102 
##                Mean             3rd Qu.                Max. 
##  2.9690832866834609  3.1613937424093539 23.7725736025729120
hist(X, probability = TRUE, breaks = 25, xlim =c(0, 20),
     col = "gray", border = "white",
     main = "Inverse Transform: Pareto(3,2)", xlab = "x")
curve(pareto(x), from = 0, to = 40, add = TRUE, col = "blue")

12.2 The double exponential (Laplace) distribution is given by formula:

\[ DE(\mu, \alpha) = \frac{\alpha}{2} e^{(-\alpha|x - \mu|)} \]

CDF: \[F(x)=\int_{-\infty}^{x} f(x) dx\]

\[F(x)=\int_{-\infty}^{x} \frac{\alpha}{2}e^{-\alpha(x-\mu)} dx ,~~~~~(if~~x>\mu)\] \[=1-\int_{x}^{\infty} \frac{\alpha}{2}e^{-\alpha(x-\mu)} dx\]

\[=1-\frac{1}{2}e^{-\alpha(x-\mu)}\]

\[F(x)=\int_{-\infty}^{x} \frac{\alpha}{2}e^{\alpha(x-\mu)} dx ,~~~~~(if~~x\leq\mu)\]

\[=\frac{1}{2}e^{\alpha(x-\mu)}\] Inverse of CDF

\[For~~x>\mu,~we~got~F(x)=1-\frac{1}{2}e^{-\alpha(x-\mu)}\] \[y=1-\frac{1}{2}e^{-\alpha(x-\mu)}\] \[\frac{ln(2-2y)-\alpha\mu}{-\alpha}=x\] \[For~U\sim U(0,1),~~~~~\frac{ln(2-2U)-\alpha\mu}{-\alpha}=X\] \[For~~x\leq\mu,~we~got~F(x)=\frac{1}{2}e^{\alpha(x-\mu)}\] \[y=\frac{1}{2}e^{\alpha(x-\mu)}\] \[\frac{ln(2y)}{\alpha}+\mu=x\] \[For~U\sim U(0,1),~~~~~\frac{ln(2U)}{\alpha}+\mu=X\]

de_dist <- function(u, mu, a){
  de_distribution <- c()

    for (i in 1:length(u)){
      if (u[i] > 0.5){
        de_distribution[i] <- (log(2-2*u[i])-a*mu)/(-a)
      }
      else {
        de_distribution[i] <- (log(2*u[i])/a)+mu
      }
    }
  return(de_distribution)
}
new_distribution <- de_dist(runif(10000, 0, 1), mu=0, a=1)
hist(new_distribution, breaks = 1000)

13 Generate Normal distribution using uniform

box_muller <- function(n = 1, mean = 0, sd = 1)
{
  x <- vector("numeric", n)

  i <- 1
  while(i <= n)
  {
    u1 <- runif(1, 0, 1)
    u2 <- runif(1, 0, 1)

    x[i] <- sqrt(-2 * log(u1)) * cos(2 * pi * u2)

    if ((i + 1) <= n)
    {
      x[i + 1] <- sqrt(-2 * log(u1)) * sin(2 * pi * u2)
      i <- i + 1
    }

    i <- i + 1
  }

  x * sd + mean
}

hist(box_muller(1000))

14 Accpetance/Rejection Method

To generate \(n\) samples, for(i in 1:n)

  1. Generate \(Y \sim g_Y(t)\) and \(U \sim Unif(0,1)\)

  2. If \(U \leq \frac{f(Y)}{M \times g(Y)}\) then we accept \(Y\), such that \(Y = X\)

  3. Repeat until you have sufficient samples

In order for the algorithm to work we require the following constraings:

  1. \(f\) and \(g\) have to have compatible supports (i.e. \(g(x) > 0\) when \(f(x) > 0\))

  2. There is a constant \(M\) such that \(\frac{f(t)}{g(t)} \leq M\)

14.1 Example: Beta(2,2)

Suppose we’d like to generate samples from \(Beta(2,2)\) distribution. The density function for \(Beta(2,2)\) is simply \(f(x) = 6x(1 - x)\) for \(0 < x < 1\). Since our domain is between 0 and 1, we can use a simple \(Unif(0,1)\) density as our instrumental density, \(g\). Then, by the accept-reject algorithm we can simulate a random variable \(Y \sim g\), and a random variable \(U \sim Unif(0,1)\). Then, if \[ M \times U \leq \frac{f(Y)}{ g(Y)} \] we accept the candidate variable \(Y \sim g\) as \(X\), \(X = Y\). Otherwise, we reject \(Y\) and simulate again until we get an appropriate sample size.

14.1.1 Generating N points and finding maximizing value

Note that the target density \(f\) has a maximum of 1.5, so we can set M = 1.5 (using calculus or use optimize function as shown)

# find M using optimize
f <- function(x){ 6*x*(1 - x)} ## pdf of Beta(2,2)
max_find <- optimize(f=f, lower = 0, upper = 10, maximum = TRUE)
max_find
## $maximum
## [1] 0.49999999999999956
## 
## $objective
## [1] 1.5
M <- max_find$objective



## Accept-Reject
M = 1.5
X = rep(x = NA, 5) ## create a vector of length 5 of NAs
set.seed(123)
f <- function(x){ 6*x*(1 - x)} ## pdf of Beta(2,2)
g <- function(x){ 1 } ## pdf of Unif(0,1) is just 1

n = 10000

Now, say we needed \(n = 10,000\) samples from \(Beta(2,2)\), then a better implementation would be

X = rep(NA, n); M = 1.5
i = 0 ## index set to start at 0
while(sum(is.na(X))){
  U = runif(1); Y = runif(1)
  accept <- U <= f(Y)/(M*g(Y))
  if(accept){
    i = i+1 ## update the index
    X[i] <- Y
  }
}

round(summary(X), 4)
##                  Min.               1st Qu.                Median 
## 0.0037000000000000002 0.3250000000000000111 0.4961999999999999744 
##                  Mean               3rd Qu.                  Max. 
## 0.5000999999999999890 0.6766999999999999682 0.9929999999999999938
round(qbeta(p = c(0, 0.25, 0.5, 0.75, 1), 2, 2), 4)
## [1] 0.00000000000000000 0.32640000000000002 0.50000000000000000
## [4] 0.67359999999999998 1.00000000000000000

14.2 Example

\[ \frac{f(y)}{g(y)} \leq c \]

Thus if we maximize the ratio f(y)/g(y) then that would be the value of c. This can be done by using parital derivate of the fraction.

\[ Majorizing~density~F_{Y}(y)\sim DE(0,1)=\frac{1}{2}e^{-|x|} \]

\[ Target~density~F_{X}(y)\sim N(0,1)=\frac{1}{\sqrt{2\pi}}e^{-\frac{x^2}{2}}\]

\[ C \ge \frac{\sqrt{2}}{\sqrt{\pi}} e^{-\frac{x^2}{2} + |y|} \] Differenitating with respect to x we get that the expression is maximum at x=1, thus C is:

\[ \sqrt{\frac{2}{\pi}}e^\frac{-1}{2}\]

\[=\frac{\sqrt{2}}{C\sqrt{\pi}}\cdotp e^{-\frac{y^2}{2}+|y|}\] \[ = e^{-\frac{y^2}{2}+|y|+\frac{1}{2}} \]

Thus C is

\[ C = \sqrt(\frac{2*e^1}{\pi})\]

generate_n <- function(c){
  x <- NA
  num.reject <- 0
    while (is.na(x)){
      u <- runif(1, 0, 1)
      y <- de_dist(u, 0, 1)
      U <- runif(1)
        if (y>0 && U <= sqrt(2/pi)/c*exp(-(y^2)/2)+y){
          x <- y
        } else if (y<=0 && U<=sqrt(2/pi)/c*exp(-(y^2)/2-y)){
            x <- y
              } else {
                num.reject <- num.reject + 1
            }
    }
  c(x,num.reject)
}


c <- sqrt(2*exp(1)/pi)
set.seed(12345)
nnormal <- sapply(rep(c,2000), generate_n)
hist(nnormal[1,], breaks = 50, freq = FALSE, xlim = c(-4,4),
     main = "Normal distribution generated by Accept/Reject method")
lines(density(nnormal[1,]))

average_rejection <- sum(nnormal[2,])/(ncol(nnormal)+sum(nnormal[2,]))
average_rejection
## [1] 0.15433403805496829
expected_rejection <- 1-(1/c)
expected_rejection
## [1] 0.23982654946685966
rejection_difference <- expected_rejection - average_rejection
rejection_difference
## [1] 0.085492511411891375
newnormal <- nnormal[1,]
random <- rnorm(2000, 0, 1)
df <- as.data.frame(cbind(newnormal, random))
ggplot(df, aes(newnormal, fill = "newnormal")) + geom_histogram(alpha = 0.4, bins = 50) +
  geom_histogram(aes(random, fill = "random"), alpha = 0.4, bins = 50)

15 Monte Carlo Integration

Now, to generalize the method used. Given a function \(h(x)\) whose integral is well defined, where we wish to evaluate at interval \(a\) to \(b\). Then

\[ \begin{aligned} \ \theta &= \int_a^b h(x) dx \\ \ &= (b - a) \int_a^b h(x) \frac{1}{b - a} dx \\ \ &= (b - a) \int_a^b h(x) f(x) dx \\ \end{aligned} \] where \(f(x) = \frac{1}{b - a}\) is \(Unif(a,b)\), and \(x \sim Unif(a,b)\).

The algorithm to calculate \(\hat{\theta}\) is as follows:

  1. Find a density \(f(x)\) from which we can sample \(x\)

  2. Generate \(x_1, ..., x_n \sim f(x)\)

  3. Compute \((b - a) \times \bar{g}_{n}\), where \(\bar{g}_{n} = \frac{1}{n} \sum_{i = 1}^n h(x_i)\)

15.0.1 Example

Suppose we have a function \(h(x) = 3x^2\) for which we wish to integrate over the interval \([0, 2]\).

  • Can apply deterministic numerical approximation methods (see R’s integrate)

  • or we could treat \(x\) as a random variable, \(X = x\), from a \(Unif(0,2)\) whose pdf is simply \(f(x) = \frac{1}{2-0} = \frac{1}{2}\).

If we now generate some \(n\) random values from \(f(x)\) and evaluate them at \(h(x)\), then take the mean, we’d be calculating the expected value of \(h(x)\), \[ \begin{aligned} \ \theta &= \int_0^2 h(x) dx \\ \ &= (\frac{2 - 0}{2 - 0}) \times \int_0^2 h(x) dx = 2 \times \int_0^2 h(x) \frac{1}{2} dx \\ \ &= 2 \times E[h(X)] = 2 \times \int_{- \infty}^{\infty} h(x) f(x) dx \\ \ &\approx 2 \times \frac{1}{n} \displaystyle\sum_{i=1}^{n} h(x_i) \\ \ &= \hat{\theta} \approx \theta \end{aligned} \]

15.1 Monte Carlo Integration, Variance Estimation

We can now calculate the standard error for the former example.

N = 10000 ## sample size
h <- function(x) { 3*x^2 } ## function of interest, h(x)
X <- runif(n = N, min = 0, max = 2)  ## samples from f(x)
h_values <- 2 * h(X)

cumMean <- function(x, n){
  num = cumsum(x)  ## numerator
  denom = 1:n  ## denominator
  result = num/denom
  return(result)
}


cumSE <- function(x, n){
  m = mean(x)
  num = sqrt(cumsum((x - m)^2)) ## numerator
  denom = 1:n ## denominator
  result = num/denom ## cummulative mean of (x_i - theta)^2
  return(result)
}

thetas = cumMean(h_values, N)
SE = cumSE(h_values, N)


plot(x = 1:N, y = thetas, type = "l", ylim = c(4, 12), xlab = "Number of Samples",
     ylab = "Estimate of theta",
     main = "Estimate of Theta with 95% CI")
lines(x = 1:N, y = thetas + 1.96*SE, col = "gray") ## CI
lines(x = 1:N, y = thetas - 1.96*SE, col = "gray") ## CI

## final estimate
thetaHat = mean(h_values)
se <- sd(x = h_values)/sqrt(N)
ci <- thetaHat + 1.96*c(-1,1) * se

print(thetaHat) ## theta estimate
## [1] 7.9538675271615578
print(ci) ## 95% CI
## [1] 7.8134362147371919 8.0942988395859228

16 Metropolis Hastings

The Independent Metropolis-Hastings algorithm as described Robert & Casella goes as follows

Given \(x^{(t)}\)

1. Generate \(Y_t \sim g(y)\)

2. Take

\[ X_{t+1} = \begin{cases} Y_t & \quad \text{with probability }\ \rho(x^{(t)}, Y_t) \\ x^{(t)} & \quad \text{with probability }\ 1 - \rho(x^{(t)}, Y_t) \\ \end{cases} \] where \[ \displaystyle \rho(x^{(t)}, Y_t) = \text{min} \left\{ \frac{f(Y_t)}{f(x^{(t)})} \frac{g(x^{(t)})}{g(Y_t)}, 1 \right\} \]

In simpler terms, as we want to generate \(X \sim f\), we first take an initial value \(x^{(0)}\) (which can almost be any artibrary value in the support of \(f\)).

  1. We generate a value \(Y_0 \sim q(y | x^{(0)})\).
  2. We calculate \(\rho(x^{(t)}, Y_t)\)
  3. Generate a random value \(U \sim Unif(0,1)\)
  4. If \(U < \rho(x^{(t)}, Y_t)\), then we accept \(X^{(1)} = Y_t\); else we take \(X^{(1)} = X^{(0)}\)
  5. Repeate steps 1-4 until you’ve satisfied the number of samples needed

16.0.1 Example

Use Metropolis-Hastings algorithm to generate samples from the below distribution by using proposal distribution as chi^2(floor(X(t)+1)), take some starting point. Plot the chain you obtained as a time series plot. What can you guess about the convergence of the chain? If there is a burn-in period, what can be the size of this period?

\[ f(X) \sim x^5 e^{-x}, x>0\]

#density function from which we want to sample.
dt <- function(x) {
  if (x <= 0) {
    stop("x has to be greater than 0.")
  }
  return(x^5 * exp(-x))
}

#proposal density function, should accept 2 arguments:
  #   x:  value at which to compute density.
  #   x_t: value on which the rq is conditioned.
dp <- function(x, xt) {
  dchisq(x, floor(xt + 1))
}

rp <- function(x) {
  rchisq(1, floor(x+1))
}

metropolis_hastings <- function(x_0, t_max, dt, dp, rp) {
  # Perform Metropolis-Hastings sampling of the specified density.
  # 
  # Args:
  #   x_0   starting value.
  #   t_max maximum numer of iterations.
  #   dt    density function from which we want to sample.
  #   dp    proposal density function, should accept 2 arguments:
  #           x:  value at which to compute density.
  #           x_t: value on which the rq is conditioned.
  #   rp    proposal random number generator, should accept 1 argument: 
  #           x_t: value on which the rq is conditioned.
  #           
  # Returns:
  #   Vector of sampled points of length t_max. 
  
  x_t <- x_0
  x <- vector("numeric", t_max) # pre-allocate
  
  # Metropolis-Hastings
  for (t in 1:t_max) {
    # Generate a candidate
    y <- rp(x_t)
    # Generate U
    u <- runif(1, 0, 1)
    # Compute alpha
    alpha <- min(1, (dt(y) * dp(x_t, y)) / (dt(x_t) * dp(y, x_t)))
    # Accept the jump or stay in x_t
    if (u < alpha) {
      x_t <- y
    }
    # Sace x_t in vector
    x[t] <- x_t
  }
  
  return(x)
}


test <- metropolis_hastings(x_0=1, t_max = 10000, dt, dp, rp)

# compare the distribution
x=seq(from=0.01, to=20, by=0.002)
actual <- numeric(length(x))
for(i in seq_along(x)){
  actual[i] <- dt(x=x[i])
}

quantile(test)
##                   0%                  25%                  50% 
##  0.63207349214489117  4.20543485481044410  5.71968566388519051 
##                  75%                 100% 
##  7.57026419107211890 18.64709586611740377
quantile(temp)
##                   0%                  25%                  50% 
##  0.17199961445847106 10.12899971084385342 20.08599980722923561 
##                  75%                 100% 
## 30.04299990361461781 40.00000000000000000
ggplot() +
geom_line(aes(x=1:10000, y=test)) +
labs(x="Iterations", y="X") + 
ylim(0, 50) +
ggtitle("Metropolis-Hasting Sampler using Chisquare")

17 Gelman Rubin

all_series <- NULL
for(i in 1:10) { 
temp <- metropolis_hastings(x_0=i, t_max = 10000, dt, dp, rp)
temp <- coda::as.mcmc(temp)
all_series[[i]] <- temp
}

#convergence analysis
coda::gelman.diag(all_series)
## Potential scale reduction factors:
## 
##      Point est. Upper C.I.
## [1,]          1          1

18 Gibbs Sampling

calculate_conditional_probability <- function(i, X, Y, ...) {
  # Returns the conditional probability of an element
  # Args: (might change with the given task)
  #   X   Value we are looking for
  #   Y   Data
  #   i   position
  # In the lab we had to consider three cases 
  # This will change in the exam!
  d <- length(X)
  if (i == 1) {
    # i = 1
    mean <- (Y[1] + X[2]) / 2
    variance <- sigma_squared / 2
  } else if (i == d) {
    # i = d
    mean <- (X[d - 1] + Y[d]) / 2
    variance <- sigma_squared / 2
  } else { 
    # i = 2 ... d-1
    mean <- (X[i - 1]+ Y[i] + X[i + 1]) / 3
    variance <- sigma_squared / 3
  }
  
  # Generate a random variables
  X_i <- rnorm(1, mean = mean, sd = sqrt(variance))
  
  return(X_i)
}

gibbs_sampling <- function(Y, n, X0, ...) {
  # Gibbs sampling
  # Args:
  #   Y   Data
  #   n   Number of generated samples
  #   X0  Initialization points
  # Returns:
  #   X   Matrix with samples (colmeans = value we are looking for)
  d <- length(Y)
  X <- matrix(NA, ncol = d, nrow = n)
  X[1, ] <- X0
  
  for (row in 2:n) {
    for (col in 1:d) {
      X[row, col] <- calculate_conditional_probability(
        i = col,
        X = c(X[row, 1:(col - 1)], X[row - 1, col:d]),
        Y = Y,
        sigma_squared = sigma_squared
      )
    }
  }
  return(X)
}

19 Bootstrap estimation

19.1 Estimating the distribution of T by using a non-parametric bootstrap with B = 2000

19.1.1 Using Boot library

\(T = \frac{\hat{Y}(X_b)-\hat{Y}(X_a)}{X_b - X_a}\) Where \(X_b = argmax_xY(X)\) and \(X_a = argmin_xY(X)\)

data <- read.csv2('../../static/data/lottery.csv')

stat1 <- function(data,vn){
  data <- as.data.frame(data[vn,])
  X_b <- data[which.max(data$Draft_No),]$Day_of_year
  X_a <- data[which.min(data$Draft_No),]$Day_of_year
  model <- loess(formula = Draft_No~Day_of_year
               , data = data)
  Y_hat <- predict(object = model, newdata = data)
  Y_hat_a <- Y_hat[X_a]
  Y_hat_b <- Y_hat[X_b]
  (Y_hat_b - Y_hat_a)/(X_b - X_a)
}

res <- boot(data = data, statistic = stat1
                        # Ordinary nonparametric bootstrap (default)
            , R = 2000, sim = "ordinary")


plot(res)

# P-Value
mean(abs(res$t0) < abs(res$t-mean(res$t)))
## [1] 0.47599999999999998

19.1.2 Without Boot library

set.seed(12345)

X <- data$Day_of_year
Y <- data$Draft_No

T <- numeric()

for (i in 1:2000) {
        boot <-sample(length(X), replace=TRUE)
        data1 <- cbind(X,boot) 

        X_a <- data1[which.min(data1[,2])]
        X_b <- data1[which.max(data1[,2])]
        
        model2 <- loess(Y~X, data=as.data.frame(data1), method="loess")
        
        fitted_X_a <- model2$fitted[X_a]
        fitted_X_b <- model2$fitted[X_b]
        
        test <- (fitted_X_b - fitted_X_a)/(X_b - X_a)
        T[i] <- test
        }


pval <- length(which(T>=0))


hist(T, breaks=30,
     main="Distribution of T by using non-parametric bootstrapping")

print("Estimated p-value:")
## [1] "Estimated p-value:"
print(pval/2000)
## [1] 0.2185

19.2 Varience Estimate

19.2.1 Estimate the distribution of the mean price of the house using bootstrap. Determine the bootstrap bias-correction and the variance of the mean price. Compute a 95% confidence interval for the mean price using bootstrap percentile, bootstrap BCa, and first-order normal approximation.

Bias correction \[T1 = 2.T(D) - \frac{1}{D} \sum_{i=1}^B T^*_{i} \]

price_data <- read.csv("../../static/data/prices1.csv", sep=";")

# Estimation of mean of Price
stat_mean <- function(data, index){
    data <- data[index,]
    answer <- mean(data$Price)
    return(answer)
}

res <- boot::boot(data=price_data, statistic = stat_mean, R=2000)
res
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot::boot(data = price_data, statistic = stat_mean, R = 2000)
## 
## 
## Bootstrap Statistics :
##               original             bias           std. error
## t1* 1080.4727272727273 1.4798636363636888 36.546015841440351
plot(res,index = 1)

#95% CI for mean using percentile
boot.ci(res, index=1, type=c('perc'))
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 2000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = res, type = c("perc"), index = 1)
## 
## Intervals : 
## Level     Percentile     
## 95%   (1014, 1156 )  
## Calculations and Intervals on Original Scale
#95% CI for mean using bca
boot.ci(res, index=1, type=c('bca'))
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 2000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = res, type = c("bca"), index = 1)
## 
## Intervals : 
## Level       BCa          
## 95%   (1016, 1158 )  
## Calculations and Intervals on Original Scale
#95% CI for mean using first order normal
boot.ci(res, index=1, type=c('norm'))
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 2000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = res, type = c("norm"), index = 1)
## 
## Intervals : 
## Level      Normal        
## 95%   (1007, 1151 )  
## Calculations and Intervals on Original Scale
# Bias-correction and Varience of Price
boot.fn <- function(data,index){
        d <- data[index]
        res <- mean(d)
}

boot.result <- boot(data=price_data$Price, statistic=boot.fn, R=1000) 
bias_cor <- 2*mean(price_data$Price)-mean(boot.result$t)


list("bias_correction"=bias_cor, "variance_of_the_mean_price"=35.93^2)
## $bias_correction
## [1] 1080.4343636363637
## 
## $variance_of_the_mean_price
## [1] 1290.9648999999999
boot.result
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = price_data$Price, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##               original               bias           std. error
## t1* 1080.4727272727273 0.038363636363555997 35.455872075252685
plot(boot.result)

boot.ci(boot.result)
## Warning in boot.ci(boot.result): bootstrap variances needed for studentized
## intervals
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot.result)
## 
## Intervals : 
## Level      Normal              Basic         
## 95%   (1011, 1150 )   (1009, 1148 )  
## 
## Level     Percentile            BCa          
## 95%   (1013, 1151 )   (1018, 1159 )  
## Calculations and Intervals on Original Scale

19.2.2 Estimate the variance of the mean price using the jackknife and compare it with the bootstrap estimate

Jacknife(n=B): \[\widehat{Var[T(\cdot)]}=\frac{1}{n(n-1)}\sum_{i=1}^{n}(T_{i}^*-J(T))^2\] where, \[T_{i}^*=nT(D)-(n-1)T(D_i^*)~,~~~~J(T)=\frac{1}{n}\sum_{i=1}^{n}T_{i}^*\] When you compute the equation given aboce, you got \[\frac{n-1}{n}\sum_{i=1}^{n}(T_{i}^*-J(T))^2\] Reference:The Jackknife Estimation Method, Avery I. McIntosh (http://people.bu.edu/aimcinto/jackknife.pdf)

result <- numeric()
n <- NROW(price_data)

for (i in 1:n){
        updated_price <- price_data$Price[-i]
        result[i] <- mean(updated_price)
}

var_T <- (n-1)/n*sum((result-mean(result))^2)
mean_T <- mean(result)

cat("The variance from jacknife method is:", var_T)
## The variance from jacknife method is: 1320.9110440518652

Analysis: The obtained variance using Jackknife method is 1320.911 while using bootstrapping the obtained value was 1290.965. Considering the fact that Jackknife overestimate variance, the answer seems reasonable.

19.2.3 Compare the confidence intervals obtained with respect to their length and the location of the estimated mean in these intervals.

confidence_interval_jackknife <- c((mean_T - 1.96*var_T), (mean_T + 1.96*var_T))

confidence_interval_jackknife
## [1] -1508.5129190689286  3669.4583736143832
intervals <- c("(1150-1011)","(1148-1011)","(1149-1013)","(1146-1007)")
length <- c(1150-1011, 1148-1011,1149-1013,1146-1007)
Center_of_interval <- c((1150+1011)/2,(1148+1011)/2,(1149+1013)/2,(1146+1007)/2)
 
dt <- data.frame(intervals, length, Center_of_interval,row.names = c("Normal", "Basic", "Percentile", "BCa"))

dt %>% kable(col.names = c("Confidence interval", "Length of interval","Center of interval"))
Confidence interval Length of interval Center of interval
Normal (1150-1011) 139 1080.5
Basic (1148-1011) 137 1079.5
Percentile (1149-1013) 136 1081.0
BCa (1146-1007) 139 1076.5

20 Genetic Algorithm

20.1 Find the max of one-dimensional function

\[f(x) := \frac{x^2}{e^x} - 2 e^-\frac{(9 sinx)}{(x^2 + x + 1)} \]

function_f <- function(x){
answer <- x^2/exp(x) - 2*exp(-(9*sin(x))/(x^2+x+1))
return(answer)
}

20.2 Crossover function

crossover <- function(x,y){
answer <- (x+y) * 0.5
return(answer)
}

20.3 Mutate Function

mutate <- function(x){
answer <- (x^2)%%30  
return(answer)
}

20.4 Plot of function

y <- vector("double", length = 30)
x <- seq(from=0,to=29,by=1)
for(i in 0:29){y[i+1] <- function_f(x=i)}

data <- data.frame(cbind(x,y))
data$flag <- "original_values"
data$x <- as.numeric(as.character(data$x))
data$y <- as.numeric(as.character(data$y))

ggplot(data, aes(x=x, y=y, group=1)) + 
geom_point() + 
geom_line(linetype = "dashed") + 
ggtitle("Plot of the function") 

cat("The maximum values of the function is at x =",which.max(data$y), "and the value is = ",max(data$y))
## The maximum values of the function is at x = 2 and the value is =  0.20766879224596799
inital_population <- seq(from=0, to=30, by=5)
Values <- vector("double", length = 7)
for(i in seq_along(inital_population)){Values[i] <- function_f(x=inital_population[i])}

20.5 Main function

myfunction <- function(maxiter, mutprob){

# maxiter = 10
# mutprob = 0.5 

  
for(i in seq_along(1:maxiter)){
# sampling and getting parent and victim position
parent_index <- sample(x = seq(from = 1, to=7, by=1), size = 2, replace = TRUE) 
parent_1 <- Values[parent_index[[1]]]
parent_2 <- Values[parent_index[[2]]]
victim <- which.min(order(Values))

# performing crossover and mutate based on prob
kid <- crossover(x = parent_1, y = parent_2)

if(as.numeric(rbinom(1,1,mutprob))==1){kid <- mutate(x=kid)}
inital_population[victim] <- kid

# generating new values  
for(j in seq_along(inital_population)){Values[j] <- function_f(x=inital_population[j])}

}
new_max <- max(Values)
index_max_val <- inital_population[which.max(Values)]

data2 <- data.frame(cbind(x=inital_population,y=Values))
data2$flag <- "new_values"                    
data2$x <- as.numeric(as.character(data2$x))
data2$y <- as.numeric(as.character(data2$y))
data3 <- rbind(data2,data)

answer <- ggplot(data3, aes(x=x, y=y, group=1)) + 
geom_point() + 
geom_point(aes(colour=flag)) +  
geom_line(linetype = "dashed") + 
ggtitle(paste0("Plot of the function with maxiter = ",maxiter," and mutprob = ",mutprob)) 
return(answer)
}

myfunction(maxiter=10, mutprob=0.5)

21 The EM Algorithm

Given a random sample of size \(n\), with observed sample \(\mathbf{X} = (X_1, ..., X_m)\) and missing random sample \(\mathbf{Z} = Z_{m+1}, ..., Z_n\) we seek to compute \[ \hat{\theta} = \text{ arg max } L(\theta | \mathbf{X}, \mathbf{Z}) \] Although \(\mathbf{Z}\) is unobservable, we assume that \((\bf{X, Z}) \sim f(\bf{x,z} | \theta)\).

We place a conditional distribion on \(\mathbf{Z}\) given the observed data \(\bf{x}\), \[ k(\mathbf{z}| \theta, \mathbf{x}) = f( \mathbf{x, z} | \theta) / g(\mathbf{x} | \theta) \]

Here we assume that that \(\mathbf{X} \sim g(\bf{x} | \theta)\), where \[ g(\bf{x} | \theta) = \int f( \mathbf{x,z} | \theta ) d\bf{z} \]

Denote the complete-data likelihood as \(L^c(\theta | \mathbf{x, z})\) and the observed-data likelihood as \(L(\theta | \mathbf{x} )\). Then, for any value of \(\theta\), \(\theta_i\)

\[ log L(\theta | \mathbf{x}) = E[ log L^c(\theta | \mathbf{x, z}) ] - E [ log k( \bf{Z} | \theta_i, \bf{x} ) ] \] where the expectation is with respect to \(k(\mathbf{z}| \theta_i, \bf{x})\). We can rewrite this as

\[ E[log L^c(\theta | \mathbf{x, z})] = log L(\theta | \mathbf{x}) + E[log k(\mathbf{Z} | \theta_i, \bf{x})] \]

where our focus is concerned with maximizing \(E[log L^c(\theta | \mathbf{x, z})]\).

Denoting \(E[log L^c(\theta | \mathbf{x, z})]\) = \(Q(\theta | \theta_i, \mathbf{x})\), the EM algorith iterates through values of \(\theta_i\) by maximizing \(Q(\theta | \theta_i, \mathbf{x})\).

The EM Algorithm

Pick a starting value \(\hat{\theta_0}\)

Then for i in 1:n do

1. Compute (E-step) \[ Q(\theta | \theta_{i-1}, \mathbf{x}) = E[log L^c(\theta | \mathbf{x, z})] \]

where the expectation is with respect to \(k( \bf{Z} | \theta_i, \bf{x} )\)

2. Maximize \(Q(\theta | \theta_{i-1}, \mathbf{x})\) in \(\theta\) and take

\[ \hat{\theta_i} = \text{ arg max } Q(\theta | \theta_{i-1}, \mathbf{x}) \]

repeat until convergence criteria is met

21.1 Normal Distribution with missing values using EM

Suppose \(X = (x_1, ..., x_n)^T\) is a random sample from \(N(\mu,1).\) Let the observations be in order such that \(x_1 < x_2 < ... < x_n\). Suppose that after time \(c\), values are censored or missing, such that only \(x_1, ..., x_m\) are observed, and \(x_{m+1}, ..., x_n\) are unobserved. Then, \(r = (n - m)\) would be the quantity missing. We will use the EM and MCEM algorithms to find approximations for \(\mu\). Let \(Z = (x_{m+1}, ..., x_n)^T\).

First, construct the likelihood function.

\[ \begin{aligned} L(\mu | x) & = \prod^m f(x_i | \mu, 1) \times \prod^r f(z_i | \mu, 1) \\ \ & = (2 \pi )^{-n/2} exp(-\frac{1}{2} \sum_{i=1}^m (x_i - \mu)^2) \times exp(-\frac{1}{2} \sum_{i=1}^m (z_i - \mu)^2) \\ \ & \propto exp(-\frac{1}{2} \sum_{i=1}^m (x_i - \mu)^2) \times exp(-\frac{1}{2} \sum_{i=1}^m (z_i - \mu)^2) \end{aligned} \]

The log-likelihood is then

\[ ln(L(\mu | X)) = -\frac{1}{2} \sum_{i=1}^m (x_i - \mu)^2) - \frac{1}{2} \sum_{i=1}^m (z_i - \mu)^2 \]

We now find the conditional expectation \(E[z_i | X]\)

\[ \begin{aligned} E[z_i | X] & = E[z_i | x > c] = \int_c^{\infty} \frac{P(x_i > x | x_i > c)}{P(x_i > c)} \\ \ & = \mu + \sigma \frac{\phi(c - \mu)}{1 - \Phi(c - \mu)} \end{aligned} \]

\[ \begin{aligned} Q(\mu | \mu_t) & = -\frac{1}{2} \sum_{i=1}^m (x_i - \mu)^2) - \sum E[z_i | X] \\ \ & = -\frac{1}{2} \sum_{i=1}^m (x_i - \mu)^2) - \sum E[z | X] \\ \ & = -\frac{1}{2} \sum_{i=1}^m (x_i - \mu)^2) - (n-m) E[z | X] \\ \end{aligned} \]

The MLE for \(\mu\) is then,

\[ \begin{aligned} \mu_{t+1} & = \frac{m \bar{x}}{n} + \frac{(n - m) E[z | X]}{n} \\ \ & = \frac{m \bar{x}}{n} + \frac{(n - m) (\mu_t)}{n} + \frac{(n-m) \phi(c - \mu_t)}{n \Phi(c-\mu_t)} \\ \end{aligned} \]

set.seed(2345)
n = 100
mu = 4
sd = 1
x = rnorm(n, mu, sd) ## generate some data
c = 5 ## time cut off
w = x[x < c] ## obtain samples before time cut off
m = sum(x < c) ## number of observed samples
wbar = mean(w) ## observed mean
r = n - m ## difference in sample size
N = 200
mu_new = wbar
results = numeric(N)
for(i in 1:N){
    results[i] = mu_new
    mu_old = mu_new
    mu_new = m*wbar/n + (r*mu_old/n) + 
      (r/n)*sd*(dnorm(c - mu_old))/(1 - pnorm(c - mu_old))  ## r/n instead of 1/n
    #print(mu_new)
}

print(tail(results))
## [1] 4.0408209179195476 4.0408209179195476 4.0408209179195476
## [4] 4.0408209179195476 4.0408209179195476 4.0408209179195476
plot(results, type = "l", main = "em estimates for mu", ylim = c(3.5, 4.5))
abline(h = mu, col = "red")
abline(h = wbar, col = "green", lty = 2)
abline(h = mean(x), col = "blue", lty = 3)

21.2 EM using Monte Carlo

A MC flavor of the EM algorithm

  1. Draw missing data sets \(\mathbf{Z_1, Z_2, ..., Z_m} \sim f_{Z|X}(z | x, \theta_i)\) where each \(\mathbf{Z_i}\) is a vector of all missing values needed to complete the observed data set \(( \mathbf{X, Z} )\).

  2. Calculate \(\bar{Q}(\theta | \theta_{i-1}, X, \mathbf{Z_1, ..., Z_m}) = \frac{1}{m} \sum_{i=1}^m Q(\theta | \theta_{i-1}, X, \mathbf{Z_i} )\)

set.seed(2345)
n = 100
mu = 4
sd = 1
x = rnorm(n, mu, sd)
c = 5
w = x[x < c]
m = sum(x < c)
wbar = mean(w)
r = n - m
M = 10
N = 100
mu_new = wbar
results = numeric(N)
for(i in 1:N){
    results[i] = mu_new
    mu_old = mu_new
    ## abs(N(0,1)) + mu_old + (c - mu_old) to *approximate*
    ## the truncated samples we need
    Z = matrix(data = (c - mu_old) + (mu_old +  abs(rnorm(n = r*M, mean = 0, sd = 1))), 
        nrow = r, ncol = M)
    mu_new = (m*wbar/n) + mean(colMeans(Z))*r/n
    M = M + 1
}
plot(results, type = "l", ylim = c(3.5, 4.5))
abline(h = mu, col = "red")
abline(h = wbar, col = "green", lty = 2)
abline(h = mean(x), col = "blue", lty = 3)