#================================================================================#
# Stat 7650: Computational Statistics -- Spring 2025                             #
# R code examples for the EM Algorithm                                           #
# Based in part on Givens & Hoeting (Chap 4) and Lange (Chap 13)                 #
# Modified from R. Martin's code                                                 #
#================================================================================#

#Set Working Directory to Source File Location
library("rstudioapi")                                 # Load rstudioapi package
setwd(dirname(getActiveDocumentContext()$path))       # Set working directory to source file location
#getwd()   

#########################################################################
# Example 1 -- Censored Exponential Data
#########################################################################
set.seed(123)
n <- 30
theta_true <- 3
Y <- rexp(n, theta_true) # Generate data from exponential distribution

# Censoring condition and applying censoring threshold
cens_thres <- 0.632
d <- Y <= cens_thres
X <- ifelse(d, Y, cens_thres)

# Incorrect MLE estimates for reference
mle_naive <- 1 / mean(Y[d])  # Incorrect due to ignoring censored data
mle_cens <- 1 / mean(X)   # Incorrect due to bias from censoring
c(mle_naive,mle_cens)

# Log-likelihood function for censored exponential data
log_like <- function(theta) 
  sum(d * log(dexp(X, theta)) + (1 - d) * log(pexp(X, theta, lower.tail = FALSE)))

# Plot likelihood across theta values
u <- seq(0.01, 8, length.out = 250)
Lu <- sapply(u, log_like)

plot(u, Lu, type = "l", xlab = expression(theta), 
     ylab = expression(log ~ L(x~"|"~theta)), 
     main = expression(paste("log-likelihood vs. ", theta)))

# Iterative parameter estimation using EM-like approach
tol <- 1e-8
maxit <- 500
est <- numeric(maxit + 1)
like_vec <- numeric(maxit + 1)
est[1] <- 7  # Initial estimate
like_vec[1] <- log_like(est[1])

for (i in seq_len(maxit)) {
  est[i + 1] <- est[i] / (est[i] * mean(X) + mean(!d))
  like_vec[i + 1] <- log_like(est[i + 1])
  if (abs(est[i + 1] - est[i]) < tol) {
    est <- head(est, i + 1)
    like_vec <- head(like_vec, i + 1)
    break
  }
}

# Optimization to find the MLE of theta
theta_mle <- nlm(function(theta) -log_like(theta), p = 6)$estimate

# Adding iterative estimates and the MLE to the plot
lines(est, like_vec, type = "b", lty = 2, col = "red")
abline(v = theta_mle, lty = 3, col = "blue")
legend("bottomright", legend = c("Iterative Estimation", "MLE via nlm"), 
       lty = c(2, 3), col = c("red", "blue"))

# Return final estimates
list(iterative_est = tail(est, 1), mle_nlm = theta_mle, naive_mle = mle_naive, cens_mle = mle_cens)

#########################################################################
# Dependence of the EM estimate to the censoring threshold value (with n=30 and same sample Y)
# Vector to store estimates for each censoring threshold
cens_thres <- seq(0.1, 1.5, by = 0.01)
estimates <- numeric(length(cens_thres))

# Loop over censoring thresholds
for(j in seq_along(cens_thres)) {
  cens_thre <- cens_thres[j]
  d <- as.numeric(Y <= cens_thre) # Censoring condition
  X <- ifelse(d == 1, Y, cens_thre) # Apply censoring
  
  # Log-likelihood function for censored exponential data
  log_like <- function(theta) {
    return(sum(d * log(dexp(X, theta)) + (1 - d) * log(pexp(X, theta, lower.tail = FALSE))))
  }
  
  # Initial estimation
  est <- numeric(maxit) # Store estimates for each iteration
  est[1] <- 7 # Initial estimate
  like_vec <- numeric(maxit) # Store log-likelihood for each estimate
  like_vec[1] <- log_like(est[1])
  maxit <- 500
  tol <- 1e-8
  i <- 1
  
  # Iterative parameter estimation
  repeat {
    theta_new <- est[i] / (est[i] * mean(X) + mean(1 - d))
    est[i + 1] <- theta_new
    like_vec[i + 1] <- log_like(theta_new)
    if(abs(theta_new - est[i]) < tol || i >= maxit) {
      est <- est[1:(i+1)] # Trim the unused portion of the vector
      like_vec <- like_vec[1:(i+1)]
      break
    }
    i <- i + 1
  }
  
  fin_est =tail(est,1) # Returning the final set of est
  estimates[j] = fin_est
}

estimates

# Plotting estimates vs censoring thresholds
plot(cens_thres, estimates, type = "b", pch = 19, xlab = "censoring threshold", 
     ylab = expression(paste("Estimated ", theta)),
     main = expression(paste("Estimated ",theta," vs. Censoring Threshold")))
abline(h = theta_true, col = "red", lty = 2) # True value of theta for reference
legend("bottomright", legend = c("Estimates", expression(paste("True ", theta))), 
       pch = c(19, NA), lty = c(1, 2), col = c("black", "red"))  

#QUESTION: Why there are jumps in the plot?

#########################################################################
# Dependence of the EM estimate to the censoring threshold value 
#(with n=1000 and different random samples for each threshold Y)

# Parameters and data generation
n <- 1000
theta_true <- 3

# Vector to store estimates for each censoring threshold
cens_thres <- seq(0.1, 3, by = 0.01)
estimates <- numeric(length(cens_thres))

# Loop over censoring thresholds
for(j in seq_along(cens_thres)) {
  cens_thre <- cens_thres[j]
  Y <- rexp(n, theta_true) # Generate data from exponential distribution
  d <- as.numeric(Y <= cens_thre) # Censoring condition
  X <- ifelse(d == 1, Y, cens_thre) # Apply censoring
  
  # Log-likelihood function for censored exponential data
  log_like <- function(theta) {
    return(sum(d * log(dexp(X, theta)) + (1 - d) * log(pexp(X, theta, lower.tail = FALSE))))
  }
  
  # Plotting likelihood across a range of theta values
  #  u <- seq(0.01, 8, length.out = 250)
  #  Lu <- sapply(u, log_like) # Vectorized computation of log-likelihood
  #  plot(u, Lu, type = "l", xlab = expression(theta), ylab = expression(log~L(X)(theta)), main = "Log-Likelihood vs. Theta")
  
  # Initial estimation
  est <- numeric(maxit) # Store estimates for each iteration
  est[1] <- 7 # Initial estimate
  like_vec <- numeric(maxit) # Store log-likelihood for each estimate
  like_vec[1] <- log_like(est[1])
  maxit <- 500
  tol <- 1e-8
  i <- 1
  
  # Iterative parameter estimation
  repeat {
    theta_new <- est[i] / (est[i] * mean(X) + mean(1 - d))
    est[i + 1] <- theta_new
    like_vec[i + 1] <- log_like(theta_new)
    if(abs(theta_new - est[i]) < tol || i >= maxit) {
      est <- est[1:(i+1)] # Trim the unused portion of the vector
      like_vec <- like_vec[1:(i+1)]
      break
    }
    i <- i + 1
  }
  
  fin_est =tail(est,1) # Returning the final set of estimates
  estimates[j] = fin_est
}

estimates

# Plotting estimates vs censoring thresholds
plot(cens_thres, estimates, type = "b", pch = 19, xlab = "censoring threshold", 
     ylab = expression(paste("Estimated ", theta)),
     main = expression(paste("Estimated ", theta, " vs. Censoring Threshold")))
abline(h = theta_true, col = "red", lty = 2) # True value of theta for reference
legend("bottomright", legend = c("Estimates", expression(paste("True ", theta))), 
       pch = c(19, NA), lty = c(1, 2), col = c("black", "red"))  

#QUESTION: What do you observe in the plot?

#########################################################################
# Example 2 -- Probit Regression via EM
#########################################################################

# Standard notation is to use Y for the binary response and X for the predictor
# variable.  But since the textbook (and my lecture notes) reserve the notation
# "X" for observed data and "Y" for complete data, I've used some non-standard
# notation here.  In particular, like in the lecture notes, I'm writing X for
# the binary response and U for the predictor variable. .

# EM Algorithm for Probit Regression
em_probit <- function(U, X, theta0) {
  maxit <- 1000
  tol <- 1e-10
  i <- 0
  M <- solve(t(U) %*% U) %*% t(U) # Matrix for updates
  theta <- theta0
  m <- U %*% theta
  
  repeat {
    i <- i + 1
    # E-step: Compute weights
    v <- (2 * X - 1) * dnorm(m) / pnorm((2 * X - 1) * m)
    # M-step: Update parameters
    up <- M %*% v
    theta <- theta + up
    m <- U %*% theta
    
    if(max(abs(up)) < tol || i >= maxit) break
  }
  
  return(list(iterations=i, estimate=as.numeric(theta)))
}

# Data Simulation
set.seed(123) # For reproducibility
A <- 0
B <- 1
x <- rnorm(50, 0, 4)
y <- rbinom(50, 1, pnorm(A + B * x))

# Negative Log-Likelihood Function
neg_log_like <- function(theta) {
  a <- theta[1]
  b <- theta[2]
  p <- pnorm(a + b * x)
  -sum(dbinom(y, size=1, prob=p, log=TRUE))
}

# Optimizing using BFGS method
theta0 <- c(1, 3) # Initial parameter estimates
res_optim <- optim(theta0, neg_log_like, method="BFGS")
print(res_optim)

# EM Algorithm
res_em_probit <- em_probit(as.matrix(cbind(rep(1, 50), x)), y, theta0)
print(res_em_probit)

# Plotting
plot(x, y, xlab="predictor (X)", ylab="response (Y)", 
     main="Probit Regression via EM Algorithm", pch=19)
curve(pnorm(res_em_probit$estimate[1] + res_em_probit$estimate[2] * x), add=TRUE, col=2, lwd=2)
legend("topleft", legend=c("EM Fit"), col=c(2), lwd=2, bty="n")

#########################################################################
# Example 3 -- Robust Regression via EM
#########################################################################

# em_robust_reg performs robust regression using the EM algorithm
# Inputs:
# X - Design matrix with predictor variables
# Y - Response variable
# df - Degrees of freedom for t-distribution
em_robust_reg <- function(X, Y, df) {
  tol <- 1e-08 # Convergence tolerance
  maxit <- 500 # Maximum number of iterations
  b0 <- rep(0, ncol(X)) # Initial beta estimates
  s2 <- var(Y) # Initial estimate of variance
  
  i <- 0 # Iteration counter
  repeat {
    i <- i + 1
    
    # E-step: Calculate residuals and update weights
    Z <- Y - X %*% b0
    w <- (df + 1) / (df + (Z^2) / s2)
    
    # M-step: Weighted regression
    out <- lm(Y ~ X - 1, weights = w)
    b <- coef(out)
    s2 <- summary(out)$sigma^2
    
    # Check for convergence
    if(mean((b - b0)^2) < tol || i >= maxit) break else b0 <- b
  }
  
  return(list(beta = b, sigma2 = s2, iterations = i))
}

# Example usage with phones data from MASS package
library(MASS)
data(phones)
Y <- phones$calls
X <- cbind(1, phones$year) # Adding intercept term

# Running both standard linear regression and robust regression via EM
beta_lm <- lm(Y ~ X - 1)$coefficients
beta_em <- em_robust_reg(X, Y, 4)$beta

# Plotting results
plot(X[,2], Y, xlab = "Year", ylab = "Calls (in millions)", main = "Robust Regression via EM")
abline(a = beta_em[1], b = beta_em[2], col = "red", lty = 1, lwd = 2)
abline(a = beta_lm[1], b = beta_lm[2], col = "blue", lty = 2, lwd = 2)
legend("topleft", legend = c("EM robust fit", "LS fit"), 
       col = c("red", "blue"), lty = 1:2, lwd = 2)

#########################################################################
# Example 2 (cont.) -- PXEM Algorithm for Probit Regression
#########################################################################

# PXEM Algorithm for Probit Regression
pxem_probit <- function(U, X, theta_init) {
  maxit <- 1000
  tol <- 1e-10
  iter <- 0
  n <- nrow(U)
  M <- solve(t(U) %*% U) %*% t(U)
  H <- U %*% M
  theta <- theta_init
  m <- U %*% theta
  
  repeat {
    iter <- iter + 1
    # E-step: Compute adjusted response
    Z_star <- m + (2 * X - 1) * dnorm(m) / pnorm((2 * X - 1) * m)
    Z_star2 <- 1 + m * Z_star
    # PX-step: Parameter expansion
    alpha2 <- (sum(Z_star2) - as.numeric(t(Z_star) %*% H %*% Z_star)) / n
    theta_new <- M %*% Z_star / sqrt(alpha2)
    
    # Convergence check
    if(max(abs(theta_new - theta)) < tol || iter >= maxit) break
    theta <- theta_new
    m <- U %*% theta
  }
  
  return(list(iterations=iter, estimate=as.numeric(theta)))
}

# Example usage with the kyphosis data set
library(rpart) # for the kyphosis data set
data(kyphosis)
X <- as.numeric(kyphosis$Kyphosis) - 1
U <- as.matrix(cbind(1, kyphosis[, 2:4]));  colnames(U) <- NULL 

# Run the PXEM algorithm
pxem_res <- pxem_probit(U, X, rep(0, ncol(U)))
print(pxem_res)

