#================================================================================#
# Stat 7650: Computational Statistics -- Spring 2025                             #
# R code examples for basic root-finding and optimization                        #
# Based in part on Givens & Hoeting (Chap 2) and Lange (Chap 5, 14)              #
# Modified from code of R. Martin                                                #
#================================================================================#

#Set Working Directory to Source File Location
library("rstudioapi")                                 # Load rstudioapi package
setwd(dirname(getActiveDocumentContext()$path))       # Set working directory to source file location
#getwd()   

#some functions for the methods used in G&H
source("AuxFunctions.r")

#########################################################################
## PART I: Univariate Methods
#########################################################################

#########################################################################
# Example: Maximizing the function g(x) = (log x) / (1 + x) in Givens & Hoeting
#########################################################################
# Define the function g(x)
g <- function(x) log(x) / (1 + x)
# Plot the function
curve(g, xlim=c(1, 10), main="Plot of g(x) = log(x) / (1 + x)")
# First derivative of g, needed for optimization
f <- function(x) (1 + 1 / x - log(x)) / (1 + x)^2
# Second derivative of g, needed for Newton's method
df <- function(x) -1 / x^2 / (1 + x) - 2 * (1 + 1 / x - log(x)) / (1 + x)^3

# Execute optimization methods
cat("Bisection method result:\n")
bisection(f, 1, 5, verbose=TRUE)
cat("Newton's method result:\n")
newton(f, df, 4, verbose=TRUE)
cat("Secant method result:\n")
secant(f, 1, 5, verbose=TRUE)

#########################################################################
# Example: Finding Percentiles of a Student-t Distribution.
#########################################################################
# Objective function: Difference between Student-t CDF and target percentile
f <- function(x, nu, p) pt(x, df = nu) - p

# Derivative of the objective function: PDF of the Student-t distribution
df <- function(x, nu, p) dt(x, df = nu)

# Plot the Student-t distribution's CDF for df = 3
plot(x = seq(-4, 7, by = 0.01), y = pt(seq(-4, 7, by = 0.01), df = 3), type = "l",
     main = "cdf of Student-t distribution (df = 3)", xlab = "quantile", ylab = "probability",
     xlim = c(-4, 7), col = "blue")
abline(h = 0.7, lty = 2, col = "red") # Target percentile line

# Theoretical quantile using R's built-in function
theo_quant <- qt(0.7, df = 3)
cat("Theoretical Quantile (qt):", theo_quant, "\n")

# Find the quantile using bisection method
bisect_quant <- bisection(f, -4, 7, nu = 3, p = 0.7)
cat("Bisection Method Quantile: \n")
bisect_quant

# Find the quantile using Newton's method
# Notice that Newton's method fails here if you have large initial value (say >=1.95), where derivative is virtually zero
newt_quant <- newton(f, df, 1.93, nu = 3, p = 0.7)
cat("Newton's Method Quantile: \n")
newt_quant

# Find the quantile using secant method
sec_quant <- secant(f, 1, 4, nu = 3, p = 0.7)
cat("Secant Method Quantile: \n")
sec_quant

#########################################################################
# Example: Kepler's Law and Functional Iteration
#########################################################################
# x = eccentric anomaly
# M = mean anomaly
# A = eccentricity (a number between 0 and 1)

# Kepler's law identifies the relationship:  x = M + A * sin(x)
# Use functional iteration to solve for x for a given (M, A)
# Define Kepler's function for the fixed-point iteration
F <- function(x, M, A) {
  if (A <= 0 || A >= 1) {
    stop("Eccentricity (A) must be in the range (0, 1).")
  }
  return(M + A * sin(x))
}

# Example parameters for Kepler's equation
M <- 7  # Mean anomaly
A <- 0.7  # Eccentricity, should be in the range (0,1)

# Solve for x using the fixed-point iteration
soln <- fixed_pnt_iter(F, 1, M = M, A = A, verbose = TRUE)

cat("The solution to Kepler's equation is approximately:\n")
soln

#########################################################################
## PART 2: Multivariate versions
#########################################################################

#########################################################################
# Example: Gamma MLEs
#########################################################################
# Data: observed values
X <- c(6.00, 5.98, 7.81, 6.77, 10.64, 13.63, 10.53, 8.92, 3.77, 5.78,
       6.32, 4.44, 5.34, 1.54, 4.42, 4.71, 6.12, 2.57, 9.47, 9.03)
S1 <- sum(log(X))  # Sum of log-transformed X
S2 <- sum(X)  # Sum of X
n <- length(X)  # Sample size

# Log-likelihood function for gamma distribution
loglik <- function(alpha, beta) {
  L <- -n * alpha * log(beta) - n * log(gamma(alpha)) + (alpha - 1) * S1 - S2 / beta
  return(L)
}

# Grid for alpha (shape) and beta (rate) parameters
A <- seq(1, 10, len = 100)
B <- seq(0.1, 3.5, len = 100)  # Start from 0.1 to avoid division by zero
# Compute log-likelihood for each (alpha, beta) pair
like_AB <- outer(A, B, Vectorize(loglik))

# Contour plot of the log-likelihood
contour(A, B, like_AB, xlab = expression(alpha), ylab = expression(beta), 
        levels = seq(min(like_AB), max(like_AB), length.out = 100), main = "Log-Likelihood Contour")
grid()

# Derivative of log-likelihood
dl <- function(theta) {
  alpha <- theta[1]
  beta <- theta[2]
  o1 <- -n * log(beta) - n * digamma(alpha) + S1
  o2 <- -n * alpha / beta + S2 / (beta^2)
  return(c(o1, o2))
}

# Second derivative (Hessian) of log-likelihood
ddl <- function(theta) {
  alpha <- theta[1]
  beta <- theta[2]
  o11 <- -n * trigamma(alpha)
  o12 <- -n / beta
  o22 <- n * alpha / (beta^2) - 2 * S2 / (beta^3)
  return(matrix(c(o11, o12, o12, o22), ncol = 2))
}

# Initial estimates based on the method of moments
theta0 <- c(mean(X)^2 / var(X), var(X) / mean(X))

# Newton's method for optimization
out <- mv_newton(dl, ddl, theta0)$solution
cat("Starting and finishing values for Newton's method:\n")
print(rbind(start = theta0, finish = out))

# Plot initial and final estimates
points(x = c(theta0[1], out[1]), y = c(theta0[2], out[2]), pch = c(20, 4), col = c("green", "red"), cex = 1.5)
text(x = out[1], y = out[2], labels = "Newton Finish", pos = 4, col = "red")

# Steepest ascent for the gamma MLE
## Just for illustration -- not for HW, "real life", etc...
# requires the stuff defined in gamma MLE example above
x0 <- c(2, 2)
gamma_ascent <- steepest_ascent(dl, x0, 0.01)  

lines(gamma_ascent$path[, 1], gamma_ascent$path[, 2], col = "blue", lwd = 2)
gamma_newton <- mv_newton(dl, ddl, x0)$solution
points(gamma_newton[1], gamma_newton[2], pch = 4, col = "blue", cex = 1.5)
text(gamma_newton[1], gamma_newton[2], labels = "Steepest Ascent Finish", pos = 2, col = "blue")

#########################################################################
# Example: Problem 2.3 in Givens & Hoeting
#########################################################################
# Data setup
y <- c(6, 6, 6, 6, 7, 9, 10, 10, 11, 13, 16, 17, 19, 20, 22, 23, 25, 32, 32, 34, 35,
       1, 1, 2, 2, 3, 4, 4, 5, 5, 8, 8, 8, 8, 11, 11, 12, 12, 15, 17, 22, 23)
w <- c(0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, rep(1, 21))
x <- c(rep(1, 21), rep(0, 21))

# Log-likelihood function
loglik <- function(theta) {
  alpha <- exp(theta[1])
  beta0 <- theta[2]
  beta1 <- theta[3]
  log_S <- -y^alpha * exp(beta0 + beta1 * x)
  log_f <- log(alpha) + (alpha - 1) * log(y) + beta0 + beta1 * x + log_S
  sum((1 - w) * log_S + w * log_f) 
}

# Derivative of the log-likelihood function
dloglik <- function(theta) {
  alpha <- exp(theta[1])
  beta0 <- theta[2]
  beta1 <- theta[3]
  log_S <- -y^alpha * exp(beta0 + beta1 * x)
  d_alpha <- sum((log(y) * log_S + w * (1 / alpha + log(y))) * alpha)
  d_beta0 <- sum(log_S + w)
  d_beta1 <- sum(x * (log_S + w))
  c(d_alpha, d_beta0, d_beta1) 
}

# Initial parameter estimates
theta0 <- c(0, -3, -3)
M <- -diag(c(400, 30, 9))
bfgs(f=loglik, df=dloglik, x=theta0, M=M, hess=TRUE)

g <- function(x) -loglik(x) # Negate loglik because optim is a minimizer
opt_result <- optim(fn=g, par=theta0, method="BFGS", hessian=TRUE)
opt_result

#More specifically
# Print the optimized parameters and log-likelihood value
cat("Optimized Parameters:\n")
print(opt_result$par)
cat("Optimized Log-Likelihood Value:\n")
print(opt_result$value) 

#########################################################################
# Example: Probit Regression
#########################################################################
# Simulating data
set.seed(123) # Ensure reproducibility
A <- 0; B <- 1
x <- rnorm(50, 0, 4)
y <- rbinom(50, 1, pnorm(A + B * x))

# Log-likelihood function for a logistic regression model
loglik <- function(theta) {
  a <- theta[1]
  b <- theta[2]
  p <- pnorm(a + b * x) # Probability of success
  log_likelihood <- sum(dbinom(y, size = 1, prob = p, log = TRUE))
  return(log_likelihood)
}

# Derivative (Gradient) of the log-likelihood function
dloglik <- function(theta) {
  a <- theta[1]
  b <- theta[2]
  z <- a + b * x
  p <- pnorm(z)
  dp <- dnorm(z)
  gradient <- ((y - p) * dp) / (p * (1 - p))
  grad_a <- sum(gradient)
  grad_a[is.na(grad_a)] <- 0
  grad_b <- sum(x * gradient)
  return(matrix(c(grad_a, grad_b), ncol=1)) # Return as a colmn vector
}

dloglik <- function(theta) {
  a <- theta[1]
  b <- theta[2]
  p <- pnorm(a + b * x)
  dp <- dnorm(a + b * x)
  u <- (y - p) * dp / p / (1 - p)
  u[is.na(u)] <- 0
  return(matrix(c(sum(u), sum(u * x)), ncol=1))
}

# Initial parameter estimates
theta0 <- rep(0, 2)
M <- -diag(rep(1, 2))

# Using custom BFGS optimization
bfgs_res <- bfgs(f = loglik, df = dloglik, x = theta0, M = M, hess = TRUE)
bfgs(f=loglik, df=dloglik, x=theta0, M=M, hess=TRUE); 

cat("Results from custom BFGS optimization:\n")
print(bfgs_res)

# Visualization of the data and fitted probability curve
plot(x, y, main = "data and fitted probabilities", xlab = "x", ylab = "probability")
curve(pnorm(bfgs_res$x[1] + bfgs_res$x[2] * x), add = TRUE, col = 2, lwd = 2)

# Using R's built-in optim function for comparison
opt_res <- optim(par = theta0, fn = function(x) -loglik(x), gr = function(x) -dloglik(x),
                       method = "BFGS", hessian = TRUE)
cat("Results from R's optim:\n")
print(opt_res$par)

# Add the curve from R's optim results for comparison
curve(pnorm(opt_res$par[1] + opt_res$par[2] * x), add = TRUE, col = 3, lwd = 2, lty = 2)
legend("topleft", legend = c("Custom BFGS", "R's optimizer"), col = c(2, 3), lwd = 2, lty = 1:2)

