############################################################################
# COMPUTATIONAL STATISTICS
# by Geof H. Givens and J. A. Hoeting
# CHAPTER 2 EXAMPLES
# Modified from code of Givens & Hoeting and R. Martin
#########################################################################

#Set Working Directory to Source File Location
library("rstudioapi")                                 # Load rstudioapi package
setwd(dirname(getActiveDocumentContext()$path))       # Set working directory to source file location
#getwd()   

#some functions for the methods used in G&H
source("AuxFunctions.r")

#########################################################################
### EXAMPLE 2.1 BISECTION
#########################################################################
# a = initial left endpoint
# b = initial right endpoint
# x = initial value
# itr = number of iterations to run
# f = objective function
# f_prime = first derivative of objective function
# eps = convergence criterion
#########################################################################

## INITIAL VALUES
a = 1
b = 5
x = (a + b) / 2
itr = 40
eps = 1e-08  # Convergence criterion

## FUNCTIONS
f = function(x) { log(x) / (1 + x) }
f_prime = function(x) { (1 + (1 / x) - log(x)) / ((1 + x) ^ 2) }

## PLOTTING
curve(f, xlim=c(1, 10))
curve(f, xlim=c(3., 4.3))
abline(v=3.591121, lty=2)

## MAIN
i <- 1
while(abs(f_prime(x)) > eps && i <= itr) {
  if (f_prime(a) * f_prime(x) < 0) {
    b <- x
  } else {
    a <- x
  }
  x <- (a + b) / 2
  i <- i + 1
}

## OUTPUT
print(paste("Final estimate of root:", x))
print(paste("Objective function at estimate:", f(x)))
print(paste("Derivative at estimate:", f_prime(x)))

# Using the bisection function from AuxFunctions.r
print(bisection(f_prime, 1, 5, verbose=TRUE))
# verbose=TRUE argument prints the x^t and f(x^t) values at each iteration

#########################################################################
### bisection to find p-th percentile of student t distribution
#########################################################################

findPercentileT <- function(p, nu, a = -10, b = 10, tol = 1e-6, max.iter = 100) {
  require(stats) # For pt(), the CDF of the Student-t distribution
  
  iter <- 0
  while(iter < max.iter) {
    c <- (a + b) / 2
    Fc <- pt(c, df = nu)
    
    if (abs(Fc - p) < tol) {
      return(c)
    } else if (Fc < p) {
      a <- c
    } else {
      b <- c
    }
    
    iter <- iter + 1
  }
  
  warning("Maximum iterations reached without convergence")
  return(c)
}

# Example: Find the 95th percentile for t-distribution with 10 degrees of freedom
p <- 0.95
nu <- 10
percentile <- findPercentileT(p, nu)
cat("The 95th percentile of the t-distribution with", nu, "degrees of freedom is:", percentile, "\n")

#########################################################################
### EXAMPLE 2.2 NEWTON'S METHOD
#########################################################################

#########################################################################
# x = initial value
# itr = number of iterations to run
# f = objective function
# f_prime = first derivative of objective function
# f_2prime = second derivative of objective function
#########################################################################

## INITIAL VALUES
x = 3 # Try also 0.1, 5, 10 (observe behavior at 10)
itr = 40
eps = 1e-08  # Convergence criterion

## FUNCTIONS
f = function(x) { log(x) / (1 + x) }
f_prime = function(x) { (1 + (1 / x) - log(x)) / ((1 + x) ^ 2) }
f_2prime = function(x) { (-1 / (x^2 + x^3)) - 2 * (1 + (1 / x) - log(x)) / ((1 + x) ^ 3) }

## MAIN
i <- 1
while(abs(f_prime(x)) > eps && i <= itr) {
  x <- x - f_prime(x) / f_2prime(x)
  i <- i + 1
}

## OUTPUT
print(paste("Final estimate of root:", x))
print(paste("Objective function at estimate:", f(x)))
print(paste("Derivative at estimate:", f_prime(x)))

# Using the newton function from AuxFunctions.r
x0 = 3
print(newton(f_prime, f_2prime, x0, verbose=TRUE))

#########################################################################
### SECANT METHOD (on the same function)
#########################################################################

#########################################################################
# x0, x1 = initial values
# itr = number of iterations to run
# f = objective function
# f_prime = first derivative of objective function
#########################################################################
## INITIAL VALUES
x0 = 1.5
x1 = 2.4
itr = 40
eps = 1e-08  # Convergence criterion

## FUNCTIONS
f = function(x) { log(x) / (1 + x) }
f_prime = function(x) { (1 + (1 / x) - log(x)) / ((1 + x) ^ 2) }

## MAIN
x_old = x0
x = x1
f_pri_old = f_prime(x_old)
f_pri = f_prime(x)

for (i in 1:(itr - 1)) {
  if (abs(f_pri - f_pri_old) < eps) {
    warning("Small or no change in function value; might lead to division by zero.")
    break
  }
  x_new = x - f_pri * (x - x_old) / (f_pri - f_pri_old)
   if (abs(x_new - x) < eps) break
  x_old = x
  f_pri_old = f_pri
  x = x_new
  f_pri = f_prime(x)
}

## OUTPUT
print(paste("Final estimate of root:", x))
print(paste("Objective function at estimate:", f(x)))
print(paste("Derivative at estimate:", f_prime(x)))

# Using the secant function from AuxFunctions.r
print(secant(f_prime, x0, x1, verbose=TRUE))

#########################################################################
### EXAMPLE 2.3 (SCALED) FIXED POINT ALGORITHM
#########################################################################

#########################################################################
# alpha = scale parameter
# x = initial value
# itr = number of iterations to run
# f = objective function
# f_prime = first derivative of objective function
#########################################################################

## INITIAL VALUES
alpha = 4 # Try also 2, 3, 5
x = 1.75
itr = 100
eps = 1e-08  # Convergence criterion

## OBJECTIVE FUNCTION AND DERIVATIVE
f = function(x) { log(x) / (1 + x) }
f_prime = function(x) { (1 + (1 / x) - log(x)) / ((1 + x) ^ 2) }

## MAIN
x_old = x
for(i in 1:itr) {
  x = alpha * f_prime(x) + x
  if(abs(x - x_old) < eps) break
  x_old = x
}

## OUTPUT
print(paste("Final estimate of root:", x))
print(paste("Objective function at estimate:", f(x)))
print(paste("Derivative at estimate:", f_prime(x)))

# Using the fixed_pnt_iter function from AuxFunctions.r
F = function(x, ...) { alpha * f_prime(x) + x }
x0 = 1.75
print(fixed_pnt_iter(F, x0, verbose=TRUE))

#########################################################################
### EXAMPLE 2.4 NEWTON'S METHOD (BIVARIATE)
#########################################################################

#########################################################################
# x = initial value
# itr = number of iterations to run
# x_vals = contains values of x for each iteration
# f = objective function
# f_prime = first derivative of objective function
# f_2prime = second derivative of objective function
#########################################################################

## CAVEAT: The objective function in the following example is the negative of the Himmelblau's function 
# (In mathematical optimization, Himmelblau's function is a multi-modal function,
# used to test the performance of optimization algorithms. The function is defined by:
# f(x,y)=(x^{2}+y-11)^{2}+(x+y^{2}-7)^{2}. 
# It has one local maximum at x=-0.270845 and y=-0.923039 where f(x,y)=181.617, 
# and four identical local minima:
# f(3.0,2.0)=0.0, 
# f(-2.805118,3.131312)=0.0, 
# f(-3.779310,-3.283186)=0.0, 
# f(3.584428,-1.848126)=0.0.  
# This is different than the example in the book.

#########################################################################

# Himmelblau's function
hb <- function(x, y) {
  (x^2 + y - 11)^2 + (x + y^2 - 7)^2
}

# prepare variables for plotting
x <- y <- seq(-6, 6, length = 125)
z <- outer(x, y, hb)

# plot the 3D surface
persp(x, y, z, theta = 45, phi = 30, col = "lightblue", ltheta = 120)
# contour plot of the surface
contour(x, y, z, nlevels = 160, drawlabels = TRUE)

## INITIAL VALUES
x <- c(-2, -2)
itr <- 40
x_vals <- matrix(0, itr + 1, 2)
x_vals[1, ] <- x

## OBJECTIVE FUNCTION AND DERIVATIVES
#negative of the himmelblau function
f <- function(x) {
  -1 * (((x[1]^2 + x[2] - 11)^2) + (x[1] + x[2]^2 - 7)^2)
}

f_prime <- function(x) {
  c(
    -1 * (4*x[1]^3 + 4*x[1]*x[2] - 42*x[1] + 2*x[2]^2 - 14),
    -1 * (2*x[1]^2 - 26*x[2] - 22 + 4*x[1]*x[2] + 4*x[2]^3)
  )
}

f_2prime <- function(x) {
  matrix(c(
    -1 * (12*x[1]^2 + 4*x[2] - 42),
    -1 * 4 * (x[1] + x[2]),
    -1 * 4 * (x[1] + x[2]),
    -1 * (12*x[2]^2 + 4*x[1] - 26)
  ), nrow = 2, byrow = TRUE)
}

## MAIN
for (i in 1:itr) {
  x <- x - solve(f_2prime(x)) %*% f_prime(x)
  x_vals[i + 1, ] <- as.numeric(x)
}

## OUTPUT
print(paste("Final estimate:", paste(x, collapse = ", ")))
print(paste("Objective function at estimate:", f(x)))
print(paste("Gradient at estimate:", f_prime(x)))

## PLOT OF CONVERGENCE
nx1 <- nx2 <- 200
x1_ran <- range(c(-6, 6, x_vals[, 1]))
x2_ran <- range(c(-6, 6, x_vals[, 2]))
x1 <- seq(x1_ran[1], x1_ran[2], length = nx1)
x2 <- seq(x2_ran[1], x2_ran[2], length = nx2)
z <- outer(x1, x2, Vectorize(function(x1, x2) f(c(x1, x2))))

contour(x1, x2, z, nlevels = 100, drawlabels = TRUE)
for (i in 1:itr) {
  segments(x_vals[i, 1], x_vals[i, 2], x_vals[i + 1, 1], x_vals[i + 1, 2], lty = 2)
}

# Using the mv_newton function from AuxFunctions.r
x0 <- c(-2, -2)
mv_newton(f_prime, f_2prime, x0)

#########################################################################
### EXAMPLE 2.5 HUMAN FACE RECOGNITION (IRLS)
#########################################################################

#########################################################################
###  Note: the data set provided here differs slightly from the data used
###  in the book: it omits 30 cases.  Thus, the answer here is slightly different
#
# face_dat = observed data
# n = number of observations
# itr = number of iterations to run
# y = observed response data (1=correct match)
# z = covariate matrix
# w = weights
# beta_val = beta estimates at each iteration
# pi_val = pi estimates at each iteration
#########################################################################

## INITIAL VALUES 
#THE INPUT FILE IS: facerecognition.dat (choose this one, not the .txt one!)
#face_dat = read.table(file.choose(),header=TRUE,sep=" ")
## Read Data
face_dat = read.table("../Datasets/facerecognition.dat", header = TRUE, sep = " ")
head(face_dat)

## INITIAL VALUES
n <- nrow(face_dat)
itr <- 40
y <- face_dat$match
z <- cbind(1, face_dat$eyediff)  # Adding a column of 1s for the intercept
beta_val <- matrix(0, itr + 1, 2)
beta_val[1, 1] <- 0.9591309  # Initial beta value

## MAIN
for (j in 1:itr) {
  eta <- z %*% beta_val[j, ]
  pi_val <- 1 / (1 + exp(-eta))
  w <- pi_val * (1 - pi_val)
  W <- diag(w)  # Diagonal matrix of weights
  XW <- sweep(z, 1, w, "*")
  hessian <- t(z) %*% XW
  score <- t(z) %*% (y - pi_val)
  beta_val[j + 1, ] <- beta_val[j, ] + solve(hessian, score)
}

## OUTPUT
final_beta <- beta_val[itr + 1, ]
print(paste("Final estimates: beta1 =", final_beta[1], "beta2 =", final_beta[2]))

# Can you also plot the convergence of beta as before (i.e., the steps of the beta vector as algorithm proceeds)?
# Left as exercise.

#also with the mv_newton function from AuxFunctions.r (exercise, needs some more work to adapt)

#########################################################################
### EXAMPLE 2.6 STEEPEST ASCENT
#########################################################################

#########################################################################
# x = initial value
# M = Hessian approximation
# itr = number of iterations to run
# alpha = scale parameter
# x_vals = contains values of x for each iteration
# f = objective function
# f_prime = first derivative of objective function
#########################################################################
## CAVEAT: The objective function in the following example is the negative of
# Himmelblau's function.  This is different than the function used in the book.
#########################################################################

## INITIAL VALUES
x <- c(0, 0)
M <- diag(-1, 2, 2)  # Hessian approximation
itr <- 40
alpha_def <- 1
alpha <- alpha_def
x_vals <- matrix(0, itr + 1, 2)
x_vals[1, ] <- x

## Modified OBJECTIVE FUNCTION to accept two arguments
f <- function(x1, x2) {
  x <- c(x1, x2)
  -((x[1]^2 + x[2] - 11)^2 + (x[1] + x[2]^2 - 7)^2)
}

## Modified to use vector x directly
f_vec <- function(x) {
  f(x[1], x[2])
}

f_prime <- function(x) {
  c(-4*x[1]^3 - 4*x[1]*x[2] + 42*x[1] - 2*x[2]^2 + 14,
    -2*x[1]^2 + 26*x[2] + 22 - 4*x[1]*x[2] - 4*x[2]^3)
}

## MAIN
hess_inv <- solve(M)  # Inverse of Hessian approximation (constant in this case)
for (i in 1:itr) {
  xt <- x - alpha * hess_inv %*% f_prime(x)
  # REDUCE ALPHA UNTIL A CORRECT STEP IS REACHED
  while (f_vec(xt) < f_vec(x)) {
    alpha <- alpha / 2
    xt <- x - alpha * hess_inv %*% f_prime(x)
  }
  x_vals[i + 1, ] <- x <- xt
  alpha <- alpha_def  # Reset alpha
}

## OUTPUT
final_est <- x
obj_val <- f_vec(x)
grad_at_est <- f_prime(x)

print(paste("Final estimate:", paste(final_est, collapse = ", ")))
print(paste("Objective function at estimate:", obj_val))
print(paste("Gradient at estimate:", paste(grad_at_est, collapse = ", ")))

## PLOT OF CONVERGENCE
# No need to Vectorize since f now takes two arguments
x1_ran <- range(c(-6, 6, x_vals[, 1]))
x2_ran <- range(c(-6, 6, x_vals[, 2]))
x1 <- seq(x1_ran[1], x1_ran[2], length.out = 100)
x2 <- seq(x2_ran[1], x2_ran[2], length.out = 100)
z <- outer(x1, x2, Vectorize(f))

contour(x1, x2, z, nlevels = 20, drawlabels = FALSE)
for(i in 1:itr) {
  segments(x_vals[i, 1], x_vals[i, 2], x_vals[i + 1, 1], x_vals[i + 1, 2], lty = 2)
}

# Using the steepest_ascent function from AuxFunctions.r
x0 <- c(0, 0)
alpha <- 0.01
df <- function(x) { as.vector(f_prime(x)) }
steepest_ascent(df, x0, alpha, eps = 1e-04, maxiter = 1000)

#########################################################################
### EXAMPLE 2.7 QUASI-NEWTON METHOD (BFGS UPDATE AND BACKTRACKING)
#########################################################################

#########################################################################
# x = initial value
# M = Hessian approximation
# itr = number of iterations to run
# x_vals = contains values of x for each iteration
# alpha = scale parameter
# eps = tolerance
# f = objective function
# f_prime = first derivative of objective function
#########################################################################

#########################################################################
## NOTES: The objective function in the following example is the negative of
# Himmelblau's function.  This is different than the book
#########################################################################

## INITIAL VALUES
x <- c(-4, 0)
M <- diag(-1, 2, 2)
itr <- 40
eps <- 1e-10
x_vals <- matrix(0, itr + 1, 2)
x_vals[1, ] <- x
alpha_def <- 1
alpha <- alpha_def

## OBJECTIVE FUNCTION AND DERIVATIVES
f <- function(x1, x2) {
  -(((x1^2 + x2 - 11)^2) + (x1 + x2^2 - 7)^2)
}

f_vec <- function(x) {
  f(x[1], x[2])
}

f_prime <- function(x) {
  c(-4*x[1]^3 - 4*x[1]*x[2] + 42*x[1] - 2*x[2]^2 + 14,
    -2*x[1]^2 + 26*x[2] + 22 - 4*x[1]*x[2] - 4*x[2]^3)
}

## MAIN
for (i in 1:itr) {
  hess_inv <- solve(M)
  dfx <- f_prime(x)
  xt <- x - alpha * hess_inv %*% dfx
  while (f_vec(xt) < f_vec(x) && alpha > eps) {
    alpha <- alpha / 2
    xt <- x - alpha * hess_inv %*% dfx
  }
  x_vals[i + 1, ] <- xt
  z <- xt - x
  y <- f_prime(xt) - dfx
  v <- y - M %*% z
  M.old <- M
  o1 <- (t(z) %*% M %*% z)[1]
  o2 <- (t(z) %*% y)[1]
  if (o1 > eps && o2 > eps) {
    M <- M - (M %*% z %*% t(M %*% z)) / o1 + (y %*% t(y)) / o2
  } else {
    M <- M.old
  }
  alpha <- alpha_def
  x <- xt
}

## OUTPUT
final_est <- x
obj_val <- f_vec(x)
grad_at_est <- f_prime(x)

print(paste("Final estimate:", paste(final_est, collapse = ", ")))
print(paste("Objective function at estimate:", obj_val))
print(paste("Gradient at estimate:", paste(grad_at_est, collapse = ", ")))

## PLOT OF CONVERGENCE
x1_ran <- range(c(-6, 6, x_vals[, 1]))
x2_ran <- range(c(-6, 6, x_vals[, 2]))
x1 <- seq(x1_ran[1], x1_ran[2], length.out = 100)
x2 <- seq(x2_ran[1], x2_ran[2], length.out = 100)
z <- outer(x1, x2, f)

contour(x1, x2, z, nlevels = 20, drawlabels = FALSE)
for (i in 1:itr) {
  segments(x_vals[i, 1], x_vals[i, 2], x_vals[i + 1, 1], x_vals[i + 1, 2], lty = 2)
}

# Using the bfgs function from AuxFunctions.r
x = c(-4,0)
M = diag(-1,2,2)

bfgs(f_vec, f_prime, x, M)

#########################################################################
### EXAMPLE 2.8 - 2.9 NELDER-MEAD
#########################################################################

#########################################################################
## NOTE: These examples are meant to illustrate concepts rather than a
#  detailed implementation. For example, the Nelder-Mead algorithm is the 
#  default method used in the R function optim()
#########################################################################

#########################################################################
### EXAMPLE 2.10 GAUSS-SEIDEL (with NEWTON method applied to each component)
#########################################################################

#########################################################################
# x = initial value
# itr = number of iterations to run
# x_vals = contains values of x for each iteration
# alpha = scale parameter
# f = objective function
# f_prime = first derivative of objective function
# f_2prime = second derivative of objective function
#########################################################################

#########################################################################
## NOTES: The objective function in the following example is the negative of
# Himmelblau's function.  This is different than in the book.  Also,
# the results of the example here are very sensitive to starting value.
#########################################################################

## INITIAL VALUES
x <- c(-1.5, 1.5)
itr <- 100
x_vals <- matrix(0, itr + 1, 2)
x_vals[1, ] <- x

## OBJECTIVE FUNCTION AND DERIVATIVES
f <- function(x1, x2) {
  -(((x1^2 + x2 - 11)^2) + (x1 + x2^2 - 7)^2)
}

f_prime <- function(x) {
  c(-4*x[1]^3 - 4*x[1]*x[2] + 42*x[1] - 2*x[2]^2 + 14,
    -2*x[1]^2 + 26*x[2] + 22 - 4*x[1]*x[2] - 4*x[2]^3)
}

f_2prime <- function(x) {
  matrix(c(-12*x[1]^2 - 4*x[2] + 42, -4*(x[1] + x[2]),
           -4*(x[1] + x[2]), -12*x[2]^2 - 4*x[1] + 26), nrow = 2)
}

## MAIN
for (j in 1:itr) {
  # Update first variable
  xt1 <- x[1] - (f_prime(x)[1] / f_2prime(x)[1, 1])
  x[1] <- xt1
  x_vals[j + 1, 1] <- xt1
  
  # Update second variable
  xt2 <- x[2] - (f_prime(x)[2] / f_2prime(x)[2, 2])
  x[2] <- xt2
  x_vals[j + 1, 2] <- xt2
}

## OUTPUT
final_est <- x
obj_val <- f(x[1], x[2])
grad_at_est <- f_prime(x)

print(paste("Final estimate:", paste(final_est, collapse = ", ")))
print(paste("Objective function at estimate:", obj_val))
print(paste("Gradient at estimate:", paste(grad_at_est, collapse = ", ")))

g <- Vectorize(f)
x1_ran <- range(c(-6, 6, x_vals[, 1]))
x2_ran <- range(c(-6, 6, x_vals[, 2]))
x1 <- seq(x1_ran[1], x1_ran[2], length.out = 100)
x2 <- seq(x2_ran[1], x2_ran[2], length.out = 100)
z <- outer(x1, x2, g)

contour(x1, x2, z, nlevels = 20, drawlabels = FALSE)
for (i in 1:itr) {
  segments(x_vals[i, 1], x_vals[i, 2], x_vals[i + 1, 1], x_vals[i + 1, 2], lty = 2)
}

#########################################################################
### END OF FILE

