#########################################################################
# COMPUTATIONAL STATISTICS
# by Geof Givens and Jennifer Hoeting
# CHAPTER 9 EXAMPLES
#########################################################################

# pacman must already be installed; then load contributed
# packages (including pacman) with pacman
pacman::p_load(pacman, rio, tidyverse, VGAM)

#Set Working Directory to Source File Location
library("rstudioapi") # Load rstudioapi package
setwd(dirname(getActiveDocumentContext()$path)) # Set working directory to source file location
getwd() # Check updated working directory

############################################################################
# Note: The random number seeds used here sometimes give slightly different
# answers than in the book.
############################################################################

#The toy example with X={1,2,6}
set.seed(123)
# Initial data
X <- c(1, 2, 6)
theta_est <- c(3, 4, 5, 6, 8, 9, 10, 13, 14, 18) / 3
probs <- c(1, 3, 3, 1, 3, 6, 3, 3, 3, 1) / 27

# Number of bootstrap samples
Nboot <- 1000

# bootstrap sampling
boot_means <- replicate(Nboot, mean(sample(X, replace = TRUE)))

# mean of bootstrap samples
boot_mean <- mean(boot_means)

# weighted mean of theta_est
wgted_theta_mean <- sum(theta_est * probs)

# relative frequency of each unique bootstrap sample mean
rel_freq <- table(boot_means) / Nboot

# Plotting
plot(as.numeric(names(rel_freq)), rel_freq,
     type = 'h', lwd = 2, col = 'blue', 
     ylab = "probability", xlab = expression(widehat(theta)),
     main = "Bootstrap Estimates vs. Theoretical Probabilities")
points(theta_est, probs, pch = 19, col = 'red')
legend("topright", legend = c("Bootstrap", "Theoretical"), 
       col = c("blue", "red"), pch = c(15, 19), lwd = c(2, NA))

############################################################################
### Example 9.3-4 G&H Copper-Nickel Alloy (Paired Bootstrap)
############################################################################
# alloy = contains data
# z = contains paired data
# Niter = number of resamples to run
# theta = observed data estimate (beta1/beta0)
# thetas = bootstrapped estimates (beta1/beta0)
############################################################################
## Notes: The R package {boot} can perform a wide variety of bootstrapping 
# techniques and is usually a simpler approach than directly programming a method. 
############################################################################

## Load data
# Uncomment and modify the line below if reading from a file selector
# alloy <- read.table(file.choose(), header = TRUE)
alloy <- read.table(file="../Datasets/alloy.dat", header=TRUE)
head(alloy)

## Initialize
Nboot <- 10000
theta_est <- numeric(Nboot)
set.seed(1)  # For reproducibility

## Main computation
ini_mod <- lm(corrosionloss ~ ironcontent, data=alloy)
ini_ratio <- coef(ini_mod)[2] / coef(ini_mod)[1]

# Bootstrap simulation
for (i in 1:Nboot) {
    boot_data <- alloy[sample(nrow(alloy), replace=TRUE), ]
    boot_mod <- lm(corrosionloss ~ ironcontent, data=boot_data)
    theta_est[i] <- coef(boot_mod)[2] / coef(boot_mod)[1]
}

bias <- mean(theta_est) - ini_ratio

## Output
cat("Observed Data Estimate: ", ini_ratio, "\n")
cat("Estimated Bias: ", bias, "\n")
cat("Bias-Corrected Estimate: ", ini_ratio - bias, "\n")

## Plots
hist(theta_est, breaks=40, main="Distribution of Bootstrap Ratios",
     xlab=expression(theta), ylab="density", col="blue",probability = T)
abline(v = ini_ratio, col = "red", lwd = 4, lty = 2)
legend("topright", legend = c("Initial Ratio"), col = c("red"), lty = c(2), lwd = c(2))

############################################################################
### Example 9.5-6 Copper-Nickel Alloy (Confidence Intervals)
############################################################################
# alloy = contains data
# z = contains paired data
# Niter = number of resamples to run
# theta = observed data estimate (beta1/beta0)
# thetas = bootstrapped estimates (beta1/beta0)
# ci.95 = 95% CI based on empirical percentile
# ci.95.bc = 95% CI based on accelerated bias-corrected percentile
############################################################################
## Notes: It's possible to have a resample that contains only one pair of original
# observations. This will produce a NA in lm(). 
############################################################################

## Initial Values
# alloy <- read.table(file="../Datasets/alloy.dat", header = TRUE, sep = "")
Nboot <- 10000
set.seed(1)  # For reproducibility

## MAIN COMPUTATIONS
# Linear model fitting
fit_ini <- lm(corrosionloss ~ ironcontent, data = alloy)
theta <- coef(fit_ini)[2] / coef(fit_ini)[1]

# Bootstrap simulation
boot_est <- replicate(Nboot, {
    boot_data <- alloy[sample(nrow(alloy), replace = TRUE), ]
    fit_boot <- lm(corrosionloss ~ ironcontent, data = boot_data)
    coef(fit_boot)[2] / coef(fit_boot)[1]
})

# 95% Confidence Intervals
CI_emp <- quantile(boot_est, probs = c(0.025, 0.975))

# Accelerated Bias-Corrected Bootstrap
jack_est <- sapply(1:nrow(alloy), function(i) {
    fit_jack <- lm(corrosionloss ~ ironcontent, data = alloy[-i, ])
    coef(fit_jack)[2] / coef(fit_jack)[1]
})
psi <- mean(jack_est) - jack_est
a <- (1/6) * sum(psi^3) / (sum(psi^2)^(3/2))
z0 <- qnorm(mean(boot_est < theta))

# bias-corrected CIs
adjust <- function(p) {
    z_p = qnorm(p)  # Z-score for the p-th quantile under normal assumption
    numer = z0 + (z0 + z_p)
    denom = 1 - a * (z0 + z_p)
    if (denom == 0) {  # Prevent division by zero
        print("Denominator is zero; adjusting...")
        denom <- 1e-8  # small positive number to avoid division by zero
    }
    adj_p = pnorm(numer / denom)
    
    # Ensure the probs are within the valid range
    adj_p <- max(min(adj_p, 1), 0)
    
    print(sprintf("p: %f, z_p: %f, adj_p: %f", p, z_p, adj_p))  # Debug output
    return(adj_p)
}

CI_BC <- quantile(boot_est, probs = c(adjust(0.025), adjust(0.975)))

## Output
list(theta = theta,
     CI_emp = CI_emp,
     CI_BC = CI_BC)

## Plots
hist(boot_est, breaks = 40, main = "Bootstrap Estimates", xlab = "estimates",ylab="density",prob=T)
abline(v = c(CI_emp, theta), col = c("blue", "red", "darkgreen"), lwd = 2)
legend("topright", legend = c("Lower CL", "Upper CL", "Theta"), col = c("blue", "red", "darkgreen"), lwd = 2)

############################################################################
### Example 9.7 Copper-Nickel Alloy (Studentized Bootstrap)
############################################################################
# alloy = contains data
# z = contains paired data
# Niter = number of resamples to run
# theta = observed data estimate for (beta1/beta0)
# theta.var = estimated variance for theta
# theta.sd = estimated std deviation for theta
# thetas = bootstrapped estimates for (beta1/beta0)
# G = studentized bootstrap values
# ci.95.t  = 95% CI for studentized bootstrap method
############################################################################
## Notes: It's possible to have a resample that contains only one pair of original
# observations. This will produce a NA in lm(). 
############################################################################

## Initial Values
#alloy = read.table(file="../Datasets/alloy.dat",sep="",header=T)
z <- alloy
Nboot <- 10000
set.seed(1)  # Ensuring reproducibility

## Preliminary Linear Model Analysis
fit_ini <- lm(z[, 2] ~ z[, 1])
slope <- coef(fit_ini)[2]
intercept <- coef(fit_ini)[1]
theta <- slope / intercept

## Variance Calculation for Theta
cov_mat <- vcov(fit_ini)
theta_var <- (theta^2) * (
    cov_mat[2, 2] / slope^2 +
        cov_mat[1, 1] / intercept^2 -
        2 * cov_mat[1, 2] / (slope * intercept)
)
theta_SD <- sqrt(theta_var)

## Bootstrap Simulation
boot_est <- numeric(Nboot)
stud_stats <- numeric(Nboot)

for (i in 1:Nboot) {
    sam_ind <- sample(nrow(z), replace = TRUE)
    boot_data <- z[sam_ind, ]
    fit_boot <- lm(boot_data[, 2] ~ boot_data[, 1])
    
    boot_slope <- coef(fit_boot)[2]
    boot_inter <- coef(fit_boot)[1]
    boot_theta <- boot_slope / boot_inter
    
    boot_cov_mat <- vcov(fit_boot)
    boot_var <- (boot_theta^2) * (
        boot_cov_mat[2, 2] / boot_slope^2 +
            boot_cov_mat[1, 1] / boot_inter^2 -
            2 * boot_cov_mat[1, 2] / (boot_slope * boot_inter)
    )
    
    stud_stats[i] <- (boot_theta - theta) / sqrt(boot_var)
}

## Confidence Interval Calculation
quantiles <- quantile(stud_stats, probs = c(0.025, 0.975), na.rm = TRUE)
conf_int <- theta - theta_SD * c(quantiles[2], quantiles[1])

## Results Output
results <- list(
    estimated_theta = theta,
    theta_standard_deviation = theta_SD,
    stud_boot_conf_int = conf_int
)
## Return the results
results

## Histogram of Studentized Bootstrap Estimates
hist(stud_stats, breaks = 40, main = "Histogram of Studentized Bootstrap Estimates",
     xlab = "Studentized Estimates", ylab = "density", prob = TRUE,col="skyblue")

############################################################################
### Example 9.8 Copper-Nickel Alloy (Nested Bootstrap)
############################################################################
# alloy = contains data
# z = contains paired data
# Niter = number of resamples to run for R0
# Niter2 = number of resamples to run for R1
# theta = observed data estimate for (beta1/beta0)
# thetas = bootstrapped estimates for (beta1/beta0)
# thetas2  = nested bootstrapped estimates
# R0 = test statistics for bootstrapped estimates
# R1 = test statistics for nested bootstrapped estimates
# CI_nest  = 95% CI for nested bootstrap method
############################################################################
## Notes: It's possible to have a resample that contains only one pair of original
# observations. This will produce a NA in lm(). 
############################################################################

## Initial Values
#alloy = read.table(file="../Datasets/alloy.dat",sep="",header=T)
z = alloy
Niter = 300
Niter2 = 300
theta = NULL
thetas = rep(0,Niter)
thetas2  = matrix(0,Niter,Niter2)
R0 = rep(0,Niter)
R1 = rep(0,Niter2)
CI_nest  = rep(0,2)
set.seed(1)

## MAIN
model = lm(z[,2]~z[,1])
theta = as.numeric(model$coef[2]/model$coef[1])

for(i in 1:Niter){
      Znew = z[sample(1:length(z[,1]),replace=T),]
      model = lm(Znew[,2]~Znew[,1])
      thetas[i] = model$coef[2]/model$coef[1]
      for(j in 1:Niter2){
            Znew2 = Znew[sample(1:length(z[,1]),replace=T),]
            model = lm(Znew2[,2]~Znew2[,1])
            thetas2[i,j] = model$coef[2]/model$coef[1]
      }
}

R0 = thetas-theta
for(i in 1:Niter2){R1[i] = mean((thetas2[i,]-thetas[i]) <= R0[i])}
R0_UL = quantile(R0, quantile(R1,0.975,na.rm=T), na.rm=T)
R0_LL = quantile(R0, quantile(R1,0.025,na.rm=T), na.rm=T)
CI_nest[1] = theta - R0_UL
CI_nest[2] = theta - R0_LL

## Output
theta # OBSERVED DATA ESTIMATE
CI_nest # 95% CI FOR NESTED BOOTSTRAP METHOD

## Histogram of R1
hist(R1, main = "Histogram of Residual Bootstrap Estimates", prob=T,
     xlab = "Bootstrap Estimates", ylab = "density", col = "skyblue")

############################################################################
### Example 9.9 Industrialized Country GDP Example
############################################################################
# blks = arranges the blks in a matrix for easy manipulation
# xbar  = means from block bootstrap pseudo-datasets
############################################################################
##Initial Values
#gdp=read.table(file.choose(),header=TRUE)
## Create blks of data
num_blks <- 5
blk_size <- 8
blks <- matrix(NA, nrow = num_blks, ncol = blk_size)
for (i in 1:num_blks) {
    start_ind <- (i - 1) * blk_size + 1
    end_ind <- i * blk_size
    blks[i, ] <- gdp[start_ind:end_ind, 2]
}

## Non-Moving Block Bootstrap
set.seed(1)  # Ensuring reproducibility
Nboot <- 10000
boot_means <- numeric(Nboot)  # Preallocate vector for speed

for (i in 1:Nboot) {
    sam_blks_ind <- sample(1:num_blks, num_blks, replace = TRUE)
    sam_data <- as.vector(blks[sam_blks_ind, ])
    boot_means[i] <- mean(sam_data)
}

## Output: Standard Deviation of bootstrap means
boot_sd <- sqrt(var(boot_means))
boot_sd  # Print the bootstrap standard deviation

############################################################################
### Example 9.10 Industrialized Country GDP Example, Cont.
############################################################################
# blks = arranges the blks in a matrix for easy manipulation
# xbar  = means from block bootstrap pseudo-datasets
############################################################################
# NOTE: See errata for typo in the text for this example
############################################################################

## Load Data
gdp <- read.table(file = "../Datasets/gdp.dat", header = TRUE, sep = "")
gdp_change <- gdp[, 2]  # the second column (gdpchange) is of interest

## Create Overlapping Blocks
blk_size <- 8
num_blks <- length(gdp_change) - blk_size + 1
blks <- matrix(NA, nrow = num_blks, ncol = blk_size)
for (i in 1:num_blks) {
    blks[i, ] <- gdp_change[i:(i + blk_size - 1)]
}

## Moving Block Bootstrap
set.seed(1)  # Ensuring reproducibility
Nboot <- 10000
boot_means <- numeric(Nboot)  # Preallocate for efficiency

for (i in 1:Nboot) {
    sam_ind <- sample(1:num_blks, 5, replace = TRUE)
    sam_data <- as.vector(blks[sam_ind, ])
    boot_means[i] <- mean(sam_data)
}

## Output the Standard Deviation of Bootstrap Means
boot_sd <- sqrt(var(boot_means))
boot_sd

## Histogram of Bootstrap Means
hist(boot_means, main = "Histogram of Bootstrap Means", xlab = "Bootstrap Mean Values", 
     ylab="density", col = "lightblue", border = "darkblue",prob=T)

############################################################################
### Example 9.11 Tree Rings
############################################################################
# n = number of data points
# p = number of data points per small block
# big_blk = number of big blks (blks-of-blks)
# sub_blk = number of small blks per big block
# small_blks = one column per small block, p rows
# big_blks = array: row=small block, col=data, 3rd dim=big block 
# lag2cor = bootstrap estimates of lag-2 correlation
############################################################################

##Initial Values
#basal=read.table(file.choose(),header=TRUE)
basal <- read.table(file = "../Datasets/tree.dat", header = TRUE, sep = "")
head(basal)
p <- 3
n <- 452
big_blk <- 450
sub_blk <- 25

## Function to calculate lag-2 correlation
calc_rhat <- function(x) {
    n <- dim(x)[1]
    xwhole <- c(x[, 1], x[(n - 1):n, 3])
    xm <- mean(xwhole)
    rhat <- sum((x[, 1] - xm) * (x[, 3] - xm)) / sum((xwhole - xm)^2)
    return(rhat)
}

## Establish blks and blks-of-blks
small_blks <- t(basal[2:(n - 1), 2])  # Transpose for easier slicing later
big_blks <- array(NA, c(sub_blk, p, big_blk - sub_blk + 1))

for (i in 1:(big_blk - sub_blk + 1)) {
    big_blks[, , i] <- small_blks[, i:(i + sub_blk - 1)]
}

## Blocks-of-Blocks Bootstrap
lag2cor <- numeric(10000)  # Preallocate for performance

set.seed(567890)
for (i in 1:10000) {
    sam_ind <- sample(1:(big_blk - sub_blk + 1), big_blk / sub_blk, replace = TRUE)
    new_data <- array(apply(big_blks[, , sam_ind], 2, c), dim = c(sub_blk, p))
    lag2cor[i] <- calc_rhat(new_data)
}

## Output the Standard Deviation of lag-2 correlations
lag2_sd <- sqrt(var(lag2cor))
lag2_sd

##Note: The {tseries} package includes tsbootstrap() which is a
##  convenient tool for some of these methods and includes a bootstrap
##  bias estimate as provided in the text.

############################################################################
### END OF FILE
