#================================================================================#
# Stat 7650: Computational Statistics -- Spring 2025                             #
# R code examples for Bootstrap                                                  #
# Based in part on Givens & Hoeting (Chap 9) and Lange (Chap 24)            #
# Modified from code of R. Martin                                                #
#================================================================================#

#Set Working Directory to Source File Location
library("rstudioapi")                                 # Load rstudioapi package
setwd(dirname(getActiveDocumentContext()$path))       # Set working directory to source file location
#getwd()  

# Set the path for saving plots
path.name <- "../STAT7650-Slides/figures"

############################################################################
# Example -- Bootstrap Approximation of the Variance of the Sample Median.
############################################################################
# parameters
n <- 21
mu <- 0
set.seed(77)

# t-distributed data with 1 degree of freedom
X <- mu + rt(n, df=1)
# The median of the sample
M <- median(X)

# Number of bootstrap samples
B <- 5000
# bootstrapping to estimate the distribution of the median
Mboot <- replicate(B, {
  Xboot <- sample(X, size = n, replace = TRUE)
  median(Xboot)
})

# Histogram of the bootstrap medians
hist(Mboot, freq = FALSE, col = "gray", border = "white", main = "Bootstrap Distribution of Median",
     xlab = "Medians", ylab = "Density")
abline(v = M, col = "red", lwd = 2, lty = 2)

# Variance estimates
var_est <- c(exact = 0.1367, clt = 0.1175, bootstrap = var(Mboot))
print(var_est)

############################################################################
# Example -- Variance of Sample Median (Cont'd)
############################################################################
# Parameters
n <- 21
mu <- 0
set.seed(77)

# t-distributed data with 1 degree of freedom and calculate the median
X <- mu + rt(n, df=1)
M <- median(X)

# Number of bootstrap samples
B <- 5000
# bootstrap medians for efficiency
Mboot <- numeric(B)

# Bootstrapping to estimate the distribution of the median
for(j in 1:B) {
  # Generate bootstrap samples centered around the median of the original data
  Xboot <- M + rt(n, df=1)
  Mboot[j] <- median(Xboot)
}

# Print the variance of the bootstrap medians
boot_var <- var(Mboot)
print(boot_var)

############################################################################
# Example -- Bootstrap in Regression (Givens and Hoeting, Example 9.3).
############################################################################

# Set up the data
x <- c(0.01, 0.48, 0.71, 0.95, 1.19, 0.01, 0.48, 1.44, 0.71, 1.96, 0.01, 1.44, 1.96)
y <- c(127.6, 124.0, 110.8, 103.9, 101.5, 130.1, 122.0, 92.3, 113.1, 83.7, 128.0, 91.4, 86.2)
n <- length(x)

# Number of bootstrap samples
B <- 5000
# Preallocate vectors for efficiency
theta_paired <- numeric(B)  
theta_resid <- numeric(B)

# Paired bootstrap
for(j in 1:B) {
  iz <- sample(1:n, n, replace=TRUE)
  reg <- lm(y[iz] ~ x[iz])
  theta_paired[j] <- coef(reg)[2] / coef(reg)[1]
}

# Histogram of theta estimates from paired bootstrap
#pdf(file=paste0(path.name, "/boot_theta_paired.pdf"), width=7, height=5)
hist(theta_paired, freq=FALSE, col="gray", border="white", 
     main="Paired Bootstrap Distribution",
     xlab=expression(theta[paired]),ylab="density")
#dev.off()

print(summary(theta_paired))

# Linear model for residual bootstrap
reg0 <- lm(y ~ x)
betahat <- coef(reg0)
resid <- residuals(reg0)
fitted <- fitted(reg0)

# Perform residual bootstrap
for(j in 1:B) {
  ir <- sample(1:n, size=n, replace=TRUE)
  yy <- fitted + resid[ir]
  reg <- lm(yy ~ x)
  theta_resid[j] <- coef(reg)[2] / coef(reg)[1]
}

# Histogram of theta estimates from residual bootstrap
hist(theta_resid, freq=FALSE, col="gray", border="white", 
     main="Residual Bootstrap Distribution",
     xlab=expression(theta[resid]),ylab="density")

print(summary(theta_resid))

############################################################################
# Example -- Bootstrap CIs (Percentile and t Methods) in Regression
############################################################################
# Set parameters
alpha <- 0.05
B <- 5000

# Percentile confidence interval for paired bootstrap results
CI_p <- quantile(theta_paired, c(alpha / 2, 1 - alpha / 2))
print(CI_p)

# Fit the original regression model
reg0 <- lm(y ~ x)
b <- coef(reg0)
Cov <- vcov(reg0)
theta_hat <- b[2] / b[1]

# Variance of the ratio of coefficients
var_theta <- theta_hat^2 * (Cov[2, 2] / b[2]^2 + Cov[1, 1] / b[1]^2 - 2 * Cov[1, 2] / (b[1] * b[2]))

# Preallocate vector for storing bootstrap t statistics
Tstar <- numeric(B)

# Bootstrap to simulate the distribution of the t statistic
for(j in 1:B) {
  iz <- sample(1:n, n, replace=TRUE)
  reg <- lm(y[iz] ~ x[iz])
  bb <- coef(reg)
  Sigma <- vcov(reg)
  TS <- bb[2] / bb[1]
  se <- TS^2 * (Sigma[2, 2] / bb[2]^2 + Sigma[1, 1] / bb[1]^2 - 2 * Sigma[1, 2] / (bb[1] * bb[2]))
  Tstar[j] <- (TS - theta_hat) / sqrt(se)
}

# Compute quantiles for bootstrap t confidence interval
Qcrit <- quantile(Tstar, c(alpha / 2, 1 - alpha / 2))
t_CI <- theta_hat + Qcrit * sqrt(var_theta)  # bootstrap t interval
print(t_CI)

############################################################################
# Example -- m-out-of-n Bootstrap for Uniform Data
############################################################################
# Parameters setup
theta <- 1
n <- 100
m <- n  # regular bootstrap
set.seed(77)
X <- theta * runif(n)
M <- max(X)

# Bootstrap parameters
B <- 5000
Tstar <- numeric(B)

# Generating bootstrap samples
Tstar <- sapply(1:B, function(b) -m * (max(sample(X, m, replace=TRUE)) - M))

# Plotting the bootstrap distribution
#pdf(file=paste0(path.name, "/boot_fail.pdf"), width=7, height=5)
hist(Tstar, freq=FALSE, col="gray", border="white", 
     main="Bootstrap vs. Theoretical Distribution", 
     xlab=expression(paste("T", "*")), 
     ylab="density")
curve(dexp(x, rate=1 / theta), col=2, add=TRUE, lwd=2)
legend("topright", legend=c("Emp. Bootstrap", "Theo. Exponential"), 
       col=c("gray", "red"), lty=1, fill=c("gray", NA), bty="n", lwd=c(2, 2))
#dev.off()

########################################################
#m-out-of-n Bootstrap
m <- round(2 * sqrt(n))
X <- theta * runif(n)
M <- max(X)

# Bootstrap parameters
B <- 5000
Tstar <- numeric(B)

# Generating bootstrap samples
Tstar <- sapply(1:B, function(b) -m * (max(sample(X, m, replace=TRUE)) - M))

# Plotting the bootstrap distribution
#pdf(file=paste0(path.name, "/boot_moutofn.pdf"), width=7, height=5)
hist(Tstar, freq=FALSE, col="gray", border="white", 
     main="Bootstrap Distribution vs. Theoretical", 
     xlab=expression(paste("T", "*")), 
     ylab="density")
curve(dexp(x, rate=1 / theta), col=2, add=TRUE, lwd=2)
legend("topright", legend=c("Emp. Bootstrap", "Theo. Exponential"), 
       col=c("gray", "red"), lty=1, fill=c("gray", NA), bty="n", lwd=c(2, 2))
#dev.off()

############################################################################
# Example -- Permutation Test (from Example 24.2.2 in Lange)
############################################################################
# Data preparation
y <- c(135, 91, 111, 87, 122, 175, 130, 514, 283, 105, 147, 159, 107, 194)
grp_sizes <- c(n1 = 5, n2 = 4, n3 = 5)
n <- sum(grp_sizes)
grp_labs <- rep(1:3, times=grp_sizes)
x <- factor(grp_labs)

# Initial plot
boxplot(y ~ x, col="gray", ylab="Response (y)")
hist(y, freq=FALSE, breaks=25, col="gray", border="white", main="Distribution of y")

# Linear model analysis
out_lm <- lm(y ~ x)
summary_lm <- summary(out_lm)
print(summary_lm)
Fmod <- summary_lm$fstatistic[1]  # F-statistic from the model

# Bootstrap for permutation test
B <- 5000
Fperm <- numeric(B)
set.seed(123)  # Set seed for reproducibility

for (b in 1:B) {
  shuffled_y <- sample(y)  # Shuffle y instead of sampling indices
  Fperm[b] <- summary(lm(shuffled_y ~ x))$fstatistic[1]
}

# Histogram of bootstrap f-statistics
hist(Fperm, freq=FALSE, breaks=25, col="gray", border="white", 
     xlab="F statistics",ylab="density", main="Bootstrap F-Statistics")
abline(v=Fmod, col="red", lwd=2)

# Calculate p-value
pval <- mean(Fperm >= Fmod)
print(sprintf("p-value: %.5f", pval))

