## Lecture Slides 12 R Examples

#Set Working Directory to Source File Location
library("rstudioapi")  # Load rstudioapi package
setwd(dirname(getActiveDocumentContext()$path)) # Set working directory to source file location
#getwd()  

##################################################################################
# Poisson Regression with Metropolis-Hastings on Simulated High School Awards Data
##################################################################################

# Load necessary libraries
library(mvtnorm)  # For multivariate normal sampling
library(coda)     # For MCMC diagnostics
library(TeachingDemos)  # For HPD intervals

# Load the dataset
award_data <- read.csv("https://stats.idre.ucla.edu/stat/data/poisson_sim.csv")
head(award_data)

# Define the response variable and predictors
y <- award_data$num_awards
x1 <- award_data$math
x2 <- ifelse(award_data$prog == 2, 1, 0)  # Academic track
x3 <- ifelse(award_data$prog == 3, 1, 0)  # Vocational track

# Construct design matrix for regression
X <- cbind(1, x1, x2, x3)
p <- ncol(X)  # Number of parameters in the model

# Define prior parameters
beta_pri_mean <- c(0, 3, 2, -1)  # Prior means for each beta
beta_pri_sd <- rep(10, p)        # Standard deviation for prior

# Set up Metropolis-Hastings algorithm parameters
k<-1 # Can adjust this k up or down if the acceptance rate is too high or low...
pro_cov_mat <- k*var(log(y+1/2))*solve(t(X)%*%X)   # Proposal covariance matrix
S <- 100000  # Number of MCMC iterations
beta_cur <- beta_pri_mean  # Initial values for beta
acs <- 0  # Track acceptance count

# Storage for sampled beta values
beta_vals <- matrix(0, nrow = S, ncol = p)

# Metropolis-Hastings Sampling
for (s in 1:S) {
  # Propose new beta values from multivariate normal distribution
  beta_proposed <- t(rmvnorm(1, beta_cur, pro_cov_mat))
  
  # Calculate the log-acceptance ratio
  log_acc_rat <- sum(dpois(y, exp(X %*% beta_proposed), log = TRUE)) - 
    sum(dpois(y, exp(X %*% beta_cur), log = TRUE)) +
    sum(dnorm(beta_proposed, beta_pri_mean, beta_pri_sd, log = TRUE)) - 
    sum(dnorm(beta_cur, beta_pri_mean, beta_pri_sd, log = TRUE))
  
  # Accept or reject the proposed beta values
  if (log_acc_rat > log(runif(1))) {
    beta_cur <- beta_proposed
    acs <- acs + 1  # Increase acceptance count
  }
  
  # Store current beta values
  beta_vals[s, ] <- beta_cur
}

# Calculate acceptance rate
acceptance_rate <- acs / S
print(paste("Acceptance Rate:", round(acceptance_rate, 4)))

# Visualize ACF and trace plots for beta parameters
par(mfrow = c(2, 2))
for (i in 1:p) {
  acf(beta_vals[, i], main = bquote(ACF ~ "for" ~ beta[.(i - 1)]))
}
par(mfrow = c(1, 1))

#to compute effective sample size (ratios)
# Convert beta_vals to an mcmc object
beta_mcmc <- as.mcmc(beta_vals)
# Calculate the effective sample size for each parameter
ess_values <- effectiveSize(beta_mcmc)
# Calculate the effective sample size ratio for each parameter
ess_ratios <- round(ess_values / nrow(beta_vals),6)
# Display the results
ess_results <- data.frame(parameter = paste0("beta_", 0:(ncol(beta_vals) - 1)),
                          eff_SS_ratio = ess_ratios)
print("Effective Sample Size Ratios for Parameters:")
row.names(ess_results) = c("intercept","math_score","academic","vocational")
print(ess_results)

# Thinning the chain to reduce autocorrelation
thin <- 10
beta_vals_thin <- beta_vals[seq(1, S, by = thin), ]

# Trace plots after thinning
par(mfrow = c(2, 2))
for (i in 1:p) {
  plot(beta_vals_thin[, i], type = 'l', main = bquote("Trace Plot for" ~ beta[.(i - 1)]))
}
par(mfrow = c(1, 1))

# Perform Geweke diagnostic for convergence on the thinned chain
# Geweke diagnostic for checking convergence of an MCMC chain:
# This gives a z-statistic that compares the mean values of
# (by default) the first 10% of the chain vs. the last 50% of the chain:
geweke_diag <- sapply(1:p, function(i) {
  z <- geweke.diag(beta_vals_thin[, i])
  list(statistic = z[[1]], p_value = 2 * pnorm(abs(z[[1]]), lower.tail = FALSE))
})
print("Geweke Diagnostic Results:")
print(geweke_diag)
# P-value associated with the Geweke diagnostic:
# (a small p-value indicates a LACK of convergence):

# Discard the first 10% as burn-in for posterior analysis
burn_in <- 1000
beta_vals_post <- beta_vals_thin[-(1:burn_in), ]

# Trace plots after thinning and burn-in
par(mfrow = c(2, 2))
for (i in 1:p) {
  plot(beta_vals_post[, i], type = 'l', main = bquote("Trace Plot for" ~ beta[.(i - 1)]))
}
par(mfrow = c(1, 1))

# Nice hairy caterpillars now ...

#to compute effective sample size (ratios)
# Convert beta_vals to an mcmc object
beta_mcmc <- as.mcmc(beta_vals_post)
# Calculate the effective sample size for each parameter
ess_values <- effectiveSize(beta_mcmc)
# Calculate the effective sample size ratio for each parameter
ess_ratios <- round(ess_values / nrow(beta_vals),6)
# Display the results
ess_results <- data.frame(parameter = paste0("beta_", 0:(ncol(beta_vals) - 1)),
                          eff_SS_ratio = ess_ratios)
print("Effective Sample Size Ratios for Parameters:")
row.names(ess_results) = c("intercept","math_score","academic","vocational")
print(ess_results)

### Posterior summary:

# Posterior summary: medians and 95% credible intervals
post_meds <- apply(beta_vals_post, 2, median)
cred_ints <- t(apply(beta_vals_post, 2, quantile, probs = c(0.025, 0.975)))
hpd_ints <- t(apply(beta_vals_post, 2, emp.hpd, conf = 0.95))

print("Posterior Medians:")
names(post_meds) = c("intercept","math_score","academic","vocational")
print(post_meds)
print("95% Credible Intervals:")
row.names(cred_ints) = c("intercept","math_score","academic","vocational")
print(cred_ints)
print("95% HPD Intervals:")
row.names(hpd_ints) = c("intercept","math_score","academic","vocational")
print(hpd_ints)

### Posterior Prediction for Model Fit
Ndraws <- nrow(beta_vals_post)
y_hats <- exp(X %*% t(beta_vals_post))  # Posterior predictions for each sample

# Calculate in-sample Mean Absolute Error (MAE)
mae <- mean(abs(y - rowMeans(y_hats)))
print(paste("In-sample MAE:", round(mae, 4)))

# Consider fitting different models (e.g., without `x1` or adding interaction terms)
# and comparing MAE across models for a more robust analysis.

######################################################
### Bayesian Poisson Regression using 'rstanarm' Package
######################################################

# Load necessary packages
library(bayesrules)
library(rstanarm)
library(bayesplot)
library(tidyverse)
library(tidybayes)
library(broom.mixed)

# Prepare data
prog <- as.factor(award_data$prog)
awards_data <- data.frame(num_awards = y, math_score = x1, academic= x2, vocational = x3, program = prog)  # 'rstanarm' requires data in a data frame format

# Fit Bayesian Poisson regression model with 4 MCMC chains
awards_mod <- stan_glm(
  num_awards ~ math_score + academic + vocational, 
  data = awards_data,
  family = poisson,
  prior_intercept = normal(0, 0.5),
  prior = normal(c(3, 2, -1), 2.5, autoscale = TRUE), 
  chains = 4, iter = 10000
)

# MCMC Diagnostics - trace plots, density overlays, and autocorrelation checks
mcmc_trace(awards_mod)
mcmc_dens_overlay(awards_mod)
mcmc_acf(awards_mod)

# Posterior Predictive Check: Visualize 5 predictive histograms
pp_check(awards_mod, plotfun = "hist", nreps = 5) + 
  xlab("Number of awards") +
  ggtitle("Posterior Predictive Check: Awards Count")

# The model seems to match the observed awards distribution well.

# Visualize 50 posterior plausible models by plotting fitted draws
awards_data %>%
  add_fitted_draws(awards_mod, n = 50) %>%
  ggplot(aes(x = math_score, y = num_awards, color = program)) +
  geom_line(aes(y = .value, group = paste(program, .draw)), alpha = 0.1) +
  geom_point(data = awards_data, size = 0.1) +
  labs(
    x = "Math score",
    y = "Number of awards",
    title = "Posterior Plausible Models by Program Track"
  ) +
  theme_minimal()

# Point estimates and interval estimates for regression coefficients (betas)
tidy(awards_mod, conf.int = TRUE, conf.level = 0.95)

# Alternative summary of posterior estimates
summary(awards_mod, digits = 4)

# Posterior Prediction for a New Observation:
# Predict awards for a "general" track student with a math score of 40
new_data <- data.frame(math_score = 40, academic = 0, vocational = 0)
ex_pred <- posterior_predict(awards_mod, newdata = new_data)

# Tabulate the posterior predictive distribution
pred_table <- table(ex_pred)
pred_dist <- pred_table / length(ex_pred)  # Proportion for each count
print(pred_dist)

# Calculate and print the posterior predictive mean
post_pred_mean <- sum(as.numeric(names(pred_table)) * pred_dist)
cat("Posterior Predictive Mean:", post_pred_mean, "\n")

# Plot the posterior predictive distribution for this individual
qplot(ex_pred, binwidth = 1, geom = "histogram") + 
  xlab("Predicted number of awards") +
  ggtitle("Posterior Predictive Distribution for General Student with Math Score 40")

# Model Fit Evaluation
# Generate posterior predictive values for the observed data
poi_preds <- posterior_predict(awards_mod, newdata = awards_data)

# Plot posterior predictive intervals for each student by program track
ppc_intervals_grouped(
  awards_data$num_awards, yrep = poi_preds, 
  x = awards_data$math_score, group = awards_data$prog,
  prob = 0.5, prob_outer = 0.95,
  facet_args = list(scales = "fixed")
) +
  labs(
    x = "Math score",
    y = "Number of awards",
    title = "Posterior Predictive Intervals by Program Track"
  )

# Numerical Summary of In-Sample Fit
in_sample_fit <- prediction_summary(model = awards_mod, data = awards_data)
print(in_sample_fit)

# Out-of-Sample Predictive Accuracy using 10-fold Cross-Validation
set.seed(84735)  # For reproducibility
poi_cv <- prediction_summary_cv(model = awards_mod, data = awards_data, k = 10)
print(poi_cv$cv)  # Cross-validated metrics

# Could fit a different model (like without x1? or with an interaction term?) 
# and compare MAE's across models ... smaller would be better.

################################################
## Negative Binomial Regression with Bayesian Model Checking
################################################

# Load necessary libraries
library(bayesrules)
library(rstanarm)
library(bayesplot)
library(tidyverse)
library(tidybayes)
library(broom.mixed)

# Load and clean data
data(pulse_of_the_nation)
pulse <- pulse_of_the_nation %>%
  filter(books < 100)  # Remove outliers with 100+ books

# Summary plots to examine data distribution and relationships
ggplot(pulse, aes(x = books)) + 
  geom_histogram(color = "white") +
  labs(x = "Books", title = "Distribution of Books Read")

ggplot(pulse, aes(y = books, x = age)) + 
  geom_point() +
  labs(x = "Age", y = "Books", title = "Books Read vs. Age")

ggplot(pulse, aes(y = books, x = wise_unwise)) + 
  geom_boxplot() +
  labs(x = "Preference (Wise/Unwise)", y = "Books", title = "Books Read by Preference")

# Attempt Poisson model first to assess fit
books_poi_sim <- stan_glm(
  books ~ age + wise_unwise, 
  data = pulse, family = poisson,
  prior_intercept = normal(0, 2.5, autoscale = TRUE),
  prior = normal(0, 2.5, autoscale = TRUE), 
  prior_aux = exponential(1, autoscale = TRUE),
  chains = 4, iter = 10000, seed = 84735
)

# Posterior predictive check to evaluate fit
pp_check(books_poi_sim) + 
  xlab("Books") + 
  ggtitle("Posterior Predictive Check (Poisson Model)")

# Check distribution of data's mean and variance
pulse %>%
  summarize(mean = mean(books), var = var(books))

# Variance is much larger than the mean, indicating overdispersion; Negative Binomial may be better suited

#############################################################
# Negative Binomial regression to handle overdispersion
books_negbin_sim <- stan_glm(
  books ~ age + wise_unwise, 
  data = pulse, family = neg_binomial_2,
  prior_intercept = normal(0, 2.5, autoscale = TRUE),
  prior = normal(0, 2.5, autoscale = TRUE), 
  prior_aux = exponential(1, autoscale = TRUE),
  chains = 4, iter = 10000
)

# Posterior predictive check for the Negative Binomial model
pp_check(books_negbin_sim) + 
  xlim(0, 75) + 
  xlab("Books") + 
  ggtitle("Posterior Predictive Check (Negative Binomial Model)")

# MCMC diagnostics: trace plots, density overlays, and autocorrelation
mcmc_trace(books_negbin_sim)
mcmc_dens_overlay(books_negbin_sim)
mcmc_acf(books_negbin_sim)

# Summarize posterior distributions of parameters
tidy(books_negbin_sim, conf.int = TRUE, conf.level = 0.95)

# Posterior prediction for a specific case: age 30, "Wise but Unhappy"
ex_pred_nb <- posterior_predict(
  books_negbin_sim, newdata = data.frame(age = 30, wise_unwise = 'Wise but Unhappy')
)

# Summary and visualization of the predictive distribution for this individual
pred_table <- table(ex_pred_nb) / length(ex_pred_nb)
post_pred_mean <- sum(as.numeric(names(pred_table)) * pred_table)
cat("Posterior Predictive Mean:", post_pred_mean, "\n")

### Easiest to see with a plot here:
# Histogram of posterior predictive distribution for this individual:
qplot(ex_pred_nb, binwidth = 1, geom = "histogram") + 
  xlab("Predicted Books Read") + 
  ggtitle("Predictive Distribution for Age 30, Wise but Unhappy")

# Posterior predictive intervals for all observations
nb_preds <- posterior_predict(books_negbin_sim, newdata = pulse)

# Plot the posterior predictive models for each observation:
# Big data set, so this take a while...

ppc_intervals_grouped(
  pulse$books, yrep = nb_preds, 
  x = pulse$age, 
  group = pulse$wise_unwise,
  prob = 0.5, prob_outer = 0.95,
  facet_args = list(scales = "fixed")
) + labs(
  x = "Age",
  y = "Books",
  title = "Posterior Predictive Intervals by Age and Preference"
)

# Do the dark blue circles (observed counts) fall near the light blue circles (predicted counts)?
# Do most of the dark blue circles (observed counts) fall within the 95% prediction intervals?

# A numerical summary of in-sample model fit:
in_sample_fit <- prediction_summary(model = books_negbin_sim, data = pulse)
print(in_sample_fit)

# Out-of-sample predictive accuracy with 10-fold cross-validation
set.seed(84735)
nb_cv <- prediction_summary_cv(model = books_negbin_sim, data = pulse, k = 10)
print(nb_cv$cv)

# Model comparison using Leave-One-Out (LOO) cross-validation
# Comparing a couple of other models:
# Setting chains=1 for speed reasons...
books_negbin <- stan_glm(
  books ~ wise_unwise, 
  data = pulse, family = neg_binomial_2,
  prior_intercept = normal(0, 2.5, autoscale = TRUE),
  prior = normal(0, 2.5, autoscale = TRUE), 
  prior_aux = exponential(1, autoscale = TRUE),
  chains = 1, iter = 10000
)

books_negbin_interact <- stan_glm(
  books ~ age + wise_unwise + age:wise_unwise, 
  data = pulse, family = neg_binomial_2,
  prior_intercept = normal(0, 2.5, autoscale = TRUE),
  prior = normal(0, 2.5, autoscale = TRUE), 
  prior_aux = exponential(1, autoscale = TRUE),
  chains = 1, iter = 10000
)

# This takes a while .... 

# Compare models with LOO
loo(books_negbin_sim)
loo(books_negbin)
loo(books_negbin_interact)

#This output provides diagnostics from the LOO-CV analysis of the model.
# elpd_loo (Expected Log Predictive Density for LOO-CV) is a measure of the model’s predictive accuracy. 
# Lower negative values indicate better predictive performance.

# p_loo (Effective Number of Parameters) accounts for model complexity and flexibility.

# looic (Leave-One-Out Information Criterion) calculated as \( \text{looic} = -2 \times \text{elpd\_loo} \). 
# Similar to AIC or DIC, lower values suggest better predictive performance.

# Monte Carlo Standard Error (MCSE) of elpd_loo, lower values indicate high precision in the LOO-CV estimate 

# Effective Sample Size (ESS) and r_eff indicates that the effective sample size is calculated under the assumption
# of “MCMC draws (r_eff in [0.5, 0.8])” reflecting a good level of autocorrelation control in the MCMC draws.

#Pareto k Estimates, low values indicate that the LOO-CV estimates are reliable. 
# If k values were higher, it would signal potential issues with the stability of the LOO estimates.

## Fitting the model using regular R coding can be done with the negative binomial model, 
## but the code is not as easy since there is an extra parameter (mean plus dispersion)
## rather than the one-parameter Poisson.
## I won't provide the base R code for it here...

