## Lecture Slides 16 R Examples

#Set Working Directory to Source File Location
library("rstudioapi")  # Load rstudioapi package
setwd(dirname(getActiveDocumentContext()$path)) # Set working directory to source file location
#getwd()  

##################################################################################
# The Spotify Dataset:
# This script analyzes a subset of Spotify data, focusing on song popularity 
# and artist-level summaries.
##################################################################################

# Load necessary packages
library(bayesrules)   # Bayesian methods
library(tidyverse)    # Data manipulation and visualization
library(rstanarm)     # Bayesian regression
library(bayesplot)    # Visualization for Bayesian models
library(tidybayes)    # Tools for working with Bayesian model outputs
library(broom.mixed)  # Tidying mixed models
library(forcats)      # Factor handling

# Load the Spotify dataset
data(spotify)

# Print the first 30 rows of the dataset
cat("First 10 rows of the Spotify dataset:\n")
print(spotify[1:10, ])

# Select relevant variables and reorder artists by mean popularity
spotify <- spotify %>% 
  select(artist, title, popularity) %>% 
  mutate(artist = fct_reorder(artist, popularity, .fun = mean))

# Display the first three rows of the modified dataset
cat("First three rows of the modified dataset:\n")
head(spotify, 3)

# Summary of dataset
cat("Total number of songs in the dataset:", nrow(spotify), "\n")
cat("Total number of unique artists in the dataset:", nlevels(spotify$artist), "\n")

# List of unique artists
cat("Artists in the dataset:\n")
levels(spotify$artist)

# Group songs by artist and compute summary statistics (count and mean popularity)
artist_means <- spotify %>% 
  group_by(artist) %>% 
  summarize(
    count = n(),  # Number of songs per artist
    mean_popularity = mean(popularity)  # Mean popularity per artist
  )

# Display the first two and last two artists based on reordering
cat("First two and last two artists by mean popularity:\n")
artist_means %>% slice(1:2, 43:44)

# Range of the number of songs per artist
cat("Range of songs per artist:\n")
artist_means %>% 
  summarize(min_songs = min(count), max_songs = max(count))

# Density plot of song popularity across all artists
ggplot(spotify, aes(x = popularity)) + 
  geom_density(fill = "blue", alpha = 0.3) +  # Add color and transparency for better visualization
  labs(
    title = "Density Plot of Song Popularity",
    x = "Popularity", y = "Density"
  ) +
  xlim(0, 100)

##################################################################################
# Fitting an Intercept-Only Regression Model:
# This section fits a Bayesian regression model with only an intercept to 
# analyze the overall mean popularity, summarizes the results, and compares 
# posterior predictions to observed artist-level means.
##################################################################################

# Fit an intercept-only Bayesian regression model
pooled_mod <- stan_glm(
  popularity ~ 1,  # Intercept-only model
  data = spotify, 
  family = gaussian, 
  prior_intercept = normal(50, 2.5, autoscale = TRUE),  # Informative prior for the intercept
  prior_aux = exponential(1, autoscale = TRUE),         # Prior for auxiliary parameters
  chains = 4, iter = 10000, seed = 12345               # 4 chains, 10,000 iterations
)

# Summarize posterior distributions for intercept and auxiliary parameters
pooled_summ <- tidy(
  pooled_mod, 
  effects = c("fixed", "aux"), 
  conf.int = TRUE, conf.level = 0.90  # 90% credible intervals
)

cat("Summary of the Intercept-Only Model:\n")
print(pooled_summ)

# Generate posterior predictive means for artist-level popularity
pooled_pred <- posterior_predict(
  pooled_mod, 
  newdata = artist_means  # Predict for artist means
)

# Visualize posterior predictive intervals with actual artist popularity
ppc_intervals(
  y = artist_means$mean_popularity,   # Observed artist mean popularity
  yrep = pooled_pred,  # Posterior predictive samples
  prob_outer = 0.80             # 80% prediction intervals
) +
  ggplot2::scale_x_continuous(
    labels = artist_means$artist,  # Artist names as x-axis labels
    breaks = 1:nrow(artist_means)  # Match labels with artist positions
  ) +
  ggplot2::theme(axis.text.x = element_text(angle = 90, hjust = 1)) +  # Rotate x-axis labels
  labs(
    title = "Posterior Predictive Intervals for Artist Popularity",
    x = "Artist", 
    y = "Popularity"
  ) +
  ylim(0, 100)

##################################################################################
# No Pooling and Hierarchical Modeling of Spotify Popularity Data
##################################################################################
# No pooling: Separate popularity densities for each artist
ggplot(spotify, aes(x = popularity, group = artist)) + 
  geom_density() +
  labs(
    title = "Popularity Densities by Artist (No Pooling)",
    x = "Popularity", y = "Density"
  )+
  xlim(0, 100)

# No pooling model estimation: Fit a separate intercept for each artist
no_pooled_mod <- stan_glm(
  popularity ~ artist - 1,  # Separate intercepts for each artist
  data = spotify, 
  family = gaussian, 
  prior = normal(50, 2.5, autoscale = TRUE), 
  prior_aux = exponential(1, autoscale = TRUE),
  chains = 4, iter = 10000, seed = 12345
)

# Posterior predictive distributions: Simulate the posterior predictive values
no_pooled_pred <- posterior_predict(
  no_pooled_mod, 
  newdata = artist_means
)

# Plot posterior predictive intervals for no pooling model
ppc_intervals(
  y = artist_means$mean_popularity,   # Observed artist-level mean popularity
  yrep = no_pooled_pred,         # Posterior predictive samples
  prob_outer = 0.80              # 80% prediction intervals
) +
  ggplot2::scale_x_continuous(
    labels = artist_means$artist,  # Artist names on x-axis
    breaks = 1:nrow(artist_means)
  ) +
  ggplot2::theme(axis.text.x = element_text(angle = 90, hjust = 1)) +  # Rotate x-axis labels
  labs(
    title = "Posterior Predictive Intervals for Artist Popularity (No Pooling)",
    x = "Artist", 
    y = "Popularity"
  )

##################################################################################
# Hierarchical Modeling: Accounting for artist-specific variability
##################################################################################

# Distribution of artist-level sample means
ggplot(artist_means, aes(x = mean_popularity)) + 
  geom_density(fill = "blue", alpha = 0.3) +
  labs(
    title = "Density of Artist-Level Mean Popularity",
    x = "Popularity", y = "Density"
  ) +
  xlim(0, 100)

# Fit a hierarchical model with varying intercepts for each artist
hier_mod <- stan_glmer(
  popularity ~ (1 | artist),  # Random intercept for each artist
  data = spotify, 
  family = gaussian,
  prior_intercept = normal(50, 2.5, autoscale = TRUE),       # Prior for global intercept
  prior_aux = exponential(1, autoscale = TRUE),             # Prior for auxiliary parameter
  prior_covariance = decov(reg = 1, conc = 1, shape = 1, scale = 1),  # Prior for random effects
  chains = 4, iter = 10000, seed = 12345
)

pp_check(hier_mod) + 
  xlab("popularity")

##################################################################################
# MCMC Diagnostics and Posterior Analysis for Spotify Hierarchical Model
##################################################################################

# MCMC Diagnostics: Trace plots and posterior densities

# There are lots of parameters here, so there are a bunch of plots here:
#mcmc_trace(hier_mod) # don't run, takes a lot of time to show 45+ trace plots ...
#mcmc_dens_overlay(hier_mod)

# Trace plot for the global intercept
mcmc_trace(hier_mod, pars = '(Intercept)') +
  ggtitle("Trace Plot for Global Intercept (Mu)")

# Posterior density plot for sigma (residual standard deviation)
mcmc_dens_overlay(hier_mod, pars = 'sigma') +
  ggtitle("Posterior Density for Residual Standard Deviation (Sigma)")

# Posterior density for sigma_mu (random effect variability)
mcmc_dens_overlay(hier_mod, pars = 'Sigma[artist:(Intercept),(Intercept)]') +
  ggtitle("Posterior Density for Artist-Level Variability (Sigma_mu)")

# Posterior density for the global intercept (Mu)
mcmc_dens_overlay(hier_mod, pars = '(Intercept)') +
  ggtitle("Posterior Density for Global Intercept (Mu)")

# Posterior density for a specific artist's random effect (e.g., Vampire Weekend)
mcmc_dens_overlay(hier_mod, regex_pars = "Vampire_Weekend") +
  ggtitle("Posterior Density for Vampire Weekend's Effect")

# Autocorrelation Function (ACF) plot for the global intercept
mcmc_acf(hier_mod, pars = '(Intercept)') +
  ggtitle("Autocorrelation Plot for Global Intercept (Mu)")

##################################################################################
# Posterior Summaries
##################################################################################

# Posterior summary for the global intercept (Mu)
mu_summ <- tidy(
  hier_mod, 
  effects = "fixed", 
  conf.int = TRUE, 
  conf.level = 0.80
)

cat("Posterior Summary for Global Mu:\n")
print(mu_summ)

# Posterior summary for sigma_y (residual standard deviation) and sigma_mu (random effect variability)
random_effects_summary <- tidy(
  hier_mod, 
  effects = "ran_pars"
)
cat("Posterior Summary for Sigma_y and Sigma_mu:\n")
print(random_effects_summary)

##################################################################################
# Artist-Level Posterior Summaries
##################################################################################

# Extract MCMC chains for artist-specific intercepts (mu_j)
artist_chains <- hier_mod %>%
  spread_draws(`(Intercept)`, b[,artist]) %>% 
  mutate(mu_j = `(Intercept)` + b)  # Combine global intercept with artist-specific effects

# Summarize mu_j with 80% credible intervals and reorder by mean mu_j
artist_summ_sc <- artist_chains %>% 
  select(-`(Intercept)`, -b) %>% 
  mean_qi(.width = 0.80) %>% 
  mutate(artist = fct_reorder(artist, mu_j))

# Display a sample of the summarized results
cat("Artist-Level Posterior Summaries (Top Rows):\n")
print(artist_summ_sc %>% 
        select(artist, mu_j, .lower, .upper) %>% 
        head(4))

# Plot estimates and 80% credible intervals for mu_j
ggplot(artist_summ_sc, aes(x = artist, y = mu_j, ymin = .lower, ymax = .upper)) +
  geom_pointrange() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Posterior Estimates and 80% Credible Intervals for Artist Effects",
    x = "Artist", 
    y = "Artist-Specific Popularity Effect (Mu_j)"
  ) +
  ylim(0,100)

# Checking artists with wide and narrow credible intervals
artist_means %>% 
  filter(artist %in% c("Frank Ocean", "Lil Skies"))
# Frank Ocean's mu_j has a narrow interval; Lil Skies's mu_j has a wide interval.

##################################################################################
### Posterior Prediction:
##################################################################################

# Convert the simulation results to a data frame for easier manipulation
hier_mod_df <- as.data.frame(hier_mod)

### Simulating Vampire Weekend's Posterior Predictive Model
# Extract relevant parameters and simulate predicted song popularities for Vampire Weekend
VW_chains <- hier_mod_df %>%
  rename(b = `b[(Intercept) artist:Vampire_Weekend]`) %>% 
  select(`(Intercept)`, b, sigma) %>% 
  mutate(
    # Calculate the mean popularity (mu_VW) for Vampire Weekend
    mu_VW = `(Intercept)` + b,
    # Simulate 20,000 draws for predicted song popularity
    y_VW = rnorm(20000, mean = mu_VW, sd = sigma)
  )

# Display the first few rows of the simulated predictions
head(VW_chains, 5)

# Summarize the posterior predictive distribution for Vampire Weekend
# Posterior summary for predicting the popularity of a new song
VW_chains %>%
  mean_qi(y_VW, .width = 0.80)

# Summarize the posterior for Vampire Weekend's mean popularity
artist_summ_sc %>%
  filter(artist == "artist:Vampire_Weekend")

# Insight: We can predict the mean popularity for Vampire Weekend with much higher precision
# than the popularity of any single new song due to the hierarchical model's shrinkage.

### Posterior Prediction for a New Artist (Taylor Swift)
# Simulate the posterior predictive distribution for Taylor Swift
taylor_chains <- hier_mod_df %>%
  mutate(
    # Calculate the standard deviation for the artist-level intercept
    sigma_mu = sqrt(`Sigma[artist:(Intercept),(Intercept)]`),
    # Simulate the mean popularity for Taylor Swift
    mu_taylor = rnorm(20000, `(Intercept)`, sigma_mu),
    # Simulate predicted song popularities for Taylor Swift
    y_taylor = rnorm(20000, mu_taylor, sigma)
  )

# Posterior predictive summary for Taylor Swift
taylor_chains %>%
  mean_qi(y_taylor, .width = 0.80)

### Summarizing Predictions for Multiple Artists
# Use a shortcut function to summarize predictions for both artists
prediction_shortcut <- posterior_predict(
  hier_mod,
  newdata = data.frame(artist = c("Vampire Weekend", "Taylor Swift"))
)

# Posterior predictive model plots for the two artists
mcmc_areas(prediction_shortcut, prob = 0.8) +
  ggplot2::scale_y_discrete(labels = c("Vampire Weekend", "Taylor Swift"))

### Posterior Predictive Plots to Visualize Shrinkage
# Generate predictions for all artists based on their means
predictions_hierarchical <- posterior_predict(
  hier_mod,
  newdata = artist_means
)

# Plot posterior predictive intervals to demonstrate shrinkage
ppc_intervals(
  artist_means$mean_popularity, 
  yrep = predictions_hierarchical, 
  prob_outer = 0.80
) +
  ggplot2::scale_x_continuous(
    labels = artist_means$artist, 
    breaks = 1:nrow(artist_means)
  ) +
  xaxis_text(angle = 90, hjust = 1) + 
  geom_hline(yintercept = 58.4, linetype = "dashed")

