## Lecture Slides 14 R Examples

# Set the working directory to the folder containing this source file.
# The active-document lookup only works inside a running RStudio session with
# a saved file, so the step is skipped otherwise instead of stopping the
# script with an error (e.g. when run through Rscript).
library("rstudioapi")  # Load rstudioapi package
if (rstudioapi::isAvailable()) {
  source_path <- getActiveDocumentContext()$path
  # An unsaved document reports an empty path; setwd("") would fail.
  if (nzchar(source_path)) {
    setwd(dirname(source_path))
  }
}
#getwd()  

##################################################################################
# Naive Bayes Classification w/ Penguin Data
##################################################################################

# Load packages
library(bayesrules)
library(tidyverse)
library(e1071)
library(janitor)

# Bring the penguins_bayes dataset (bayesrules package) into the session and
# keep a working copy under a shorter name.
data("penguins_bayes")
penguins <- penguins_bayes

# Species frequency table (janitor) with a totals row appended
adorn_totals(tabyl(penguins, species), where = "row")

# The same species counts using base R only
table(penguins[["species"]])

## Classify based on one categorical predictor

# Bar chart of species, filled by weight category and scaled so every bar has
# the same height (position = "fill"). Penguins with a missing weight category
# are dropped before plotting.
ggplot(
  drop_na(penguins, above_average_weight),
  aes(x = species, fill = above_average_weight)
) +
  geom_bar(position = "fill") +
  theme_minimal() +
  labs(
    fill = "Above Average Weight",
    y = "Proportion",
    x = "Species",
    title = "Proportion of Weight Categories by Species"
  )

# Two-way table of species against weight category, with totals on both
# margins; rows missing either variable are excluded first.
penguins %>%
  drop_na(species, above_average_weight) %>%
  tabyl(species, above_average_weight) %>%
  adorn_totals(where = c("row", "col"))

# The same cross-tabulation in base R; addmargins() appends the totals
addmargins(
  table(penguins[["species"]], penguins[["above_average_weight"]])
)

## Analyze conditional distributions of bill length

# One semi-transparent density curve of bill length per species, with a dashed
# red vertical marker at 50 mm for reference.
ggplot(penguins, aes(x = bill_length_mm, fill = species)) +
  geom_density(alpha = 0.7) +
  geom_vline(xintercept = 50, color = "red", linetype = "dashed") +
  theme_minimal() +
  labs(
    fill = "Species",
    y = "density",
    x = "bill length (mm)",
    title = "Conditional Distribution of Bill Length by Species"
  )

# Per-species sample mean and standard deviation of bill length (missing
# values ignored), listed from the largest mean to the smallest.
penguins %>%
  group_by(species) %>%
  summarise(
    mean_bill_length = mean(bill_length_mm, na.rm = TRUE),
    sd_bill_length = sd(bill_length_mm, na.rm = TRUE)
  ) %>%
  arrange(-mean_bill_length)
# Likelihood of observing a flipper length of 195 (x3 = 195) under each
# species' normal model, evaluated with the normal density dnorm().
# The mean and sd passed for each species are fixed values typed in below.

# Adelie (A): mean 190, sd 6.54
L_A <- dnorm(x = 195, mean = 190, sd = 6.54)
print(paste0("L(y = A | x3 = 195): ", round(L_A, digits = 6)))

# Chinstrap (C): mean 196, sd 7.13
L_C <- dnorm(x = 195, mean = 196, sd = 7.13)
print(paste0("L(y = C | x3 = 195): ", round(L_C, digits = 6)))

# Gentoo (G): mean 217, sd 6.48
L_G <- dnorm(x = 195, mean = 217, sd = 6.48)
print(paste0("L(y = G | x3 = 195): ", round(L_G, digits = 6)))

###############################################################
#### Naive Bayes Classification using the naiveBayes function in e1071 package
###############################################################

# e1071 supplies naiveBayes()
library(e1071)

# Fit four naive Bayes classifiers of species on the penguins data, each with
# a different set of predictors.

# Model 1 -- weight category only
naive_mod1 <- naiveBayes(
  species ~ above_average_weight,
  data = penguins
)

# Model 2 -- bill length only
naive_mod2 <- naiveBayes(
  species ~ bill_length_mm,
  data = penguins
)

# Model 3 -- bill length together with flipper length
naive_mod3 <- naiveBayes(
  species ~ bill_length_mm + flipper_length_mm,
  data = penguins
)

# Model 4 -- weight category, bill length and flipper length
naive_mod4 <- naiveBayes(
  species ~ above_average_weight + bill_length_mm + flipper_length_mm,
  data = penguins
)

# Making Predictions for a New Observation

# The single new penguin to classify: weight category "0", bill length 50,
# flipper length 195. Each model only reads the columns it was fitted on.
our_penguin <- data.frame(
  above_average_weight = "0",
  bill_length_mm = 50,
  flipper_length_mm = 195
)

# For each model, type = "raw" returns the posterior probability of every
# species, while the default predict() call returns the predicted class.

# Predictions using Model 1 (above_average_weight only)
cat("Posterior probabilities (Model 1):\n")
print(predict(naive_mod1, newdata = our_penguin, type = "raw"))

# Predictions using Model 2 (bill length only)
cat("\nPosterior probabilities (Model 2):\n")
print(predict(naive_mod2, newdata = our_penguin, type = "raw"))
cat("Predicted class (Model 2):\n")
print(predict(naive_mod2, newdata = our_penguin))

# Predictions using Model 3 (bill length + flipper length)
cat("\nPosterior probabilities (Model 3):\n")
print(predict(naive_mod3, newdata = our_penguin, type = "raw"))
cat("Predicted class (Model 3):\n")
print(predict(naive_mod3, newdata = our_penguin))

# Predictions using Model 4 (above_average_weight + bill length + flipper length)
cat("\nPosterior probabilities (Model 4):\n")
print(predict(naive_mod4, newdata = our_penguin, type = "raw"))
cat("Predicted class (Model 4):\n")
# Use naive_mod4 here so the printed class matches the Model 4 label.
print(predict(naive_mod4, newdata = our_penguin))

# In-sample Predictions for All Penguins

# Classify every penguin in the sample with each of the four models and store
# the predicted classes next to the data as class_1 through class_4.
penguins <- mutate(
  penguins,
  class_1 = predict(naive_mod1, newdata = penguins),
  class_2 = predict(naive_mod2, newdata = penguins),
  class_3 = predict(naive_mod3, newdata = penguins),
  class_4 = predict(naive_mod4, newdata = penguins)
)

# Results and Summary

# Show the first rows of the observed species beside the four classifications
cat("\nIn-sample classifications have been added as new columns:\n")
penguins %>%
  select(species, class_1, class_2, class_3, class_4) %>%
  head() %>%
  print()

##################################################################################
# Confusion Matrices for In-sample Predictions

# Build a formatted confusion matrix from two columns of a data frame.
#
# Arguments:
#   data          - data frame holding both columns
#   species_col   - name (string) of the column used for the table rows
#   predicted_col - name (string) of the column used for the table columns
#
# Returns the janitor cross-tabulation of the two columns, converted to row
# percentages formatted with 2 digits, with the underlying counts appended by
# adorn_ns().
conf_mat <- function(data, species_col, predicted_col) {
  # Turn the column-name strings into symbols so tabyl() can unquote them
  row_var <- sym(species_col)
  col_var <- sym(predicted_col)

  counts <- tabyl(data, !!row_var, !!col_var)
  row_shares <- adorn_percentages(counts, denominator = "row")
  formatted <- adorn_pct_formatting(row_shares, digits = 2)
  adorn_ns(formatted)
}

# Print one confusion matrix per model (naive_mod1 to naive_mod4), comparing
# the observed species with that model's in-sample classification column
# (class_1 to class_4).
for (model_id in seq_len(4)) {
  cat(sprintf("\nConfusion Matrix for naive_mod%d:\n", model_id))
  print(conf_mat(penguins, "species", paste0("class_", model_id)))
}

##################################################################################
# Cross-Validation for Classification Accuracy

# Perform 5-fold cross-validation for each model and print its
# cross-validated summary (the $cv element of the result).

# Fix the random seed so the results can be reproduced from run to run
# (presumably the fold assignment is random -- confirm in the bayesrules docs).
set.seed(84735)

cat("\nCross-validation classification accuracy (k=5):\n")

# Cross-validation for naive_mod1
CV_mod1 <- naive_classification_summary_cv(
  model = naive_mod1, data = penguins, y = "species", k = 5)
cat("naive_mod1 Cross-validation Accuracy:\n")
# print() explicitly so the result also appears when the file is source()'d
print(CV_mod1$cv)

# Cross-validation for naive_mod2
CV_mod2 <- naive_classification_summary_cv(
  model = naive_mod2, data = penguins, y = "species", k = 5)
cat("naive_mod2 Cross-validation Accuracy:\n")
print(CV_mod2$cv)

# Cross-validation for naive_mod3
CV_mod3 <- naive_classification_summary_cv(
  model = naive_mod3, data = penguins, y = "species", k = 5)
cat("naive_mod3 Cross-validation Accuracy:\n")
print(CV_mod3$cv)

# Cross-validation for naive_mod4
CV_mod4 <- naive_classification_summary_cv(
  model = naive_mod4, data = penguins, y = "species", k = 5)
cat("naive_mod4 Cross-validation Accuracy:\n")
print(CV_mod4$cv)

