# S5_6600_S8_RCode.R
# R code for generating plots and examples as used in STAT 5600/6600 Lecture Slides 8

#Set Working Directory to Source File Location
library("rstudioapi")                                 # Load rstudioapi package

# Set the working directory to the folder containing this script.
# getActiveDocumentContext() only works inside a running RStudio session and
# reports an empty path for a document that has never been saved, so both
# conditions are checked before calling setwd(); otherwise the working
# directory is left unchanged instead of stopping the script with an error.
if (rstudioapi::isAvailable() &&
    nzchar(rstudioapi::getActiveDocumentContext()$path)) {
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
} else {
  message("Source file location not available; working directory left unchanged.")
}
getwd() # Check updated working directory

library(stats4) #for the mle() function

###########################
# Example: BMI values from the diabetes data (diabetes.csv): histograms and gamma fit
###########################
# Read the diabetes data and take column 6 as the BMI variable.
# NOTE(review): header = FALSE treats the first row of the file as data. If
# diabetes.csv has a header row, its entry in column 6 becomes NA under
# as.numeric() (with a coercion warning) -- confirm whether header = TRUE is
# what is intended.
pima <- read.csv("diabetes.csv", header = FALSE)
bmi <- as.numeric(pima[, 6])
# Keep strictly positive, non-missing values only. A plain bmi[bmi > 0] would
# keep every NA, because indexing with an NA logical yields NA.
# (Presumably zeros encode missing measurements -- verify against the data source.)
bmi <- bmi[!is.na(bmi) & bmi > 0]
# Density-scaled histograms of bmi for three suggested numbers of breaks
# (19, 99 and 2), drawn one after another to show the effect of bin width.
invisible(lapply(
  c(19, 99, 2),
  function(n_breaks) hist(bmi, breaks = n_breaks, freq = FALSE)
))

# hist() with the default number of breaks
hist(bmi, freq = FALSE)

# Alternative: the histogram() function from the lattice package
library(lattice)
histogram(bmi, type = "density")

# Kernel density estimate of bmi, with missing values dropped first
plot(
  density(na.omit(bmi)),
  main = "density estimate"
)

# Visual check of a gamma model for bmi.
# The parameters come from the sample mean and variance (missing values are
# ignored): ch is passed to dgamma() as the shape and lh as the rate.
xb <- mean(bmi, na.rm = TRUE)  # sample mean
s2 <- var(bmi, na.rm = TRUE)   # sample variance
ch <- xb^2 / s2                # shape: mean^2 / variance
lh <- xb / s2                  # rate:  mean / variance

# Density-scaled histogram of the data ...
hist(bmi, breaks = 20, freq = FALSE)
# ... with the fitted gamma density drawn on top over the range 0 to 70
curve(
  dgamma(x, shape = ch, rate = lh),
  from = 0, to = 70,
  add = TRUE, col = 2, lwd = 2
)

###########################################################################
#Example for finding the MLE estimator of k for Bin(k, p) 
#where p is known and k is unknown.
###########################################################################

#First, using optim() with our own likelihood function
library(stats)

# Observed counts, assumed to come from Bin(k, p) with unknown k.
# Replace with your own binomial data.
data <- c(
  3, 2, 1, 12, 15, 11, 18, 20, 7, 9,
  8, 10, 14, 13, 6, 5, 19, 4, 16, 17
)

# Known success probability
p <- 0.5

# Objective function for optim(): the negative log-likelihood of the binomial
# size k, given the global sample `data` and the known global probability `p`.
#
# k: candidate size (any real number). It is rounded up to an integer and
#    raised to at least max(data), because a binomial size must be an integer
#    that is not smaller than any observed count.
# Returns a single number; smaller values mean a more likely k.
likeli <- function(k) {
  # Ensure k is an integer and at least as large as every data point
  k <- max(ceiling(k), max(data))

  # Sum log-probabilities instead of multiplying probabilities: the raw product
  # underflows to 0 for larger k, which makes the objective flat. The log is
  # monotone, so the minimising k is unchanged. The sign is flipped because
  # optim() minimises.
  -sum(dbinom(data, size = k, prob = p, log = TRUE))
}

# Use optim() to find the MLE of k.
# Brent's method searches the interval [lower, upper]. The lower end is the
# largest observation, since k cannot be smaller than any observed count.
# The upper end is 100 as long as that lies above the lower end; for data with
# max(data) >= 100 it is widened to 2 * max(data), because optim() requires
# lower < upper.
k0 <- max(data)
mle_est <- optim(
  par = k0, fn = likeli, method = "Brent",
  lower = k0, upper = if (k0 < 100) 100 else 2 * k0
)

# The estimated k: likeli() evaluates the likelihood at ceiling(k), so the
# continuous optimum is rounded up (not to the nearest integer).
est_k <- ceiling(mle_est$par)

# Show the estimated k
print(est_k)

#using the mle() function 
#WARNING: the code below does not work, because dbinom() returns NaN for a
#non-integer size while mle() optimizes over a continuous k!
# bin.ll <- function(k) {
#   -sum(dbinom(data, size = k, prob = 0.5, log=TRUE))
# }
# 
# summary(mle(minuslogl=bin.ll, start=list(k=k0)))

#So instead we use a log-likelihood in which the factorials are replaced by gamma functions,
#so that it is also defined for non-integer k.
#Here, we first treat the case where both k and p are unknown
# Negative binomial log-likelihood of the global sample `data`, with the
# factorials in the binomial coefficient written as gamma functions so that
# the size k may be any real number.
#
# k: binomial size (real-valued).
# p: success probability.
# Returns the negative log-likelihood, or Inf when (k, p) lies outside the
# region where the formula is defined.
log_bin_pmf <- function(k, p) {
  # Reject invalid parameters. The last condition covers the largest
  # observation: lgamma(k - data + 1) needs a positive argument for every
  # data point, i.e. k > max(data) - 1.
  if (any(k < 0, p < 0, p > 1) || k - max(data) + 1 <= 0) {
    return(Inf)
  }
  # lgamma() computes log(gamma(.)) directly; gamma() itself overflows to Inf
  # for arguments above roughly 171, which would turn the result into NaN.
  lbin_cf <- lgamma(k + 1) - lgamma(data + 1) - lgamma(k - data + 1)
  log_pr <- sum(lbin_cf + data * log(p) + (k - data) * log(1 - p))
  -log_pr
}

# Starting values for the optimizer: the largest observation for k (a binomial
# size cannot be smaller than an observed count) and 0.5 for p.
k0 <- max(data)
p0 <- 0.5

# Perform MLE.
# tryCatch() evaluates the mle() call; if that call signals an error, the
# handler returns the error object itself instead of stopping the script, so
# the failure can be reported below.
result <- tryCatch(
  mle(minuslogl = log_bin_pmf, start = list(k = k0, p = p0)),
  error = function(e) e
)

if (inherits(result, "error")) {
  # The fit failed: report why. There is no estimate of k to extract.
  cat("Error in MLE:", conditionMessage(result), "\n")
  k_val <- NA_integer_
} else {
  # Display the summary of the MLE result (explicit print() so that it is also
  # shown when the script is run through source()).
  print(summary(result))
  # Extract the estimate of k as an integer; as.integer() truncates toward zero.
  k_val <- as.integer(coef(result)["k"])
  cat("Value of k is", k_val, "\n")
}

#Next we treat the case where k is unknown and p=0.5
#MLE of K when p=.5
# Log-likelihood function with factorial replaced with gamma functions
# Negative binomial log-likelihood of the global sample `data` as a function
# of the size k alone, with the success probability fixed at 0.5. Factorials
# are written as gamma functions so that k may be any real number.
#
# k: binomial size (real-valued).
# Returns the negative log-likelihood, or Inf when k lies outside the region
# where the formula is defined.
log_bin_pmf <- function(k) {
  p <- 0.5
  # lgamma(k - data + 1) needs a positive argument for every data point,
  # i.e. k > max(data) - 1; negative k is rejected as well.
  if (k < 0 || k - max(data) + 1 <= 0) {
    return(Inf)
  }
  # lgamma() computes log(gamma(.)) directly; gamma() itself overflows to Inf
  # for arguments above roughly 171, which would turn the result into NaN.
  lbin_cf <- lgamma(k + 1) - lgamma(data + 1) - lgamma(k - data + 1)
  log_pr <- sum(lbin_cf + data * log(p) + (k - data) * log(1 - p))
  -log_pr
}

# Starting value for the optimizer: the largest observation, since a binomial
# size cannot be smaller than an observed count.
k0 <- max(data)

# Perform MLE.
# If mle() signals an error, tryCatch() returns the error object instead of
# stopping the script, so the failure can be reported below.
result <- tryCatch(
  mle(minuslogl = log_bin_pmf, start = list(k = k0)),
  error = function(e) e
)

if (inherits(result, "error")) {
  # The fit failed: report why. There is no estimate of k to extract.
  cat("Error in MLE:", conditionMessage(result), "\n")
  k_val <- NA_integer_
} else {
  # Display the summary of the MLE result (explicit print() so that it is also
  # shown when the script is run through source()).
  print(summary(result))
  # Extract the estimate of k as an integer; as.integer() truncates toward zero.
  k_val <- as.integer(coef(result)["k"])
  cat("Value of k is", k_val, "\n")
}

#########################################################
#Using mle() to Estimate Parameters of Gamma Distribution:
#########################################################
# Sample used for the fit: bmi with missing values removed.
# gam.ll() reads x from the global environment.
x <- bmi[!is.na(bmi)]

# Negative log-likelihood of a gamma model for x.
# c:      shape parameter
# lambda: rate parameter
gam.ll <- function(c, lambda) {
  log_dens <- dgamma(x, shape = c, rate = lambda, log = TRUE)
  -sum(log_dens)
}

# Fit by maximum likelihood, starting from c = 1.5 and lambda = 2, and show
# the estimates with their standard errors
summary(mle(minuslogl = gam.ll, start = list(c = 1.5, lambda = 2)))

#########################################################
#plotting ecdf's for bmi
#########################################################
# Empirical CDF of bmi, plotted with small points over the range 15 to 60
ebmi <- ecdf(bmi)
plot(ebmi, xlim = c(15, 60), cex = 0.1)
# Overlay the CDF of the gamma distribution with shape ch and rate lh
curve(pgamma(x, shape = ch, rate = lh), from = 15, to = 60, add = TRUE, col = 2)

# Kolmogorov-Smirnov test of bmi against that gamma distribution;
# show the test statistic only
ks_result <- ks.test(bmi, "pgamma", shape = ch, rate = lh)
ks_result$statistic
