######################################################################
# examples for linear regression 
# 04-02-2022 
######################################################################

######################################################################
# load data  
######################################################################

import pandas as pd 

# Location of the whitespace-delimited SAT data set (read directly from the URL).
filename = "http://www.auburn.edu/~zengpen/teaching/STAT-7000/datasets/SAT.txt"

# The separator is a regular expression, so it is written as a raw string:
# in a plain literal "\s" is an invalid escape sequence and newer Python
# versions emit a warning for it.
sat = pd.read_csv(filename, sep=r"\s+")
sat.info()   # print column names, dtypes and non-null counts

# drop Alaska in the remaining analysis
# (`sat` keeps every row; `clean` is the same table without the Alaska row)
clean = sat[sat['state'] != "Alaska"]

######################################################################
# scatter plot 
######################################################################

import matplotlib.pyplot as plt 

# Scatter plot of the 'sat' column against the 'takers' column, drawn as dots.
# NOTE(review): both figures below use the full `sat` frame, i.e. Alaska is
# still included here even though `clean` exists -- confirm this is intended.
ax = plt.gca()                                # current axes (created on demand)
ax.plot('takers', 'sat', '.', data = sat)     # column names are looked up in `sat`
ax.set_title('Scatter Plot')
ax.set_xlabel('Percent of SAT takers')
ax.set_ylabel('SAT average')
plt.show()

# scatter plot matrix of the columns of `sat`

pd.plotting.scatter_matrix(frame = sat)
plt.show()

######################################################################
# linear regression using statsmodels  
######################################################################

import numpy as np 
import statsmodels.formula.api as smf 

# Model written as a formula string: response `sat` on the left, predictors on
# the right.  `np.log(takers)` is referenced inside the string, so `np` has to
# be imported before the model is built.
sat_formula = "sat ~ np.log(takers) + income + years + public + expend + rank"

# Fit ordinary least squares on the data without Alaska.
results = smf.ols(formula = sat_formula, data = clean).fit()

# The expressions below only display something in an interactive session;
# their values are not stored.

# full regression summary table
results.summary()

# fitted parameters
results.params
# R-square
results.rsquared
# prediction for the rows used in the fit
results.predict()
# prediction for every row of the full `sat` frame
results.predict(sat)

######################################################################
# linear regression using statsmodels, alternative approach   
######################################################################

import statsmodels.api as sm

# Response vector and design matrix built by hand instead of from a formula.
y = clean['sat']

# `assign` returns a new DataFrame with the extra column appended, so no
# column is ever set on a frame obtained by indexing `clean` (which is what
# triggers pandas' SettingWithCopyWarning).  Column order is unchanged:
# income, years, public, expend, rank, logtaker.
x = clean[['income', 'years', 'public', 'expend', 'rank']].assign(
    logtaker=np.log(clean['takers'])
)

# sm.OLS does not add an intercept on its own; add the constant column explicitly.
x = sm.add_constant(x)

results2 = sm.OLS(y, x).fit()
results2.summary()   # displayed only in an interactive session

######################################################################
# linear regression using sklearn   
######################################################################

from sklearn.linear_model import LinearRegression

# Response vector and predictor matrix for scikit-learn (no constant column:
# the fitted intercept is read separately from `model.intercept_` below).
y = clean['sat']

# Build the predictor frame in one step with `assign`; setting a new column on
# a frame obtained by indexing `clean` triggers pandas' SettingWithCopyWarning.
# Column order is unchanged: income, years, public, expend, rank, logtaker.
x = clean[['income', 'years', 'public', 'expend', 'rank']].assign(
    logtaker=np.log(clean['takers'])
)

model = LinearRegression()    # create a model
model.fit(x, y)               # fit LSE

# The expressions below only display something in an interactive session.
model.intercept_              # fitted intercept
model.coef_                   # fitted coefficients (same order as the columns of x)
model.score(x, y)             # R-square
model.predict(x)              # predicted responses

######################################################################
# THE END
######################################################################