######################################################################
# examples for data manipulation
# 09-01-2021 
# 02-22-2022 
######################################################################

import pandas as pd 

######################################################################
# load data (9759 obs. and 17 variables)
######################################################################

filename = 'http://webhome.auburn.edu/~zengpen/teaching/STAT-7000/datasets/bodymeasure.csv'

# download the file only once; `body` keeps the default dtypes chosen by 
# read_csv and is the data frame used in the rest of the script 
body = pd.read_csv(filename)

# for comparison only: the same data after .convert_dtypes(), 
# which uses extension dtypes with pd.NA for missing values 
body.convert_dtypes().info() 

# default dtypes (the version that is kept) 
body.info() 

######################################################################
# check individual variable 
######################################################################

# each line is a bare expression: its result is only displayed when the 
# script is run interactively, nothing is stored 
body['age'].describe()         # summary of a continuous variable 
body['race'].describe()        # summary of a categorical variable 
body['race'].value_counts()    # frequency of each category 

######################################################################
# indexing 
# pay attention to the difference of slice in .loc and .iloc 
#    in .loc,  :4 indicates indexes and 4 is included 
#    in .iloc, :4 indicates integer positions, 4 is not included 
######################################################################

# rows up to and including index label 4, columns selected by name 
body.loc[:4, ["age", "gender"]]   # label based (a boolean array is also accepted)
# rows at integer positions 0-3 (position 4 excluded), columns at positions 1 and 16 
body.iloc[:4, [1, 16]]            # integer-position based (a boolean array is also accepted)

# scalar access: one row / one column 
body.at[3, "age"]   # access a single value, label based 
body.iat[3, 1]      # access a single value, integer-position based 

######################################################################
# create a new variable
######################################################################

# BMI = weight / height^2 
# the division by 10000 presumably converts height^2 from cm^2 to m^2 
# (units are not stated in this file -- TODO confirm) 
body['BMI'] = body.weight / (body.height * body.height / 10000) 

# same formula written as an expression string; inplace = True adds 
# (here: overwrites) the BMI column on `body` itself 

body.eval('BMI = weight / (height * height / 10000)', inplace = True)

######################################################################
# create a subset 
# sometimes, need to reset the index 
######################################################################

# males aged 2 to 5 (both ends included); the filtered rows keep their 
# original index labels, so renumber them from 0 
boys = body[
    (body['gender'] == 'M') & (body['age'] >= 2) & (body['age'] <= 5)
].reset_index(drop = True)

# same filter written as a query string (result is displayed, not stored) 
body.query('(gender == "M") and (age >= 2) and (age <= 5)')
# note: query() is used here on data read WITHOUT .convert_dtypes(); 
# the extension integer dtypes do not work with query 

######################################################################
# compute summary statistics by groups
######################################################################

# keep observations with age 18 or above 
body_selected = body[body['age'] >= 18]

# long version: one grouped summary per statistic, then glue them side by side 
a1 = body_selected.groupby(['race', 'gender'])[['SEQN']].count()     # group size (non-missing SEQN) 
a2 = body_selected.groupby(['race', 'gender'])[['height']].mean()    # mean height 
a3 = body_selected.groupby(['race', 'gender'])[['weight']].mean()    # mean weight 
(
    pd.concat([a1, a2, a3], axis = 1)
    .reset_index()       # turn the (race, gender) index back into columns 
    .rename(columns = {'SEQN': 'count', 
                       'height': 'meanheight', 
                       'weight': 'meanweight'})
)

# equivalent, but easier to read: named aggregation in a single pass 

(
    body_selected
    .groupby(['race', 'gender'], as_index = False)
    .agg(count = ('SEQN', 'count'), 
         meanheight = ('height', 'mean'), 
         meanweight = ('weight', 'mean'))
)

######################################################################
# find the first observation in each group after sorting 
######################################################################

# sort so that, within each (gender, race) group, height is in descending order 
body = body.sort_values(['gender', 'race', 'height'], 
                        ascending = [True, True, False])

# first(): first non-missing entry of EACH COLUMN within the group 
# (not necessarily taken from one single row) 
(
    body.groupby(['gender', 'race'])
    .first()
    .reset_index()
    .loc[:, ['SEQN', 'gender', 'race', 'height']]
)

# nth(0): the first row of each group, taken as a whole 
(
    body.groupby(['gender', 'race'])
    .nth(0)
    .reset_index()
    .loc[:, ['SEQN', 'gender', 'race', 'height']]
)

######################################################################
# coding categorical variable  
######################################################################

body.race.value_counts()

# row-by-row version (slow): start every row at 999 and overwrite it 
# only when the race label is one of the known categories 
loop_codes = {
    'Non-Hispanic White': 1, 
    'Mexican American': 2, 
    'Non-hispanic Black': 3, 
    'Other Hispanic': 4, 
    'Other': 5, 
}

body['race_code'] = 999
for index, row in body.iterrows():
    label = row['race']
    if label in loop_codes:
        # .at writes a single cell addressed by index label and column name 
        body.at[index, 'race_code'] = loop_codes[label]

body.race_code.value_counts()

# equivalent, but easier to read and faster 
# code.get(label, 999) keeps 999 for any label that is not in the 
# dictionary (including missing values), exactly like the loop version; 
# a plain code[label] would raise KeyError for such rows 

code = {'Non-Hispanic White': 1, 
        'Mexican American':   2, 
        'Non-hispanic Black': 3, 
        'Other Hispanic':     4, 
        'Other':              5}
body['race_code'] = [code.get(i, 999) for i in body.race]

# equivalent and vectorised: look every label up in the dictionary with map() 
# (replace() from string labels to integer codes relies on an automatic 
#  object -> integer downcast that pandas 2.2 deprecates, and it leaves 
#  labels missing from the dictionary as strings in a mixed column) 
# map() returns NaN for labels missing from `code`; they get 999 as above 

body['race_code2'] = body.race.map(code).fillna(999).astype('int64') 

######################################################################
# fill na with a string 
######################################################################

body['marital'].isna().sum()                                  # number of missing values before filling 
body['marital'] = body['marital'].fillna(value = "no info")   # replace missing values by a label 
body['marital'].value_counts()                                # "no info" is now counted as a category 

######################################################################
# THE END
######################################################################