# R Lab
# The reason is ....
# Store 7 in two objects, a and b, then multiply the two objects.
a <- 7
b <- 7
a * b  # prints 49
# R is an object-oriented statistical package! For you, that means traditional
# point-and-click forms of analysis are gone. In R, instead, we ``assign'' things
# to ``objects'' and then manipulate those objects to perform analysis. The way
# this looks is often much more like actual computer code than anything else. Don't
# worry if it doesn't come to you right away. It involves a shift in the way that
# you think about how to do analysis. But I promise that it gets better.
# R is composed of two windows: the R Console and the R Editor. The R Console
# is where all of the analysis happens. The R Editor is where you physically type
# into your R Script the command that you want R to run. You continuously save your
# R script, so any time, you can come back later and get to where you were. You
# save your R Console far less often. We run commands from the Editor into the
# Console by CTRL + R or right-click, ``Run line or section.'' (On Mac: Command + Enter.)
# Comments in R are written with the # sign. R will ignore any line
# that begins with this symbol.
# If you ever see a command you don't know, type ? followed by the command's
# name (for example, ?mean) in the R Console.
# Since it is object-oriented, the most important thing we're going to do is
# assign things to objects. This involves the assignment arrow, <-, which can
# also be replaced by an equal sign, =. The thing on the left of the assignment
# arrow is the object. The thing on the right is what we're assigning! It can be
# almost anything: a single number, a group of numbers, a ``vector'' (a column of
# numbers, just like a column of numbers in a dataset).
# Assigning values to objects (we can name these objects anything we want)
a = 7  # the equal sign assigns the value 7 to the object a
a = 1  # assigning again overwrites the old value
print(a)  # one way to look inside an object ...
a  # ... or just type its name
a <- 1  # the assignment arrow does the same job as the equal sign
print(a)
a
# Two parallel vectors with 4,000 observations each: a 0/1 indicator (male) and
# a height. Every fourth height is missing (NA). The two vectors must have the
# same length, otherwise indexing one by the other pads the result with NA.
male <- rep(c(0, 0, 1, 1), 1000)
height <- rep(c(5.5, 5.6, 6.0, NA), 1000)
# Two equivalent ways to call out elements 4 through 10.
height[c(4, 5, 6, 7, 8, 9, 10)]
height[4:10]
# Two equivalent ways to keep only the heights where male is 0.
height[male == 0]
subset(height, male == 0)
subset(height, male == 1)
# A single NA makes the mean NA unless we ask R to remove missing values.
mean(height)
mean(height, na.rm = TRUE)
# Group means. The male == 1 group contains the NAs, so it needs na.rm = TRUE.
mean(height[male == 0])
mean(height[male == 1], na.rm = TRUE)
# Like mean(), sd() returns NA when any value is missing.
sd(height)
# Two ten-element vectors: one with no spread at all, and one whose values fall
# on both sides of zero. Compare their means and standard deviations.
vector1 <- rep(1, times = 10)
vector2 <- as.numeric(c(1:6, -(1:4)))
mean(vector1)
mean(vector2)
sd(vector1)
sd(vector2)
# height contains NA values, so na.rm = TRUE is needed to get a number back.
sd(height, na.rm = TRUE)
# Open the help page for sd().
?sd
# Assignment uses the arrow <- (or the equal sign =). Note that <= is NOT
# assignment: it is the comparison "less than or equal to".
a = 2
b <- 3
# Arithmetic on objects works just like arithmetic on the numbers they hold.
b - a
c <- b - a
# d holds two numbers (1 and 2); arithmetic with c is applied to each of them.
d = 1:2
d
c + d
c * d
# Objects can hold text as well as numbers.
f = "I'm so confused"
# To see what is in objects, you can either ``print'' or just type the object name.
# Show the contents of f with print(), then by typing its name; print() also
# accepts text directly.
print(f)
f
print("It will be ok")
# Since we're interested in data analysis, that is, finding numeric patterns in
# groups of data, we'll be working with vectors most of the time. The basic function
# here is ``c()'', which is short for concatenate.
# This is an example of creating a new object called ``vector'' with the numbers
# 1 to 5.
vector <- c(1, 2, 3, 4, 5)
vector
# We can also use c() to join together two objects. Recall our object ``a'' from earlier
# (it's the number 2). What do you think the following line will do? Were you right?
vector2 <- c(a, vector)
vector2
# If we wanted a vector with more than five numbers (say 50), we might not want to type
# all 50 numbers. The colon : means ``from ... to''. So 1:5 means 1, 2, 3, 4, 5.
# Let's see this in action!
quicker <- 1:10
# Another useful command is seq, which is short for sequence! It creates a vector of numbers
# starting with one number and ending in another, moving in steps of size ``by''.
quicker2 <- seq(from = 1, to = 10, by = 0.5)
# What if we forgot what ``seq'' did, as a command?
?seq
# R offers three main advantages over its friends (other than being free).
# The first is being able to manipulate specific elements of variables more easily.
# The second is being able to read in multiple datasets. The last is pretty plots.
# Calling out elements of vectors. When you see square brackets, you are calling
# out that ``element'' of that object. So we're calling out the 3rd element of
# the object ``vector''
vector[3]
vector2[3]
# R is actually a lot smarter than you might think. You can ask to see a set of elements
# using the : method from earlier
quicker[3:8]
# Or maybe you want to see only the elements that fit a criterion?
# How about all the elements of ``quicker'' that are greater than 5
quicker[quicker > 5]
# Or the elements of vector2 that are exactly equal to 1?
vector2[vector2 == 1]
# So far we've asked R to do boring things. It can also do fun things!
# Generate a random normal number!
rnorm(1)
# Generate a 0 or a 1 (heads or tails!)
rbinom(1, 1, 0.5)
# Calculate the odds of winning the Powerball!
# (five white balls drawn from 69 without replacement, then one red ball from 26)
(5 / 69) * (4 / 68) * (3 / 67) * (2 / 66) * (1 / 65) * (1 / 26)
# Published odds: 1 in 292,201,338
1 / 292201338
# Calculate your class grade without having to do it a bunch of times!
# (Object assignment is fun!)
# Scores for each graded component. Midterm is a whole vector (80, 81, ..., 100),
# so the final grade is computed for every possible midterm score at once.
Participation <- 90
Midterm <- 80:100
Research.Design <- 95
Annotated.Bibliography <- 85
Presentation <- 100
# Weighted sum of the components; the weights add up to 1.
final <- Participation * 0.20 + Midterm * 0.30 + Research.Design * 0.20 +
  Annotated.Bibliography * 0.20 + Presentation * 0.10
final
print(final)
# Put each midterm score next to the final grade it produces.
cbind(Midterm, final)
# Logical indexing again: heights where male is less than or equal to 0.
height[male <= 0]
# The final grade for one particular midterm score. The score asked for must be
# one that Midterm actually contains (80 to 100); otherwise nothing is returned.
final[Midterm == 90]
# Like we said, though, you're going to be using R for analysis. So we're especially
# interested in those things that give us descriptions of our data. The following
# are some useful descriptive statistics functions. In R, things with parentheses
# wrapped around them are FUNCTIONS. You've already learned a few functions: c(),
# seq(), and print() are all functions. Functions are the lifeblood of data analysis
# in R. We apply functions to objects in order to perform analysis.
# We can do a variety of univariate (single-variable) statistical analyses in R.
summary(quicker)  # minimum, quartiles, median, mean and maximum in one call
length(quicker)  # how many elements
sum(quicker)  # total of all elements
mean(quicker)  # arithmetic average
var(quicker)  # variance
max(quicker)  # largest value
min(quicker)  # smallest value
# What happens with missing data? Missing values can break simple functions, and
# real data sets have missing values all of the time. Notice this.
# Put a missing value (NA) in front of the values in quicker.
quickerNA <- c(NA, quicker)
mean(quickerNA)  # returns NA: one missing value makes the whole mean missing
# But notice also the solution!
mean(quickerNA, na.rm = TRUE)  # drop the NA inside mean()
mean(na.omit(quickerNA))  # or drop the NA before calling mean()
# Vectors are single columns of data. Matrices are columns stacked next to each other.
# This is important because most of our data are matrices.
# With only the data supplied, matrix() makes a single column. The optional
# arguments nrow, ncol and byrow control the layout.
matrix1 <- matrix(c(1, 2, 3, 4, 5, 6))
mat <- matrix(
  c(1, 2, 3, 4, 5, 6),
  nrow = 2,
  ncol = 3,
  byrow = TRUE
)
mat
# You'll notice that you're starting to see things to the right of a comma here.
# Those are optional arguments to functions. If we ever forgot what they are, how
# would we look them up?
?matrix
# Fill an 8-row matrix with 1 to 16, column by column (so it has 2 columns).
mat1 <- matrix(1:16, nrow = 8, byrow = FALSE)
mat1[8, 1]
mat1[3, 2]
mat1[, 1]
# Remember when we called out elements from vectors? We can do that with matrices, too!
# Always remember that R thinks of things in [rows,columns]. So when we ask for
# [1,1], we're asking for the element in the first row, first column. [2,2] is second
# row, second column. If we don't supply a number, like [,1], it thinks of that
# as all of the rows in the first column! Let's look.
# One element, then the whole first row, then the whole first column.
mat1[1, 1]
mat1[1, ]
mat1[, 1]
# Why would this break?
# mat1 has only 8 rows, so asking for row 26 stops with a
# "subscript out of bounds" error.
mat1[26,]
# Print the height vector again.
height
# Through your career, you're almost entirely going to be working with datasets.
# The easiest way to get your data in is when it is saved as a .csv file. This is a
# little-known option in Excel, but it standardizes how programs ``think'' about data.
# Read in dataset
# Reset a to 1.
a <- 1
# Read the survey file into a data frame. The path is absolute and specific to
# one machine; change it to wherever AWARDS.csv is saved.
data <- read.csv("/Users/teach/Downloads/AWARDS.csv")
names(data)  # list the column names
# NOTE(review): a bare `sex` stops with "object 'sex' not found" unless an
# object of that name exists in the workspace. Columns live inside the data
# frame and are reached with data$sex, as on the next line.
sex
data$sex
table(data$sex)  # count of rows in each category
# NOTE(review): the recode below compares sex against text labels, so mean() of
# the raw column presumably returns NA with a warning -- confirm.
mean(data$sex)
dim(data)  # number of rows and number of columns
# Recode sex into a 0/1 indicator: start with all NA, then fill in by category.
female <- rep(NA, length(data$sex))
female[data$sex == "Female"] <- 1
female
female[data$sex == "Male"] <- 0
female
table(data$sex)
table(female)
table(data$sex, female)  # cross-tab of old against new to check the recode
# NOTE(review): if any element of female is still NA these return NA; add
# na.rm = TRUE in that case.
mean(female)
sd(female)
data$prty
# Recode party identification to numbers; any label not listed below stays NA.
partyid <- rep(NA, length(data$prty))
partyid[data$prty == "Republican"] <- 1
partyid[data$prty == "Independent"] <- 2
partyid[data$prty == "Democrat"] <- 3
table(data$prty, partyid)  # check the recode against the original labels
summary(partyid)
# Same recode, except "Don't know/No answer" is placed with the Independents (2)
# instead of being left as NA.
partyidwithdks <- rep(NA, length(data$prty))
partyidwithdks[data$prty == "Republican"] <- 1
partyidwithdks[data$prty == "Independent"] <- 2
partyidwithdks[data$prty == "Democrat"] <- 3
partyidwithdks[data$prty == "Don't know/No answer"] <- 2
table(data$prty, partyidwithdks)
summary(partyidwithdks)
# Mean party ID overall, then where female is 1, then where female is 0.
mean(partyid, na.rm = TRUE)
mean(partyid[female == 1], na.rm = TRUE)
mean(partyid[female == 0], na.rm = TRUE)
# Statistical test for this difference
# Gender -> Partyid
# Gender is nominal
# Independent is ordinal
# Two-sample t-test comparing partyid between the female == 1 and female == 0 groups.
t.test(partyid[female == 1], partyid[female == 0])
# The raw difference between the two group means.
(mean(partyid[female == 1], na.rm = TRUE) - mean(partyid[female == 0], na.rm = TRUE))
# Version 1: five-point ideology scale, 1 = "Very conservative" up to
# 5 = "Very liberal". Any other label (including "DK/NA") stays NA.
ideology <- rep(NA, length(data$pphl))
ideology[data$pphl == "Very conservative"] <- 1
ideology[data$pphl == "Somewhat conservative"] <- 2
ideology[data$pphl == "Moderate"] <- 3
ideology[data$pphl == "Somewhat liberal"] <- 4
ideology[data$pphl == "Very liberal"] <- 5
# Version 2: same scale, but "DK/NA" is placed at the midpoint (3) with "Moderate".
ideologywithdks <- rep(NA, length(data$pphl))
ideologywithdks[data$pphl == "Very conservative"] <- 1
ideologywithdks[data$pphl == "Somewhat conservative"] <- 2
ideologywithdks[data$pphl == "Moderate"] <- 3
ideologywithdks[data$pphl == "DK/NA"] <- 3
ideologywithdks[data$pphl == "Somewhat liberal"] <- 4
ideologywithdks[data$pphl == "Very liberal"] <- 5
# Version 3: same scale, but "DK/NA" gets its own code (9) so that group can be
# picked out separately.
ideologywithdksallbythemselves <- rep(NA, length(data$pphl))
ideologywithdksallbythemselves[data$pphl == "Very conservative"] <- 1
ideologywithdksallbythemselves[data$pphl == "Somewhat conservative"] <- 2
ideologywithdksallbythemselves[data$pphl == "Moderate"] <- 3
ideologywithdksallbythemselves[data$pphl == "DK/NA"] <- 9
ideologywithdksallbythemselves[data$pphl == "Somewhat liberal"] <- 4
ideologywithdksallbythemselves[data$pphl == "Very liberal"] <- 5
# Cross-tab each new variable against the original labels to check the recodes.
table(data$pphl, ideology)
table(data$pphl, ideologywithdks)
table(data$pphl, ideologywithdksallbythemselves)
# Mean party ID within each ideology category. TRUE is spelled out because the
# shorthand T is an ordinary object that can be overwritten.
mean(partyid[ideology == 1], na.rm = TRUE) # 1.438889
mean(partyid[ideology == 2], na.rm = TRUE) # ????????? come back to this
mean(partyid[ideology == 3], na.rm = TRUE)
mean(partyid[ideology == 4], na.rm = TRUE)
mean(partyid[ideology == 5], na.rm = TRUE)
# Distribution of party ID at the two ends of the ideology scale.
table(partyid[ideology == 1])
table(partyid[ideology == 5])
# Party ID among the rows coded 9 on the ideology variable that keeps "DK/NA" separate.
table(partyid[ideologywithdksallbythemselves == 9])
summary(partyid[ideologywithdksallbythemselves == 9])
# Party ID where ideology is 3, separately for female == 1 and female == 0.
# summary() ignores na.rm (it reports the number of NAs itself), so the
# argument is not passed.
summary(partyid[female == 1 & ideology == 3])
summary(partyid[female == 0 & ideology == 3])
# Difference in mean party ID between the two groups where ideology is 3.
mean(partyid[female == 1 & ideology == 3], na.rm = TRUE) -
  mean(partyid[female == 0 & ideology == 3], na.rm = TRUE)
# NOTE(review): ratio of two hand-typed numbers, presumably copied from earlier
# output -- confirm where they came from.
0.02020677 / 0.05471094
# Cross-tab
table(partyid, ideology)  # rows are partyid values, columns are ideology values
# Test whether partyid and ideology are correlated.
cor.test(partyid, ideology)
?cor.test
# partyid
# ideology
names(data)  # list the column names again
# Labels and factors in R, reading codebooks
# Recoding data (why is it good to do here?)
# Bivariate hypotheses
# Comparing datasets
# Read the second survey file. The path is absolute and specific to one machine.
gunsdata <- read.csv("/Users/teach/Downloads/GUNS.csv")
names(gunsdata)
# NOTE(review): this prints Q34, but the recodes below use Q32 and Q33 --
# confirm which column was meant.
gunsdata$Q34
# Copy codes 1 to 4 of Q32 into ebola; every other code is left as NA.
ebola <- rep(NA, length(gunsdata$Q32))
ebola[gunsdata$Q32 == 1] <- 1
ebola[gunsdata$Q32 == 2] <- 2
ebola[gunsdata$Q32 == 3] <- 3
ebola[gunsdata$Q32 == 4] <- 4
table(ebola, gunsdata$Q32)  # check the recode
# Copy Q33 and turn code 9 into missing.
manboss <- gunsdata$Q33
manboss[manboss == 9] <- NA
manboss[1:10]  # look at the first ten values
# manboss 1 = prefer man
# 2 = prefer woman
# 3 = no pref
table(ebola, manboss)
chisq.test(ebola, manboss)
# NOTE(review): cor.test() treats the manboss codes as numbers, although the
# key above lists them as categories.
cor.test(ebola, manboss)
# jitter() adds a little random noise so points with identical codes do not
# sit exactly on top of each other.
plot(jitter(ebola), jitter(manboss))
gunsdata$D7A
gunsdata$D7B
# Build a five-point party scale from two columns. The assignments run in
# order, so a row matched by more than one line keeps the value assigned last.
partyidguns <- rep(NA, length(gunsdata$D7A))
partyidguns[gunsdata$D7A == 1] <- 1 # Republicans
partyidguns[gunsdata$D7B == 2] <- 2 # Lean GOP
partyidguns[gunsdata$D7B == 3] <- 3 # INDP
partyidguns[gunsdata$D7B == 1] <- 4 # Lean Dem
partyidguns[gunsdata$D7A == 2] <- 5 # Dem
# Cross-tab the new scale against each source column to check the recode.
table(partyidguns, gunsdata$D7A)
table(partyidguns, gunsdata$D7B)
# Open the help page for rep(), used below to create an all-NA vector.
?rep
# Copy codes 1 to 5 of Q8 into enthusiasm; every other code is left as NA.
enthusiasm <- rep(NA, length(gunsdata$Q8))
enthusiasm[gunsdata$Q8 == 1] <- 1
enthusiasm[gunsdata$Q8 == 2] <- 2
enthusiasm[gunsdata$Q8 == 3] <- 3
enthusiasm[gunsdata$Q8 == 4] <- 4
enthusiasm[gunsdata$Q8 == 5] <- 5
table(enthusiasm, gunsdata$Q8)  # check the recode
partyidguns
# lm() fits a linear regression. Its formula has the form
#   dependent variable ~ independent variable 1 + independent variable 2 + ...
?lm
# Fit the model before drawing it: abline(model) needs the model to exist.
model <- lm(enthusiasm ~ partyidguns)
summary(model)
# The model predicts enthusiasm (vertical axis) from partyidguns (horizontal
# axis), so the scatterplot uses the same orientation for the fitted line to
# match the points.
plot(jitter(partyidguns), jitter(enthusiasm))
abline(model)
cor.test(enthusiasm, partyidguns)
# NOTE(review): 0.04113 is typed in by hand, presumably the partyidguns slope
# from summary(model) -- confirm. Multiplying by (5 - 1) gives the predicted
# change in enthusiasm across the full 1-to-5 range of partyidguns.
0.04113 * (5 - 1)
# The same effect expressed as a share of the observed range of enthusiasm.
0.04 * (max(partyidguns, na.rm = TRUE) - min(partyidguns, na.rm = TRUE)) /
  (max(enthusiasm, na.rm = TRUE) - min(enthusiasm, na.rm = TRUE))