# FALL 2022 HDFS 8050
##Introduction to R/RStudio
What data types are available in R?
What is an object?
How can subsets be extracted from vectors?
How does R treat missing values?
How can we deal with missing values in R?
```{r}
3 + 5
12 / 7
grade <- 1.0
```
<- is the assignment operator. It assigns values on the right to objects on the left.
In RStudio, typing Alt + - (push Alt at the same time as the - key) will write <- in a single keystroke in a PC, while typing Option + - (push Option at the same time as the - key) does the same in a Mac.
```{r}
grade <- 1.0 # doesn't print anything
```
```{r}
(grade <- 1.0) # putting parenthesis around the call prints the value of `grade`
```
```{r}
grade # and so does typing the name of the object
```
```{r}
grade <- 1
```
```{r}
(grade <- 1) # assign a value and print it
grade
```
```{r}
grade <- 3 # in 4 point system, grade 3
```
```{r}
grade_new <- grade*2.5
```
## Functions
```{r}
sqrt(16)
```
```{r}
grade_results<- sqrt(grade_new)
```
```{r}
?sqrt
```
```{r}
args(sqrt)
```
```{r}
temp <- 5.14789655891
```
```{r}
round(temp)
```
```{r}
?round
```
```{r}
round(temp, digits = 3)
```
```{r}
round(digits=2, x=temp)
```
## Vectors and data types
```{r}
hh_members <- c(3, 7, 10, 6)
hh_members
```
```{r}
education <- c("no_education", "highschool", "college")
education
```
```{r}
length(hh_members)
```
```{r}
length(education)
```
```{r}
typeof(hh_members)
```
```{r}
typeof(education)
```
The function str() provides an overview of the structure of an object and its elements. It is a useful function when working with large and complex objects:
```{r}
str(hh_members)
```
```{r}
str(education)
```
You can use the c() function to add other elements to your vector:
```{r}
hh_items <- c("bicycle", "radio", "television")
```
```{r}
hh_items <- c(hh_items, "mobile_phone") # add to the end of the vector
```
```{r}
hh_items <- c("car", hh_items) # add to the beginning of the vector
hh_items
```
```{r}
num_char <- c(1, 2, 3, "a")
```
```{r}
num_logical <- c(1, 2, 3, TRUE)
```
```{r}
char_logical <- c("a", "b", "c", TRUE)
```
```{r}
tricky <- c(1, 2, 3, "4")
```
## Subsetting vectors
```{r}
education <- c("no_education", "highschool", "college")
education[2]
```
```{r}
education[c(3, 2)]
```
```{r}
more_education <- education[c(1, 2, 3, 2, 1, 3)]
more_education
```
R indices start at 1.
## Conditional subsetting
```{r}
hh_members <- c(3, 7, 10, 6)
hh_members[c(TRUE, FALSE, TRUE, TRUE)]
```
```{r}
hh_members > 5 # will return logicals with TRUE for the indices that meet the condition
```
```{r}
hh_members[hh_members > 5]
```
```{r}
hh_members[hh_members < 4 | hh_members > 7]
```
```{r}
hh_members[hh_members >= 4 & hh_members <= 7]
```
```{r}
hh_items <- c("car", "bicycle", "radio", "television", "mobile_phone")
```
```{r}
hh_items[hh_items == "car" | hh_items == "bicycle"] # returns both car and bicycle
```
```{r}
hh_items %in% c("car", "bicycle")
```
```{r}
hh_items %in% c("car", "bicycle", "motorcycle", "truck", "boat", "bus")
```
```{r}
hh_items[hh_items %in% c("car", "bicycle", "motorcycle", "truck", "boat", "bus")]
```
```{r}
hhmembers <- c(3,5,10,2)
typeof(hhmembers)
```
```{r}
education <- c("no_education", "highschool", "college")
typeof(education)
str(education)
```
```{r}
library(nlme)
View(BodyWeight) # shows dataset
mydata <- BodyWeight # save as mydata
```
```{r}
str(mydata)
mydata$Time >10
mydata[3,2]
mydata[2:15 ,1:3]
mydatasubset <- mydata[which(mydata$Time >10),]
mysubset2 <-mydata[which(mydata$Time >10 & mydata$Diet == 3),]
#weight < 400
#diet < 3
mysubset5 <- mydata[which((mydata$weight <400 & mydata$Diet ==1)
( mydata$weight <400 &mydata$Diet ==2)),]
str(mydata)
```
## Missing data
```{r}
rooms <- c(2, 1, 1, NA, 7)
mean(rooms)
```
```{r}
max(rooms)
```
```{r}
mean(rooms, na.rm = TRUE)
```
```{r}
max(rooms, na.rm = TRUE)
```
If your data include missing values, you may want to become familiar with the functions is.na(), na.omit(), and #complete.cases().
Extract those elements which are not missing values.
The ! character is also called the NOT operator
```{r}
variable1 <- c(1,2,NA,3,4,5,6,7,NA)
#complete.cases
#mean imputation-- week 6,
# don't use this line if you have only one variable
#variable1_R <- variable1[which(variable1 != NA),]
```
##recode variable1 into a new variable and remove NA
```{r}
# shows missing cases
is.na(variable1)
# shows not missing cases
!is.na(variable1)
variable1_R <- variable1[!is.na(variable1)]
variable1_R
mean(variable1_R)
max(variable1_R)
args(mean)
mean(variable1, na.rm = TRUE)
```
```{r}
args(complete.cases)
?complete.cases
variable2 <- variable1[complete.cases(variable1)]
variable2
variable1_R
```
## Regression Intro
```{r}
install.packages("tidyverse")
install.packages("psych")
install.packages("ggplot2")
```
```{r}
library(tidyverse)
library(psych)
library(ggplot2)
```
```{r}
install.packages("Rcmdr")
library(Rcmdr)
```
```{r}
filepath <- "https://quantdev.ssri.psu.edu/sites/qdev/files/wisc3raw_gender.csv"
#read in the .csv file using the url() function
wisc3raw <- read.csv(file=url(filepath),header=TRUE)
saveRDS(wisc3raw, file= "wisc3raw.R")
```
https://quantdev.ssri.psu.edu/tutorials/regression-continuous-count-and-binary-outcomes
## Prelim - Reading in Data File
```{r}
#set filepath for data file
filepath <- "https://quantdev.ssri.psu.edu/sites/qdev/files/wisc3raw_gender.csv"
#read in the .csv file using the url() function
wisc3raw <- read.csv(file=url(filepath),header=TRUE)
wiscsub <- wisc3raw[ , c("id","verb1","verb2","verb4","verb6","momed","grad", "female")]
describe(wiscsub)
describe(wiscsub$verb2)
```
```{r}
ggplot(data=wiscsub, aes(x=verb2)) +
geom_histogram(binwidth=2.5, fill="white", color="black", boundary=0) +
xlab("Verbal Ability Grade 2") + ylab("Count") +
xlim(0,50) +
theme_classic()
```
```{r}
library(tidyverse)
library(psych)
library(ggplot2)
```
```{r}
install.packages("Rcmdr")
library(Rcmdr)
```
```{r}
filepath <- "https://quantdev.ssri.psu.edu/sites/qdev/files/wisc3raw_gender.csv"
#read in the .csv file using the url() function
wisc3raw <- read.csv(file=url(filepath),header=TRUE)
saveRDS(wisc3raw, file= "wisc3raw.R")
```
```{r}
summary(wisc3raw)
names(wisc3raw)
wisc3raw$grad <- as.factor(wisc3raw$grad)
wisc3raw$female <- as.factor(wisc3raw$female)
```
```{r}
summary(wisc3raw)
```
```{r}
library(Rcmdr)
```
```{r}
library(ggplot2)
```
```{r}
ggplot(data=wisc3raw, aes(x=verb2)) +
geom_histogram(stat = "bin", binwidth = 2.5,
fill = "white", color = "black", boundary = 0) +
xlab("Verbal Ability Grade 2") +
ylab("Count") +
theme_classic()
```
### Week 2 ends here
```{r}
model1 <- lm(formula = verb2 ~ 1,
data = wiscsub,
na.action = na.exclude)
summary(model1)
```
```{r}
#After noting that there is no R-squared given, we ask for it explicitly
summary(model1)$r.squared
```
```{r}
mean(wiscsub$verb2)
ggplot(wiscsub, aes(x=verb1)) +
geom_histogram(binwidth=2.5, fill="white", color="black", boundary=0) +
xlab("Verbal Ability Grade 1") +
ylab("Count") +
xlim(0,50) +
theme_classic()
```
```{r}
ggplot(wiscsub, aes(x=verb1, y = verb2)) +
geom_point() +
stat_ellipse(color="blue", alpha=.7) +
xlab("Verbal Ability Grade 1") +
ylab("Verbal Ability Grade 2") +
ylim(0,45) +
xlim(0,45) +
theme_classic()
```
```{r}
model2 <- lm(verb2 ~ 1 + verb1,
data = wiscsub,
na.action = na.exclude)
summary(model2)
```
```{r}
ggplot(data=wiscsub, aes(x=verb1,y=verb2)) +
geom_point(size = 2, shape=19) +
geom_smooth(method=lm,se=TRUE,fullrange=TRUE,colour="red", size=2) +
labs(x= "Verbal Ability Grade 1", y= "Verbal Ability Grade 2") +
xlim(0,50) +
ylim(0,50) +
#theme with white background
theme_bw() +
#eliminate background, gridlines, and chart border
theme(
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank()
) +
#draws x and y axis line
theme(axis.line = element_line(color = 'black')) +
#set size of axis labels and titles
theme(axis.text = element_text(size=12),
axis.title = element_text(size=14))
```
```{r}
#calculate the mean centered variable
wiscsub$verb1_star <- wiscsub$verb1 - mean(wiscsub$verb1, na.rm = TRUE)
```
```{r}
model3 <- lm(verb2 ~ 1 + verb1_star,
data = wiscsub,
na.action = na.exclude)
summary(model3)
```
```{r}
ggplot(data=wiscsub, aes(x=verb1_star,y=verb2)) +
geom_point(size = 2, shape=19) +
geom_smooth(method=lm,se=TRUE,fullrange=TRUE,colour="red", size=2) +
labs(x= "Sample-Centered Verbal Ability Grade 1", y= "Verbal Ability Grade 2") +
xlim(-20,20) +
ylim(0,50) +
#theme with white background
theme_bw() +
#eliminate background, gridlines, and chart border
theme(
plot.background = element_blank()
,panel.grid.major = element_blank()
,panel.grid.minor = element_blank()
,panel.border = element_blank()
) +
#draws x and y axis line
theme(axis.line = element_line(color = 'black')) +
#set size of axis labels and titles
theme(axis.text = element_text(size=12),
axis.title = element_text(size=14))
```
```{r}
describe(wiscsub$momed)
```
```{r}
ggplot(data=wiscsub, aes(x=momed)) +
geom_histogram(binwidth=2.5, fill="white", color="black") +
xlim(0,20) +
xlab("Number of Years of Mother's Education") +
ylab("Count") +
theme_classic()
```
```{r}
ggplot(data=wiscsub, aes(x=momed, y = verb2)) +
geom_point() +
xlim(0,20) +
ylim(0,50) +
xlab("Mother's Education (Years)") + ylab("Verbal Ability Grade 2") +
theme_classic()
```
```{r}
# Calculate mean-centered version of mother's education variable
wiscsub$momed_star <- wiscsub$momed - mean(wiscsub$momed)
```
```{r}
model4 <- lm(verb2 ~ 1 + verb1_star + momed_star,
data = wiscsub,
na.action = na.exclude)
summary(model4)
```
```{r}
#create an empty matrix for the predicted scores
pred_matrix <- matrix(NA, nrow = 32, ncol = 13)
#fill in all the cells of the matrix
for(momed in c(-5:7)){
for(verb in c(-15:16)){
curr_row <- verb + 16 #why would things not come out right without this?
curr_col <- momed + 6
#prediction equation
pred_matrix[curr_row, curr_col] = 25.41534 + 0.66786*(verb) + 0.41454*(momed)
}
}
#Making 3-d perspective plot
persp(x = -15:16, y = -5:7, z = pred_matrix,
col="lightblue", shade=.7, border = NA, expand = .5,
xlab="Grade 1 Verbal ",ylab="Years of Mom Ed",zlab="Grade 2 Verbal",
main = "Prediction Surface for Grade 2 Verbal",
theta=-30,phi=30,d=5)
```
```{r}
model5 <- lm(verb2 ~ 1 + verb1_star + grad + I(verb1_star*grad),
data = wiscsub,
na.action = na.exclude)
summary(model5)
```
```{r}
#Alternative formula
# model5 <- lm(verb2 ~ 1 + verb1_star + grad + verb1_star:grad,
# data = wiscsub,
# na.action = na.exclude)
# summary(model5)
```
```{r}
#plot of moderation
ggplot(data=wiscsub,
aes(x=verb2,y=verb1, color = factor(grad))) +
geom_jitter() +
stat_smooth(method='lm', se=TRUE, fullrange=TRUE) +
xlab("1st Grade Verbal Score") +
ylab("2nd Grade Verbal Score") +
guides(color=guide_legend(title="HS Grad")) +
theme_bw()
```
```{r}
wiscsub$momed_count <- round(wiscsub$momed)
```
```{r}
ggplot(data=wiscsub, aes(x=momed_count)) +
geom_histogram(binwidth = 1, fill = 'white', color = 'black') +
xlim(0,20) +
xlab("Number of Years of Mother's Education") +
ylab("Count") +
theme_classic()
```
##
```{r}
# Note that the function is glm(), and the code to use it is almost
# identical to lm() from before. The only extra option is that we
# use the option 'family = "poisson" ' at the end to state that
# the outcome variable is a count variable.
# Then the glm() function will invoke the log-link function we
# described before.
```
```{r}
model6 <- glm(momed_count ~ 1,
family = "poisson",
data = wiscsub,
na.action = na.exclude)
summary(model6)
```
```{r}
exp(2.41293
```
```{r}
confint(model6)
```
##
```{r}
ggplot(data=wiscsub,
aes(x=verb1_star,y=log(momed_count))) +
geom_jitter()+
xlab("Sample-Mean Centered 1st Grade Verbal Score") +
ylab("Log Number of Years of Mothers' Education") +
theme_bw()
```
##
```{r}
model7 <- glm(momed_count ~ 1 + verb1_star,
family = "poisson",
data = wiscsub,
na.action = na.exclude)
summary(model7)
```
```{r}
exp(2.407459)
```
```{r}
confint(model7)
```
```{r}
ggplot(data=wiscsub,
aes(x=verb1_star,y=log(momed_count))) +
geom_jitter()+
xlab("Sample-Mean Centered 1st Grade Verbal Score") +
ylab("Log Number of Years of Mothers' Education") +
theme_bw()+
stat_smooth(method = 'lm', se = TRUE)
```
##
```{r}
ggplot(data=wiscsub,
aes(x=verb1_star,y=momed_count)) +
geom_jitter()+
xlab("Sample-Mean Centered 1st Grade Verbal Score") +
ylab("Number of Years of Mothers' Education") +
theme_bw()+
stat_smooth(method = 'glm', method.args = list(family = "poisson"), se = TRUE)
```
##
```{r}
model8 <- glm(momed_count ~ 1 + female + verb1_star + I(verb1_star*female),
family = "poisson",
data = wiscsub,
na.action = na.exclude)
summary(model8)
```
##
```{r}
ggplot(data=wiscsub,
aes(x=verb1_star,y=log(momed_count), color = factor(female))) +
geom_jitter() +
stat_smooth(method = 'lm', se = TRUE, fullrange = TRUE) +
xlab("Sample-Mean Centered 1st Grade Verbal Score") +
ylab("Log Number of Years of Mothers' Education") +
guides(color=guide_legend(title=paste("Female"))) +
theme_bw()
```
```{r}
ggplot(data=wiscsub,
aes(x=verb1_star,y=momed_count, color = factor(female))) +
geom_jitter() +
stat_smooth(method = 'glm', method.args = list(family = "poisson"), se = TRUE, fullrange=TRUE) + xlab("Sample-Mean Centered 1st Grade Verbal Score") +
ylab("Number of Years of Mothers' Education") +
guides(color=guide_legend(title=paste("Female"))) +
theme_bw()
```
##
```{r}
model9 <- glm(grad ~ 1,
family = "binomial",
data = wiscsub,
na.action = na.exclude)
summary(model9)
```
```{r}
exp(-1.2340)/(1 + exp(-1.2340))
```
```{r}
mean(wiscsub$grad)
```
##
```{r}
model10 <- glm(grad ~ 1 + verb1_star,
family = "binomial",
data = wiscsub,
na.action = na.exclude)
summary(model10)
```
##
```{r}
exp(-1.39038 )/(1+exp(-1.39038))
```
```{r}
exp(coef(model10))
```
##
```{r}
exp(cbind(OR = coef(model10), confint(model10)))
```
##
```{r}
ggplot(data=wiscsub,
aes(x=verb1_star,y=grad)) +
geom_point(alpha = .08, size = 10) +
xlab("1st Grade Verbal Score") +
ylab("Mothers' Graduation") +
theme_bw() +
stat_smooth(method = 'glm', method.args = list(family = "binomial"), se = TRUE)
```
##
```{r}
model11 <- glm(grad ~ 1 + female + verb1_star + I(verb1_star*female) ,
family = "binomial",
data = wiscsub,
na.action = na.exclude)
summary(model11)
```
## last
```{r}
ggplot(data=wiscsub,
aes(x=verb1_star,y=grad, color = factor(female))) +
geom_point(alpha = .08, size = 10) +
xlab("1st Grade Verbal Score") +
ylab("Mothers' Graduation") +
guides(color=guide_legend(title="Female")) +
theme_bw() +
stat_smooth(method = 'glm', method.args = list(family = "binomial"), se = TRUE, fullrange = TRUE)
```