# FALL 2022 HDFS 8050 ##Introduction to R/RStudio What data types are available in R? What is an object? How can subsets be extracted from vectors? How does R treat missing values? How can we deal with missing values in R? ```{r} 3 + 5 12 / 7 grade <- 1.0 ``` <- is the assignment operator. It assigns values on the right to objects on the left. In RStudio, typing Alt + - (push Alt at the same time as the - key) will write <- in a single keystroke in a PC, while typing Option + - (push Option at the same time as the - key) does the same in a Mac. ```{r} grade <- 1.0 # doesn't print anything ``` ```{r} (grade <- 1.0) # putting parenthesis around the call prints the value of `grade` ``` ```{r} grade # and so does typing the name of the object ``` ```{r} grade <- 1 ``` ```{r} (grade <- 1) # assign a value and print it grade ``` ```{r} grade <- 3 # in 4 point system, grade 3 ``` ```{r} grade_new <- grade*2.5 ``` ## Functions ```{r} sqrt(16) ``` ```{r} grade_results<- sqrt(grade_new) ``` ```{r} ?sqrt ``` ```{r} args(sqrt) ``` ```{r} temp <- 5.14789655891 ``` ```{r} round(temp) ``` ```{r} ?round ``` ```{r} round(temp, digits = 3) ``` ```{r} round(digits=2, x=temp) ``` ## Vectors and data types ```{r} hh_members <- c(3, 7, 10, 6) hh_members ``` ```{r} education <- c("no_education", "highschool", "college") education ``` ```{r} length(hh_members) ``` ```{r} length(education) ``` ```{r} typeof(hh_members) ``` ```{r} typeof(education) ``` The function str() provides an overview of the structure of an object and its elements. It is a useful function when working with large and complex objects: ```{r} str(hh_members) ``` ```{r} str(education) ``` You can use the c() function to add other elements to your vector: ```{r} hh_items <- c("bicycle", "radio", "television") ``` ```{r} hh_items <- c(hh_items, "mobile_phone") # add to the end of the vector ``` ```{r} hh_items <- c("car", hh_items) # add to the beginning of the vector hh_items ``` ```{r} num_char <- c(1, 2, 3, "a") ``` ```{r} num_logical <- c(1, 2, 3, TRUE) ``` ```{r} char_logical <- c("a", "b", "c", TRUE) ``` ```{r} tricky <- c(1, 2, 3, "4") ``` ## Subsetting vectors ```{r} education <- c("no_education", "highschool", "college") education[2] ``` ```{r} education[c(3, 2)] ``` ```{r} more_education <- education[c(1, 2, 3, 2, 1, 3)] more_education ``` R indices start at 1. ## Conditional subsetting ```{r} hh_members <- c(3, 7, 10, 6) hh_members[c(TRUE, FALSE, TRUE, TRUE)] ``` ```{r} hh_members > 5 # will return logicals with TRUE for the indices that meet the condition ``` ```{r} hh_members[hh_members > 5] ``` ```{r} hh_members[hh_members < 4 | hh_members > 7] ``` ```{r} hh_members[hh_members >= 4 & hh_members <= 7] ``` ```{r} hh_items <- c("car", "bicycle", "radio", "television", "mobile_phone") ``` ```{r} hh_items[hh_items == "car" | hh_items == "bicycle"] # returns both car and bicycle ``` ```{r} hh_items %in% c("car", "bicycle") ``` ```{r} hh_items %in% c("car", "bicycle", "motorcycle", "truck", "boat", "bus") ``` ```{r} hh_items[hh_items %in% c("car", "bicycle", "motorcycle", "truck", "boat", "bus")] ``` ```{r} hhmembers <- c(3,5,10,2) typeof(hhmembers) ``` ```{r} education <- c("no_education", "highschool", "college") typeof(education) str(education) ``` ```{r} library(nlme) View(BodyWeight) # shows dataset mydata <- BodyWeight # save as mydata ``` ```{r} str(mydata) mydata$Time >10 mydata[3,2] mydata[2:15 ,1:3] mydatasubset <- mydata[which(mydata$Time >10),] mysubset2 <-mydata[which(mydata$Time >10 & mydata$Diet == 3),] #weight < 400 #diet < 3 mysubset5 <- mydata[which((mydata$weight <400 & mydata$Diet ==1) ( mydata$weight <400 &mydata$Diet ==2)),] str(mydata) ``` ## Missing data ```{r} rooms <- c(2, 1, 1, NA, 7) mean(rooms) ``` ```{r} max(rooms) ``` ```{r} mean(rooms, na.rm = TRUE) ``` ```{r} max(rooms, na.rm = TRUE) ``` If your data include missing values, you may want to become familiar with the functions is.na(), na.omit(), and #complete.cases(). Extract those elements which are not missing values. The ! character is also called the NOT operator ```{r} variable1 <- c(1,2,NA,3,4,5,6,7,NA) #complete.cases #mean imputation-- week 6, # don't use this line if you have only one variable #variable1_R <- variable1[which(variable1 != NA),] ``` ##recode variable1 into a new variable and remove NA ```{r} # shows missing cases is.na(variable1) # shows not missing cases !is.na(variable1) variable1_R <- variable1[!is.na(variable1)] variable1_R mean(variable1_R) max(variable1_R) args(mean) mean(variable1, na.rm = TRUE) ``` ```{r} args(complete.cases) ?complete.cases variable2 <- variable1[complete.cases(variable1)] variable2 variable1_R ``` ## Regression Intro ```{r} install.packages("tidyverse") install.packages("psych") install.packages("ggplot2") ``` ```{r} library(tidyverse) library(psych) library(ggplot2) ``` ```{r} install.packages("Rcmdr") library(Rcmdr) ``` ```{r} filepath <- "https://quantdev.ssri.psu.edu/sites/qdev/files/wisc3raw_gender.csv" #read in the .csv file using the url() function wisc3raw <- read.csv(file=url(filepath),header=TRUE) saveRDS(wisc3raw, file= "wisc3raw.R") ``` https://quantdev.ssri.psu.edu/tutorials/regression-continuous-count-and-binary-outcomes ## Prelim - Reading in Data File ```{r} #set filepath for data file filepath <- "https://quantdev.ssri.psu.edu/sites/qdev/files/wisc3raw_gender.csv" #read in the .csv file using the url() function wisc3raw <- read.csv(file=url(filepath),header=TRUE) wiscsub <- wisc3raw[ , c("id","verb1","verb2","verb4","verb6","momed","grad", "female")] describe(wiscsub) describe(wiscsub$verb2) ``` ```{r} ggplot(data=wiscsub, aes(x=verb2)) + geom_histogram(binwidth=2.5, fill="white", color="black", boundary=0) + xlab("Verbal Ability Grade 2") + ylab("Count") + xlim(0,50) + theme_classic() ``` ```{r} library(tidyverse) library(psych) library(ggplot2) ``` ```{r} install.packages("Rcmdr") library(Rcmdr) ``` ```{r} filepath <- "https://quantdev.ssri.psu.edu/sites/qdev/files/wisc3raw_gender.csv" #read in the .csv file using the url() function wisc3raw <- read.csv(file=url(filepath),header=TRUE) saveRDS(wisc3raw, file= "wisc3raw.R") ``` ```{r} summary(wisc3raw) names(wisc3raw) wisc3raw$grad <- as.factor(wisc3raw$grad) wisc3raw$female <- as.factor(wisc3raw$female) ``` ```{r} summary(wisc3raw) ``` ```{r} library(Rcmdr) ``` ```{r} library(ggplot2) ``` ```{r} ggplot(data=wisc3raw, aes(x=verb2)) + geom_histogram(stat = "bin", binwidth = 2.5, fill = "white", color = "black", boundary = 0) + xlab("Verbal Ability Grade 2") + ylab("Count") + theme_classic() ``` ### Week 2 ends here ```{r} model1 <- lm(formula = verb2 ~ 1, data = wiscsub, na.action = na.exclude) summary(model1) ``` ```{r} #After noting that there is no R-squared given, we ask for it explicitly summary(model1)$r.squared ``` ```{r} mean(wiscsub$verb2) ggplot(wiscsub, aes(x=verb1)) + geom_histogram(binwidth=2.5, fill="white", color="black", boundary=0) + xlab("Verbal Ability Grade 1") + ylab("Count") + xlim(0,50) + theme_classic() ``` ```{r} ggplot(wiscsub, aes(x=verb1, y = verb2)) + geom_point() + stat_ellipse(color="blue", alpha=.7) + xlab("Verbal Ability Grade 1") + ylab("Verbal Ability Grade 2") + ylim(0,45) + xlim(0,45) + theme_classic() ``` ```{r} model2 <- lm(verb2 ~ 1 + verb1, data = wiscsub, na.action = na.exclude) summary(model2) ``` ```{r} ggplot(data=wiscsub, aes(x=verb1,y=verb2)) + geom_point(size = 2, shape=19) + geom_smooth(method=lm,se=TRUE,fullrange=TRUE,colour="red", size=2) + labs(x= "Verbal Ability Grade 1", y= "Verbal Ability Grade 2") + xlim(0,50) + ylim(0,50) + #theme with white background theme_bw() + #eliminate background, gridlines, and chart border theme( plot.background = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.border = element_blank() ) + #draws x and y axis line theme(axis.line = element_line(color = 'black')) + #set size of axis labels and titles theme(axis.text = element_text(size=12), axis.title = element_text(size=14)) ``` ```{r} #calculate the mean centered variable wiscsub$verb1_star <- wiscsub$verb1 - mean(wiscsub$verb1, na.rm = TRUE) ``` ```{r} model3 <- lm(verb2 ~ 1 + verb1_star, data = wiscsub, na.action = na.exclude) summary(model3) ``` ```{r} ggplot(data=wiscsub, aes(x=verb1_star,y=verb2)) + geom_point(size = 2, shape=19) + geom_smooth(method=lm,se=TRUE,fullrange=TRUE,colour="red", size=2) + labs(x= "Sample-Centered Verbal Ability Grade 1", y= "Verbal Ability Grade 2") + xlim(-20,20) + ylim(0,50) + #theme with white background theme_bw() + #eliminate background, gridlines, and chart border theme( plot.background = element_blank() ,panel.grid.major = element_blank() ,panel.grid.minor = element_blank() ,panel.border = element_blank() ) + #draws x and y axis line theme(axis.line = element_line(color = 'black')) + #set size of axis labels and titles theme(axis.text = element_text(size=12), axis.title = element_text(size=14)) ``` ```{r} describe(wiscsub$momed) ``` ```{r} ggplot(data=wiscsub, aes(x=momed)) + geom_histogram(binwidth=2.5, fill="white", color="black") + xlim(0,20) + xlab("Number of Years of Mother's Education") + ylab("Count") + theme_classic() ``` ```{r} ggplot(data=wiscsub, aes(x=momed, y = verb2)) + geom_point() + xlim(0,20) + ylim(0,50) + xlab("Mother's Education (Years)") + ylab("Verbal Ability Grade 2") + theme_classic() ``` ```{r} # Calculate mean-centered version of mother's education variable wiscsub$momed_star <- wiscsub$momed - mean(wiscsub$momed) ``` ```{r} model4 <- lm(verb2 ~ 1 + verb1_star + momed_star, data = wiscsub, na.action = na.exclude) summary(model4) ``` ```{r} #create an empty matrix for the predicted scores pred_matrix <- matrix(NA, nrow = 32, ncol = 13) #fill in all the cells of the matrix for(momed in c(-5:7)){ for(verb in c(-15:16)){ curr_row <- verb + 16 #why would things not come out right without this? curr_col <- momed + 6 #prediction equation pred_matrix[curr_row, curr_col] = 25.41534 + 0.66786*(verb) + 0.41454*(momed) } } #Making 3-d perspective plot persp(x = -15:16, y = -5:7, z = pred_matrix, col="lightblue", shade=.7, border = NA, expand = .5, xlab="Grade 1 Verbal ",ylab="Years of Mom Ed",zlab="Grade 2 Verbal", main = "Prediction Surface for Grade 2 Verbal", theta=-30,phi=30,d=5) ``` ```{r} model5 <- lm(verb2 ~ 1 + verb1_star + grad + I(verb1_star*grad), data = wiscsub, na.action = na.exclude) summary(model5) ``` ```{r} #Alternative formula # model5 <- lm(verb2 ~ 1 + verb1_star + grad + verb1_star:grad, # data = wiscsub, # na.action = na.exclude) # summary(model5) ``` ```{r} #plot of moderation ggplot(data=wiscsub, aes(x=verb2,y=verb1, color = factor(grad))) + geom_jitter() + stat_smooth(method='lm', se=TRUE, fullrange=TRUE) + xlab("1st Grade Verbal Score") + ylab("2nd Grade Verbal Score") + guides(color=guide_legend(title="HS Grad")) + theme_bw() ``` ```{r} wiscsub$momed_count <- round(wiscsub$momed) ``` ```{r} ggplot(data=wiscsub, aes(x=momed_count)) + geom_histogram(binwidth = 1, fill = 'white', color = 'black') + xlim(0,20) + xlab("Number of Years of Mother's Education") + ylab("Count") + theme_classic() ``` ## ```{r} # Note that the function is glm(), and the code to use it is almost # identical to lm() from before. The only extra option is that we # use the option 'family = "poisson" ' at the end to state that # the outcome variable is a count variable. # Then the glm() function will invoke the log-link function we # described before. ``` ```{r} model6 <- glm(momed_count ~ 1, family = "poisson", data = wiscsub, na.action = na.exclude) summary(model6) ``` ```{r} exp(2.41293 ``` ```{r} confint(model6) ``` ## ```{r} ggplot(data=wiscsub, aes(x=verb1_star,y=log(momed_count))) + geom_jitter()+ xlab("Sample-Mean Centered 1st Grade Verbal Score") + ylab("Log Number of Years of Mothers' Education") + theme_bw() ``` ## ```{r} model7 <- glm(momed_count ~ 1 + verb1_star, family = "poisson", data = wiscsub, na.action = na.exclude) summary(model7) ``` ```{r} exp(2.407459) ``` ```{r} confint(model7) ``` ```{r} ggplot(data=wiscsub, aes(x=verb1_star,y=log(momed_count))) + geom_jitter()+ xlab("Sample-Mean Centered 1st Grade Verbal Score") + ylab("Log Number of Years of Mothers' Education") + theme_bw()+ stat_smooth(method = 'lm', se = TRUE) ``` ## ```{r} ggplot(data=wiscsub, aes(x=verb1_star,y=momed_count)) + geom_jitter()+ xlab("Sample-Mean Centered 1st Grade Verbal Score") + ylab("Number of Years of Mothers' Education") + theme_bw()+ stat_smooth(method = 'glm', method.args = list(family = "poisson"), se = TRUE) ``` ## ```{r} model8 <- glm(momed_count ~ 1 + female + verb1_star + I(verb1_star*female), family = "poisson", data = wiscsub, na.action = na.exclude) summary(model8) ``` ## ```{r} ggplot(data=wiscsub, aes(x=verb1_star,y=log(momed_count), color = factor(female))) + geom_jitter() + stat_smooth(method = 'lm', se = TRUE, fullrange = TRUE) + xlab("Sample-Mean Centered 1st Grade Verbal Score") + ylab("Log Number of Years of Mothers' Education") + guides(color=guide_legend(title=paste("Female"))) + theme_bw() ``` ```{r} ggplot(data=wiscsub, aes(x=verb1_star,y=momed_count, color = factor(female))) + geom_jitter() + stat_smooth(method = 'glm', method.args = list(family = "poisson"), se = TRUE, fullrange=TRUE) + xlab("Sample-Mean Centered 1st Grade Verbal Score") + ylab("Number of Years of Mothers' Education") + guides(color=guide_legend(title=paste("Female"))) + theme_bw() ``` ## ```{r} model9 <- glm(grad ~ 1, family = "binomial", data = wiscsub, na.action = na.exclude) summary(model9) ``` ```{r} exp(-1.2340)/(1 + exp(-1.2340)) ``` ```{r} mean(wiscsub$grad) ``` ## ```{r} model10 <- glm(grad ~ 1 + verb1_star, family = "binomial", data = wiscsub, na.action = na.exclude) summary(model10) ``` ## ```{r} exp(-1.39038 )/(1+exp(-1.39038)) ``` ```{r} exp(coef(model10)) ``` ## ```{r} exp(cbind(OR = coef(model10), confint(model10))) ``` ## ```{r} ggplot(data=wiscsub, aes(x=verb1_star,y=grad)) + geom_point(alpha = .08, size = 10) + xlab("1st Grade Verbal Score") + ylab("Mothers' Graduation") + theme_bw() + stat_smooth(method = 'glm', method.args = list(family = "binomial"), se = TRUE) ``` ## ```{r} model11 <- glm(grad ~ 1 + female + verb1_star + I(verb1_star*female) , family = "binomial", data = wiscsub, na.action = na.exclude) summary(model11) ``` ## last ```{r} ggplot(data=wiscsub, aes(x=verb1_star,y=grad, color = factor(female))) + geom_point(alpha = .08, size = 10) + xlab("1st Grade Verbal Score") + ylab("Mothers' Graduation") + guides(color=guide_legend(title="Female")) + theme_bw() + stat_smooth(method = 'glm', method.args = list(family = "binomial"), se = TRUE, fullrange = TRUE) ```