---
title: "Portfolio Assignment 2"
author: "Samuel Vinter"
date: '2022-10-27'
output: html_document
---

*Describe in 1-3 sentences how you would have aggregated RT across repeated measures of Word Length and Word Frequency prior to computing correlations, and what the expected degrees of freedom would have been in your subsequent correlations on aggregated measures.*

# Word Length

We have already averaged the data across subjects, but before testing whether reading time and word length are correlated, we would now have performed another aggregation, averaging reading time within each distinct word length. In this case, the degrees of freedom would have been 4.

# Word Frequency

We have already averaged the data across subjects, but before testing whether reading time and word frequency are correlated, we would now have performed another aggregation, averaging reading time within each distinct word frequency. In this case, the degrees of freedom would have been 25.

```{r dof_word_frequency, eval=FALSE}
# Averaging data by word frequency (count) in order to see the number of degrees of freedom.
# Not evaluated when knitting: corr_data is not created in this file.
DoF <- aggregate(count ~ word, data = corr_data, mean)
```
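To make the aggregation concrete, here is a minimal sketch of what that extra averaging step would look like. It is illustrative only and not evaluated when knitting, because it reuses objects (`data_notw`, `freq_rt_comp`) that are only created further down in this file; the helper names `rt_by_length` and `rt_by_freq` are placeholders.

```{r df_after_aggregation_sketch, eval=FALSE}
# Sketch only (eval = FALSE): data_notw and freq_rt_comp are created later in this file.

# One mean RT per distinct word length:
rt_by_length <- aggregate(RT ~ WL, data = data_notw, mean)

# One mean RT per distinct word frequency (count):
rt_by_freq <- aggregate(avg_rt ~ count, data = freq_rt_comp, mean)

# For a correlation computed on n aggregated pairs, df = n - 2.
nrow(rt_by_length) - 2  # 4, if the text contains 6 distinct word lengths
nrow(rt_by_freq) - 2    # 25, if there are 27 distinct frequency values
```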
```{r group_barplot, eval=FALSE}
# Making a barplot of the reading time in the two groups, to see the difference between them.
# Not evaluated when knitting: data_agr is not created in this file.
data_agr %>%
  ggplot(aes(x = Group, y = RT, fill = Group)) +
  geom_bar(stat = "summary") +
  geom_errorbar(stat = "summary", fun.data = "mean_se", color = "brown") +
  theme_minimal() +
  scale_fill_brewer(palette = "GnBu")
```

```{r group_barplot_cropped, eval=FALSE}
# Bar plot of reaction time by group.
# Not evaluated when knitting: data_cropped is not created in this file.
data_cropped %>%
  ggplot(aes(x = Group, y = RT, fill = Group)) +
  geom_bar(stat = "summary") +
  geom_errorbar(stat = "summary", fun.data = "mean_se", color = "brown", width = 0.5) +
  theme_minimal() +
  scale_fill_brewer(palette = "Paired") +
  labs(x = "Group", y = "Reaction Time (seconds)")
```

# Introduction

For this assignment, a reading experiment was conducted on 28 participants. In this file I analyze the data collected from that experiment. The data is first anonymized, aggregated, and "cleaned up". After that, I conduct correlational and hypothesis-testing analyses.

<hr>

# Setup section

<hr>

```{r setup}
# echo = TRUE prints code (and results) to the HTML output file.
knitr::opts_chunk$set(echo = TRUE)

# Set the working directory.
knitr::opts_knit$set(root.dir = '/work/CogSci_Methods01/portolio_assignment_02-sssvint/logfiles')

# Clear the global workspace.
rm(list = ls())
```

### Importing libraries

```{r libraries}
library("tidyverse")
library("pastecs")
library("dplyr")
library("conflicted")
library("stringr")
library("WRS2")
library("ggplot2")
```

### Anonymizing data (commented out for knitting)

```{r load_and_anonymize}
# Setting the working directory.
#setwd(".")

# Checks whether randomization has already been done: if not, run anonymization.
#(is_empty(list.files(path = getwd(), pattern = "*logfile_snew*", full.names = T)))
#files <- list.files(path = getwd(), pattern = "*logfile*", full.names = T)

# Anonymizing data.
#data_out <- list()

# Determines the number of files.
#num_files <- length(files)

# Randomly shuffles integers in range 1:num_files.
#rand_ids <- sample(seq(1, num_files, 1))

# Initializes the file counter.
#cnt_f <- 0

# Loops through the files.
#for (f in files){
  # Updates the counter.
  #cnt_f <- cnt_f + 1
  # Reads the current CSV file.
  #data_out[[f]] <- read_csv(file = f, col_names = TRUE)
  # Generates a new random ID ("snewXX", where XX is a number from rand_ids).
  #data_out[[f]]$ID <- paste(c("snew", rand_ids[cnt_f]), collapse = "")
  # Removes the column with the timestamp (if there is one).
  #data_out[[f]] <- subset(data_out[[f]], select = -c(Timestamp))
  # Defines the new output file name.
  #out_name <- paste(c(getwd(), "/logfile_", unique(data_out[[f]]$ID[1]), ".csv"), collapse = "")
  # Writes this CSV file.
  #write_csv(data_out[[f]], out_name, na = "NA")
  # Now delete the original file.
  #file.remove(f)
#}
```

### Reading anonymous data, binding it together

```{r load_anonymized_data}
# Get the new file names.
files <- list.files(path = getwd(), pattern = "*logfile_snew*", full.names = T)

# Read all the files and bind them together.
data <- map_dfr(files, read_csv, col_types = cols(Group = col_factor()))
```

### Importing database

```{r import_database}
wordfreq <- read_csv("/work/CogSci_Methods01/portolio_assignment_02-sssvint/unigram_freq.csv")
```

### Cleaning the data

```{r data_cleaning}
# Recoding the group labels.
data$Group <- gsub(pattern = "TRUE", replacement = "test", as.factor(data$Group))
data$Group <- gsub(pattern = "c", replacement = "control", as.factor(data$Group))

# Making sure everything is the correct type.
data$Age <- as.integer(data$Age)
data$Group <- as.factor(data$Group)
data$ID <- as.factor(data$ID)
data$Gender <- as.factor(data$Gender)
data$Native_Language <- as.factor(data$Native_Language)

# Renaming the first column.
data <- data %>%
  rename_at('...1', ~'Index')

# Removing text stimulus punctuation.
data$Word <- gsub('[[:punct:] ]+', '', data$Word)

# Converting to lowercase.
data$Word <- tolower(data$Word)

# Recomputing the word length column, since the original WL included punctuation.
data <- data %>%
  mutate(WL = str_length(Word))
```

### Removing outliers

```{r outlier-removal}
data_nout <- data %>%
  dplyr::filter(RT > mean(RT) - 3 * sd(RT) & RT < mean(RT) + 3 * sd(RT))
```

### Removing trigger word

```{r triggerword_removal}
data_notw <- data_nout %>%
  dplyr::filter(Word != "killed" & Word != "kissed")
```

### Aggregating data

```{r aggregate}
# Aggregating data by words, then removing the trigger words.
agr_notw <- aggregate(RT ~ Word + WL, data = data_nout, mean)
agr_notw <- agr_notw %>%
  dplyr::filter(Word != "killed" & Word != "kissed")

# Averaging data within indexed words (for checking the ordinal word number correlation).
agr_tw_ind <- aggregate(RT ~ Index + Word + WL, data = data_nout, mean)
agr_notw_ind <- agr_tw_ind %>%
  dplyr::filter(Word != "killed" & Word != "kissed")
```

### Checking the normality of the data

```{r normal}
# Plot base.
plot_norm <- ggplot(data_nout, aes(x = RT))

# Histogram with a normal curve overlaid.
plot_norm +
  geom_histogram(aes(y = ..density..), fill = "cyan", color = "lightblue") +
  stat_function(fun = dnorm,
                args = list(mean = mean(data_nout$RT), sd = sd(data_nout$RT)),
                color = "red", size = 1) +
  # Aesthetics.
  ggtitle("Histogram of Reaction Time") +
  labs(x = "Reaction Time (seconds)", y = "Density")

# QQ plot.
ggplot(data_nout, aes(sample = RT)) +
  stat_qq() +
  stat_qq_line(color = "red") +
  labs(x = "Theoretical Quantiles", y = "Sample Quantiles") +
  ggtitle("Normal QQ-plot of Reaction Time")

# Descriptive and normality statistics, rounded to two decimals.
round(pastecs::stat.desc(cbind(data_nout$RT), basic = FALSE, norm = TRUE), 2)
```

The reaction times are not normally distributed (skewness and kurtosis are very high, and the normality-test p-value is below .05), so I will be using a non-parametric method (Kendall's rank correlation) to test the correlations.
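As a brief aside on why a rank-based method is robust here: Kendall's tau is based on the difference between the numbers of concordant and discordant pairs of observations, so it depends only on the rank order of the data. The toy chunk below (simulated values, not from the experiment; the variable names are placeholders) illustrates that a monotone but heavily skewed relationship barely affects tau, while it does pull down Pearson's r.

```{r kendall_tau_sketch}
# Toy illustration with simulated data (not from the experiment):
# a monotone but heavily skewed relationship.
set.seed(1)
x <- 1:20
y_skewed <- exp(x / 2 + rnorm(20, sd = 0.3))

cor(x, y_skewed, method = "pearson")  # noticeably lower: extreme values dominate
cor(x, y_skewed, method = "kendall")  # close to 1: only the rank order matters
```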
<hr>

# Analysis (correlational) section

<hr>

### Word length - Reaction time correlation

```{r cor_wl}
Cor_RT_WL <- cor.test(agr_notw$WL, agr_notw$RT, method = 'kendall')
Cor_RT_WL
```

### Plotting the word length correlation

```{r cor_wl_plot}
ggplot(agr_notw, aes(x = WL, y = RT)) +
  geom_point() +
  geom_smooth(method = lm)
```

### Word frequency - Reaction time correlation

```{r cor_freq}
# Calculating the mean reaction time for each word.
avg_word_rt <- agr_notw %>%
  group_by(Word) %>%
  summarise(avg_rt = mean(RT))

# Renaming the column to match the frequency database.
avg_word_rt <- rename(avg_word_rt, word = Word)

# Combining the database and the logfiles.
freq_rt_comp <- inner_join(avg_word_rt, wordfreq, by = "word")

cor_RT_FRQ <- cor.test(freq_rt_comp$count, freq_rt_comp$avg_rt, method = 'kendall')
cor_RT_FRQ
```

### Plotting the word frequency correlation

```{r cor_freq_plot}
ggplot(freq_rt_comp, aes(x = count, y = avg_rt)) +
  geom_point() +
  geom_smooth(method = lm)
```

### Ordinal number - Reaction time correlation

```{r cor_index}
cor_RT_NUM <- cor.test(agr_notw_ind$RT, agr_notw_ind$Index, method = 'kendall')
cor_RT_NUM
```

### Plotting the ordinal number correlation

```{r cor_index_plot}
ggplot(agr_notw_ind, aes(x = Index, y = RT)) +
  geom_point() +
  geom_smooth(method = lm)
```

Kendall's rank correlation tau was used to test the relationships between reaction time and word length, word frequency, and ordinal word number. Word length and reaction time were significantly related, tau = .22, p < .05, indicating a positive relationship between the two variables. Word frequency and reaction time were not significantly related, z = -0.95, tau = -.09, p > .05, indicating at most a weak negative relationship. Ordinal word number and reaction time were significantly related, z = -4.34, tau = -.31, p < .05, indicating a negative relationship: words occurring later in the text were read faster.

<hr>

# Hypothesis testing section

<hr>

This section investigates how semantic-contextual expectations predict reading times for specific, semantically unexpected words. Our hypotheses:

- H0: There is no difference in the mean reading times between the two conditions of the reading experiment.
- H1: There is a difference in the mean reading times between the two conditions of the reading experiment.

As per the assignment, we single out the reading time for the trigger (differing) word, as well as for the word that follows it, in two separate analyses. We then compare the mean reading times for those words across conditions using the appropriate type of t-test. As we use between-subject data and the variances are not equal, we use Welch's t-test moving forward (a quick check of the variances is sketched below).
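One way to back up the unequal-variance claim is a direct test; a minimal sketch follows. It is not evaluated in place because it uses `data_test1_c` and `data_test1_t`, which are only created in the next chunk. Since the raw reaction times are skewed, a more robust check such as `car::leveneTest()` could also be considered.

```{r variance_check_sketch, eval=FALSE}
# Sketch only (eval = FALSE): data_test1_c / data_test1_t are created in the next chunk.
# F test for equality of the two group variances (trigger-word reaction times).
var.test(data_test1_c$RT, data_test1_t$RT)
# A small p-value supports treating the variances as unequal and using Welch's t-test.
```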
### Filtering the data to only have the trigger word

```{r t_test_filter_trigger}
data_test1 <- data_nout %>%
  dplyr::filter(Index == 73)

data_test1_c <- data_test1 %>%
  dplyr::filter(Group == "control")
data_test1_t <- data_test1 %>%
  dplyr::filter(Group == "test")
```

### Checking normality of trigger word reaction time

```{r tw_normal}
round(pastecs::stat.desc(cbind(data_test1_c$RT), basic = FALSE, norm = TRUE), 2)
round(pastecs::stat.desc(cbind(data_test1_t$RT), basic = FALSE, norm = TRUE), 2)
```

### Transforming the data

```{r transform_trig}
rt_sqrtc <- data_test1_c %>%
  mutate(square_rtc = sqrt(RT))
rt_sqrtt <- data_test1_t %>%
  mutate(square_rtt = sqrt(RT))
```

### Checking the normality of the transformed trigger word data (square root chosen)

```{r transform_tw_normal}
round(pastecs::stat.desc(cbind(rt_sqrtc$square_rtc), basic = FALSE, norm = TRUE), 2)
round(pastecs::stat.desc(cbind(rt_sqrtt$square_rtt), basic = FALSE, norm = TRUE), 2)
```

### Filtering the data to only have the word after the trigger word

```{r filter_posttrig}
data_test2 <- data_nout %>%
  dplyr::filter(Index == 74)

data_test2_c <- data_test2 %>%
  dplyr::filter(Group == "control")
data_test2_t <- data_test2 %>%
  dplyr::filter(Group == "test")
```

### Checking normality of post-trigger word reaction time

```{r postw_normal}
round(pastecs::stat.desc(cbind(data_test2_c$RT), basic = FALSE, norm = TRUE), 2)
round(pastecs::stat.desc(cbind(data_test2_t$RT), basic = FALSE, norm = TRUE), 2)
```

### Transforming the post-trigger data

```{r transform_postw}
rt_sqrtc2 <- data_test2_c %>%
  mutate(square_rtc = sqrt(RT))
rt_sqrtt2 <- data_test2_t %>%
  mutate(square_rtt = sqrt(RT))
```

### Checking the normality of the transformed post-trigger word data (square root)

```{r transform_postw_normal}
round(pastecs::stat.desc(cbind(rt_sqrtc2$square_rtc), basic = FALSE, norm = TRUE), 2)
round(pastecs::stat.desc(cbind(rt_sqrtt2$square_rtt), basic = FALSE, norm = TRUE), 2)
```

## T-test for trigger word

```{r t_test_trigger}
# Welch's two-sample t-test comparing the (untransformed) trigger-word
# reaction times of the control and test groups.
t_test_trig <- t.test(rt_sqrtc$RT, rt_sqrtt$RT, var.equal = FALSE)
t_test_trig
```

### Plotting the t-test data (for trigger word - control group)

```{r t_test_trig_plot_control}
ggplot(rt_sqrtc, aes(sample = square_rtc)) +
  stat_qq() +
  stat_qq_line(color = "red") +
  labs(x = "Theoretical Quantiles", y = "Sample Quantiles") +
  ggtitle("Normal QQ-plot of Reaction Time: Control Group, Trigger Word") +
  theme_minimal()
```

### Plotting the t-test data (for trigger word - test group)

```{r t_test_trig_plot_test}
ggplot(rt_sqrtt, aes(sample = square_rtt)) +
  stat_qq() +
  stat_qq_line(color = "red") +
  labs(x = "Theoretical Quantiles", y = "Sample Quantiles") +
  ggtitle("Normal QQ-plot of Reaction Time: Test Group, Trigger Word") +
  theme_minimal()
```

## T-test for word after trigger word

```{r t_test_posttrig}
# Welch's two-sample t-test comparing the (untransformed) post-trigger-word
# reaction times of the control and test groups.
t_test_post <- t.test(rt_sqrtc2$RT, rt_sqrtt2$RT, var.equal = FALSE)
t_test_post
```

### Plotting the t-test data (for post-trigger word - control group)

```{r posttrig_t_plot_control}
ggplot(rt_sqrtc2, aes(sample = square_rtc)) +
  stat_qq() +
  stat_qq_line(color = "red") +
  labs(x = "Theoretical Quantiles", y = "Sample Quantiles") +
  ggtitle("Normal QQ-plot of Reaction Time: Control Group, Post-Trigger Word") +
  theme_minimal()
```

### Plotting the t-test data (for post-trigger word - test group)

```{r posttrig_t_plot_test}
ggplot(rt_sqrtt2, aes(sample = square_rtt)) +
  stat_qq() +
  stat_qq_line(color = "red") +
  labs(x = "Theoretical Quantiles", y = "Sample Quantiles") +
  ggtitle("Normal QQ-plot of Reaction Time: Test Group, Post-Trigger Word") +
  theme_minimal()
```
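For reference, the fractional degrees of freedom reported by these Welch tests come from the Welch-Satterthwaite approximation. The sketch below recomputes them by hand for the trigger-word comparison; it is purely illustrative (the helper `welch_df` is not part of the original analysis), since `t.test()` already does this internally.

```{r welch_df_sketch}
# Welch-Satterthwaite degrees of freedom, computed by hand for illustration.
welch_df <- function(x, y) {
  vx <- var(x) / length(x)
  vy <- var(y) / length(y)
  (vx + vy)^2 / (vx^2 / (length(x) - 1) + vy^2 / (length(y) - 1))
}
welch_df(rt_sqrtc$RT, rt_sqrtt$RT)  # should match the df reported by t_test_trig
```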
The Welch two-sample t-test was used to compare the group means. The results showed no significant difference between the two groups in reaction time for the trigger word, t(18.234) = -1.76, p = .0945, and no significant difference in reaction time for the word after the trigger word, t(11.125) = -1.57, p = .1451. In both cases the null hypothesis cannot be rejected: there is no significant difference in the mean reading times between the two groups for the trigger word or for the word that follows it.