<!-- Portfolio 2 -->
---
title: "Reading Experiment"
author: "Marie Thomsen"
date: '2022-10-24'
output: html_document
---
# Defining setup chunk and working directory
```{r setup, include=FALSE}
# Echo the code of every chunk in the knitted document
knitr::opts_chunk$set(echo = TRUE)
# Evaluate all later chunks from the project root rather than the .Rmd's folder.
# NOTE(review): absolute path is machine-specific (UCloud /work mount) — the
# document will not knit elsewhere without changing it. "portolio" appears to
# match the actual repository name; confirm before "fixing" the typo.
knitr::opts_knit$set(root.dir = '/work/CogSci_Methods01/portolio_assignment_02-marievthomsen')
```
# Loading packages
```{r}
library(tidyverse)   # dplyr/ggplot2/readr/purrr: wrangling, plotting, CSV I/O
library(gridExtra)   # arranging multiple grid-based plots
library(pastecs)     # stat.desc() descriptive/normality statistics
library(stringi)     # string utilities
library(WRS2)        # robust statistics, incl. yuen() trimmed-means test
```
# Preparing data
```{r}
# Loading the data: collect the logfile CSVs.
# NOTE(review): 'logfile_new.zip' is used here as a *directory* path — this only
# works if the archive has already been extracted to a folder of that name; confirm.
# Pattern is a regex, so anchor it: a bare "." matches any character, and
# un-anchored ".csv" would also match e.g. "old_csv_backup".
files <- list.files(path = 'logfile_new.zip', pattern = "\\.csv$", full.names = TRUE)
```
```{r}
# Anonymising the data: re-save each logfile under a random participant ID and
# delete the original (identifiable) file.
data_out <- list()
num_files <- length(files)
# One random ID per file; sample() on seq_len() permutes 1..num_files and,
# unlike seq(1, num_files, 1), is safe when there are zero files.
rand_ids <- sample(seq_len(num_files))
cnt_f <- 0
for (f in files) {
  cnt_f <- cnt_f + 1
  data_out[[f]] <- read_csv(file = f, col_names = TRUE)
  # New anonymous ID, e.g. "snew7" (paste0 replaces paste(c(...), collapse = ""))
  data_out[[f]]$ID <- paste0("snew", rand_ids[cnt_f])
  # Write the anonymised copy next to the originals
  out_name <- file.path(
    '/work/CogSci_Methods01/portolio_assignment_02-marievthomsen/logfile_new.zip',
    paste0("logfile_", data_out[[f]]$ID[1], ".csv")
  )
  write_csv(data_out[[f]], out_name, na = "NA")
  # Remove the original logfile so only the anonymised version remains
  file.remove(f)
}
```
```{r}
# Creating a dataframe: read every anonymised logfile into one tibble.
# BUG FIX: list.files(pattern = ) takes a regex, not a glob — the previous
# "*logfile_snew*" had an invalid leading "*" and effectively matched
# "logfile_sne"; a plain substring regex matches the intended files.
# NOTE(review): files were written into the logfile_new.zip folder above but are
# listed from getwd() here — confirm the working directory is what you expect.
files <- list.files(path = getwd(), pattern = "logfile_snew", full.names = TRUE)
data <- map_dfr(files, read_csv)
```
## Cleaning data
```{r}
# Rename the relevant columns in one vectorized assignment (by position, since
# the logger's original headers are not informative)
names(data)[c(1, 6, 7, 8)] <- c('column_1', 'reaction_time', 'condition', 'word_length')
# Strip the first comma and the first period from each stimulus word
data$Word <- sub(",", "", data$Word)
data$Word <- sub("\\.", "", data$Word)
# Recompute word length now that punctuation is gone
data$word_length <- nchar(data$Word)
# ID and condition are categorical variables
data$ID <- as.factor(data$ID)
data$condition <- as.factor(data$condition)
```
## Aggregating by test subject to detect irregularities
```{r}
# Every trial's reaction time against trial index, coloured per subject,
# to spot irregular participants at a glance
ggplot(data) +
  geom_point(aes(x = column_1, y = reaction_time, colour = ID))
```
```{r}
# Per-subject boxplots (outliers hidden, means shown as diamonds).
# coord_cartesian() zooms the view without dropping data from the statistics.
rt_box <- ggplot(data, aes(x = ID, y = reaction_time, fill = ID))
rt_box +
  geom_boxplot(outlier.shape = NA) +
  stat_summary(fun = mean, geom = "point", shape = 23) +
  coord_cartesian(ylim = c(0.1, 1.5))
```
# Correlation section
## Preparing Data
```{r}
# New data frame with the familiarisation items "socks" and "chopsticks" removed.
# (The previous `as.factor(data$Word)` call discarded its result — a no-op —
# and has been dropped.)
data_filter <- data %>%
  filter(Word != "socks") %>%
  filter(Word != "chopsticks")
# Candidate transformations of reaction time, plus the ordinal word number
# (column_1 is 0-indexed, so +1 gives the word's position in the text).
# The ordinal number is computed inside mutate() instead of via a temporary
# global variable.
data_filter <- data_filter %>%
  mutate(log_reaction_time = log(reaction_time),
         sqrt_reaction_time = sqrt(reaction_time),
         reaction_time_1 = (1/reaction_time),
         ordinal_number = column_1 + 1)
```
## Testing for normality in reaction time
```{r}
# Probability density histogram of reaction time with a fitted normal curve
rt <- data_filter$reaction_time
rt_mean <- mean(rt, na.rm = TRUE)
rt_sd <- sd(rt, na.rm = TRUE)
ggplot(data_filter, aes(x = reaction_time)) +
  geom_histogram(aes(y = ..density..), binwidth = 0.25) +
  stat_function(fun = dnorm,
                args = list(mean = rt_mean, sd = rt_sd),
                colour = "red", size = 1) +
  ggtitle("Probability Density of Reaction Time") +
  theme_classic() +
  xlim(range(rt))
# QQ-plot against theoretical normal quantiles
qqnorm(rt)
qqline(rt)
# Shapiro-Wilk test: a low p-value is evidence against normality
shapiro.test(rt)
```
## Transforming the data
```{r}
# Compare the candidate transformations on standardized skew, kurtosis, and the
# Shapiro-Wilk normality p-value (|*.2SE| > 1 indicates a problem)
data_transform <- data_filter %>%
  transmute(reaction_time,
            log_reaction_time = log(reaction_time),
            sqrt_reaction_time = sqrt(reaction_time),
            reaction_time_1 = (1/reaction_time))
round(stat.desc(data_transform, norm = TRUE), digits = 2)[c("skew.2SE", "kurt.2SE", "normtest.p"), ]
```
### Transformation using logarithm
```{r}
# Log-transformed reaction time: density histogram with fitted normal curve
log_rt <- data_filter$log_reaction_time
ggplot(data_filter, aes(x = log_reaction_time)) +
  geom_histogram(aes(y = ..density..),
                 binwidth = 0.25,
                 color = 'plum4', fill = 'lightblue2') +
  stat_function(fun = dnorm,
                args = list(mean = mean(log_rt, na.rm = TRUE),
                            sd = sd(log_rt, na.rm = TRUE)),
                colour = "red", size = 1)
# QQ-plot of the log-transformed values
ggplot(data_filter, aes(sample = log_reaction_time)) +
  stat_qq() +
  stat_qq_line() +
  labs(x = "Theoretical quantiles", y = "Sample quantiles")
```
### Transformation using squareroot
```{r}
# Square-root-transformed reaction time: density histogram with fitted normal
# curve, then QQ-plot.
# BUG FIX: the normal-curve parameters and the QQ-plot previously used the
# reciprocal transform (reaction_time_1); both now use sqrt_reaction_time so
# this section actually assesses the square-root transformation.
ggplot(data_filter, aes(x = sqrt_reaction_time)) +
  geom_histogram(aes(y = ..density..),
                 binwidth = 0.25,
                 color = 'plum4', fill = 'lightblue2') +
  stat_function(fun = dnorm,
                args = list(mean = mean(data_filter$sqrt_reaction_time, na.rm = TRUE),
                            sd = sd(data_filter$sqrt_reaction_time, na.rm = TRUE)),
                colour = "red", size = 1)
ggplot(data_filter, aes(sample = sqrt_reaction_time)) +
  stat_qq() +
  stat_qq_line() +
  labs(x = "Theoretical quantiles", y = "Sample quantiles")
```
### Transformation using reciprocity
```{r}
# Reciprocal-transformed reaction time (1/RT): density histogram with fitted
# normal curve, then QQ-plot.
# BUG FIX: the normal-curve parameters and the QQ-plot previously used the
# square-root transform (sqrt_reaction_time); both now use reaction_time_1 so
# this section actually assesses the reciprocal transformation.
ggplot(data_filter, aes(x = reaction_time_1)) +
  geom_histogram(aes(y = ..density..),
                 binwidth = 0.25,
                 color = 'plum4', fill = 'lightblue2') +
  stat_function(fun = dnorm,
                args = list(mean = mean(data_filter$reaction_time_1, na.rm = TRUE),
                            sd = sd(data_filter$reaction_time_1, na.rm = TRUE)),
                colour = "red", size = 1)
ggplot(data_filter, aes(sample = reaction_time_1)) +
  stat_qq() +
  stat_qq_line() +
  labs(x = "Theoretical quantiles", y = "Sample quantiles")
```
### Word length vs reaction time
```{r}
# Scatter plot of word length against reaction time, with a linear fit
# (points drawn first so the fitted line sits on top, as before)
ggplot(data_filter, aes(x = word_length, y = reaction_time)) +
  theme_minimal() +
  ggtitle('Word Length vs Reaction Time') +
  labs(x = 'Word Length', y = 'Reaction Time') +
  geom_point() +
  geom_smooth(method = lm, se = TRUE, colour = 'red')
```
#### Testing correlational assumption
```{r}
# Kendall's tau: rank-based correlation, appropriate here since reaction_time
# is non-normal (Shapiro-Wilk above) and word_length is discrete with many ties
cor.test(data_filter$word_length, data_filter$reaction_time, method = 'kendall')
```
### Word frequency vs reaction time
#### Preparing data
```{r}
# Loading the word-frequency table
wf <- read_csv("Word_frequency.csv")
# Strip punctuation from the words so they match data_filter$Word
# (first comma / first period only — same cleaning as applied to data$Word)
wf$Word <- sub(",", "", wf$Word)
wf$Word <- sub("\\.", "", wf$Word)
# Rename the second column of the word-frequency data frame
names(wf)[2] <- 'frequency'
# One frequency value per unique word
wf <- wf %>%
  select(Word, frequency) %>%
  distinct()
# Attach a frequency to every trial; explicit `by` documents the key and
# silences the implicit-join message (the join was already on Word)
data_filter <- data_filter %>%
  inner_join(wf, by = "Word")
# Mean reaction time per word, carrying along that word's (unique) frequency.
# first(frequency) replaces listing the bare column in summarise(), which
# relied on deprecated multi-row-summarise behaviour; result is identical.
mean_rt <- data_filter %>%
  group_by(Word) %>%
  summarise(mean_rt = mean(reaction_time), frequency = first(frequency)) %>%
  distinct()
```
#### Plotting mean reaction time in scatterplot
```{r}
# Scatter plot of word frequency against mean reaction time, with a linear fit
# (points drawn first so the fitted line sits on top, as before)
ggplot(mean_rt, aes(x = frequency, y = mean_rt)) +
  theme_minimal() +
  ggtitle('Word Frequency vs Reaction Time') +
  labs(x = 'Word Frequency', y = 'Reaction Time') +
  geom_point() +
  geom_smooth(method = lm, se = TRUE, colour = 'red')
```
#### Testing correlational assumption
```{r}
# Spearman's rho: rank-based correlation, robust to non-normal reaction times.
# as.numeric() ensures frequency is numeric — presumably it can be read as
# character from the CSV; verify against Word_frequency.csv
cor.test(data_filter$reaction_time, as.numeric(data_filter$frequency), method = 'spearman')
```
### Ordinal word number vs reaction time
```{r}
# Scatter plot of ordinal word number against reaction time, with a linear fit.
# BUG FIX: `data_tf` was never defined anywhere in this document — the intended
# data frame is data_filter. The y-axis label also said "Logged Reaction Time"
# while the raw reaction_time was plotted; label now matches the data.
ggplot(data_filter, aes(x = ordinal_number, y = reaction_time)) +
  geom_point() +
  geom_smooth(method = lm, se = TRUE, colour = 'red') +
  labs(x = 'Ordinal Word Number', y = 'Reaction Time') +
  ggtitle('Ordinal Word Number vs Reaction Time') +
  theme_minimal()
```
#### Testing correlational assumption
```{r}
# Spearman's rho between a word's position in the text and reaction time
# (rank-based, so robust to the non-normal reaction-time distribution)
cor.test(data_filter$ordinal_number, data_filter$reaction_time, method = 'spearman')
```
# Hypothesis Testing
Null-hypothesis H0: there is no difference in the mean response time between the experimental and control condition
Alternative hypothesis HA: there is a difference in the mean response time between the experimental and control condition
## Preparing the data
```{r}
# Trials of interest: the critical word (index 128) and the word immediately
# following it (index 129), split out per condition. %in% replaces the
# equivalent `column_1 == 128 | column_1 == 129` disjunction.
data_control <- filter(data, column_1 %in% c(128, 129), condition == "Control")
data_experimental <- filter(data, column_1 %in% c(128, 129), condition == "Experimental")
```
```{r}
WRS2::yuen(reaction_time ~ condition, data = data_control) # computed on sample.