# CogCom exam
Plan:

- Predictors: characterization, Non-dom, A-dom, B-dom, age of participant, gender of participant
- Table showing the number of participants in each condition and the gender distribution
-
```{r setup, include=FALSE}
# Global chunk defaults: echo the code, hide warnings and messages.
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE
)
```
```{r, include=FALSE}
# Show the working directory and its files (handy when the CSV is not found),
# then read the raw survey export from that directory.
getwd()
list.files()
gender_data <- read.csv(file = "data_cogcom45cases.csv")
```
```{r, loading packages, include=FALSE}
# Install pacman only when it is missing: an unconditional install.packages()
# re-downloads the package on every knit and fails when no CRAN mirror is set.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
# gmodels (CrossTable), caret (confusionMatrix) and nnet (multinom) are called
# further down in this document, so they are loaded together with the rest.
pacman::p_load(tidyverse, lme4, lmerTest, gmodels, caret, nnet)
```
```{r, cleaning up the data, include=FALSE}
# Drop the first row of the raw export.
# NOTE(review): presumably a non-data row (e.g. variable labels) - confirm.
gender_data <- gender_data[-1, ]
# The former column filter `gender_data[, - (2:3) & (22:29)]` was removed:
# `&` coerces both index vectors to logical, which yields an all-TRUE mask
# that keeps every column. The column drops themselves are carried out step by
# step in the following chunk.
```
```{r}
# Remove the first row and two groups of columns in a single step.
# Dropping columns 2:3 first and then columns 22:29 of the result is the same
# as dropping columns 2:3 and 24:31 of the unmodified data frame.
# NOTE(review): the previous chunk already dropped one leading row, so this
# removes a second one - confirm that is intended.
gender_data <- gender_data[-1, -c(2:3, 24:31)]
```
```{r}
# Remove more columns: columns 2:4 and then columns 4:12 of the result, i.e.
# columns 2:4 and 7:15 of the data frame as it stands here.
gender_data <- gender_data[, -c(2:4, 7:15)]

# Replace the raw codes in TE05 with readable labels. A named vector makes
# str_replace_all() apply the replacements one after another.
te05_labels <- c("TE02" = "Non-dom", "TE03" = "A-dom", "TE04" = "B-dom")
gender_data$TE05 <- stringr::str_replace_all(gender_data$TE05, te05_labels)

# Shorten the long "no gender assigned" answers for person A and person B.
gender_data$TE09 <- stringr::str_replace_all(
  gender_data$TE09,
  "I did not assign a gender to person A",
  "No gender"
)
gender_data$TE10 <- stringr::str_replace_all(
  gender_data$TE10,
  "I did not assign a gender to person B",
  "No gender"
)
```
```{r}
# Frequency of each value in the recoded survey columns.
table(gender_data[["TE05"]])
table(gender_data[["DC03"]])
table(gender_data[["TE09"]])
table(gender_data[["TE10"]])
```
```{r}
# Convert CASE to numeric, then to a factor, then back to numeric.
# The last step returns the factor's level codes (1, 2, ... in increasing
# order of the numeric value), not the original CASE numbers.
gender_data$CASE <- as.numeric(as.factor(as.numeric(gender_data$CASE)))
```
```r
# Scratch code kept for reference; this block is displayed but not evaluated.
knitr::opts_knit$set(root.dir = '/Users/lina/Downloads')
gender_data%>%
group_by(Dominance)%>%
table(Gender_A)
table(gender_data$Dominance)
table(gender_data$Gender_A)
table(gender_data$Gender_B)
```

# start of the analysis
## descriptive statistics for categorical data
1. overall count
2. frequencies
3. proportions
4. visualization
```{r}
# Overall counts per dominance version and per assigned gender.
# NOTE(review): Dominance, Gender_A and Gender_B are not created in the
# preprocessing shown above (the columns there are TE05, TE09, TE10) -
# confirm that a rename step exists.
table(gender_data[["Dominance"]])
table(gender_data[["Gender_A"]])
table(gender_data[["Gender_B"]])
```
```{r}
# Three-way contingency tables (participant gender x dominance version x
# assigned gender), printed in flat form with ftable().
tableA <- table(
  gender_data[["Gender"]],
  gender_data[["Dominance"]],
  gender_data[["Gender_A"]]
)
ftable(tableA)
tableB <- table(
  gender_data[["Gender"]],
  gender_data[["Dominance"]],
  gender_data[["Gender_B"]]
)
ftable(tableB)
```
```{r}
# Proportions. prop.table() is called without a margin, so every cell is a
# share of the table's grand total.

# Assigned gender by dominance version
propA <- table(gender_data[["Dominance"]], gender_data[["Gender_A"]])
propB <- table(gender_data[["Dominance"]], gender_data[["Gender_B"]])
prop.table(propA)
prop.table(propB)

# Assigned gender by participant gender
prop_genderA <- table(gender_data[["Gender"]], gender_data[["Gender_A"]])
prop_genderB <- table(gender_data[["Gender"]], gender_data[["Gender_B"]])
prop.table(prop_genderA)
prop.table(prop_genderB)
```
```{r}
# Cross tables of dominance version against the assigned gender.
# CrossTable() is called through its namespace so the chunk does not depend on
# gmodels having been attached earlier in the document.
# NOTE(review): gmodels is assumed to be the intended provider of
# CrossTable() - confirm.
gmodels::CrossTable(gender_data$Dominance, gender_data$Gender_A)
gmodels::CrossTable(gender_data$Dominance, gender_data$Gender_B)
```
### Visualization
```{r}
# Turn `x` into a factor whose levels are ordered from the most to the least
# frequent value, so bar charts show the largest category first.
reorder_size <- function(x) {
  counts <- table(x)
  by_frequency <- sort(counts, decreasing = TRUE)
  factor(x, levels = names(by_frequency))
}
# Bar charts of the gender assigned to person A and person B, faceted by
# participant gender (rows) and dominance version (columns).
# theme_bw() is a complete theme and replaces earlier theme() settings, so it
# is applied first and the x-axis label rotation is added afterwards; in the
# previous order the rotation was silently discarded.
# NOTE(review): bar heights divide by sum(..count..); this looks like the
# share of all observations rather than of each facet panel - confirm.
plot_A <- ggplot(gender_data, aes(x = reorder_size(Gender_A))) +
  geom_bar(
    aes(y = (..count..) / sum(..count..)),
    width = 0.5,
    fill = "steelblue3"
  ) +
  xlab("Assigned gender Person A ") +
  scale_y_continuous(labels = scales::percent, name = "Proportion") +
  facet_grid(Gender ~ Dominance) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

plot_B <- ggplot(gender_data, aes(x = reorder_size(Gender_B))) +
  geom_bar(
    aes(y = (..count..) / sum(..count..)),
    width = 0.5,
    fill = "steelblue3"
  ) +
  xlab("Assigned gender Person B ") +
  scale_y_continuous(labels = scales::percent, name = "Proportion") +
  facet_grid(Gender ~ Dominance) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

plot_A
plot_B
```
### chi-square and tests with only assigned gender
### making two data frames and putting them back together
```{r}
# Build one long data set with a row per participant and described person.
# For person "A" or "B" the helper takes the first four columns of
# gender_data and appends that person's description and assigned gender, the
# shared explanation, and a Person marker.
person_rows <- function(person) {
  rows <- tibble(gender_data)[1:4]
  rows$Description <- gender_data[[paste0("Description_", person)]]
  rows$Gender_P <- gender_data[[paste0("Gender_", person)]]
  rows$Explanation <- gender_data$Explanation
  rows$Person <- person
  rows
}

dfA <- person_rows("A")
dfB <- person_rows("B")

# Stack both halves and treat the assigned gender as categorical.
dfAB <- bind_rows(dfA, dfB)
dfAB$Gender_P <- as.factor(dfAB$Gender_P)
```
### logistic regression
```{r}
# Keep only the rows where a gender was assigned (rows with a missing
# Gender_P are dropped as well).
has_gender <- !is.na(dfAB$Gender_P) & dfAB$Gender_P != "No gender"
dfAB2 <- dfAB[has_gender, ]

# Make "Female" the reference level of the outcome.
# as.factor() returns an existing factor unchanged, so the now-empty
# "No gender" level stays attached to Gender_P.
dfAB2$Gender_P <- relevel(as.factor(dfAB2$Gender_P), ref = "Female")
```
```{r}
# Logistic regression of the assigned gender on participant gender, dominance
# version, age and described person, fitted on the rows with an assigned
# gender.
# NOTE(review): Age is used as read from the CSV - confirm it is numeric.
model <- glm(
  formula = Gender_P ~ Gender + Dominance + Age + Person,
  family = "binomial",
  data = dfAB2
)
```
```{r}
# Random split of dfAB2 into training and test rows (each row goes to the
# training set with probability 0.8).
# The seed makes the split, and every result derived from it, reproducible
# between knits; without it each render evaluates a different test set.
set.seed(1234)
# The mask keeps its original name `sample`; function calls such as sample()
# still resolve to the base function.
sample <- sample(
  c(TRUE, FALSE),
  nrow(dfAB2),
  replace = TRUE,
  prob = c(0.8, 0.2)
)
train <- dfAB2[sample, ]
test <- dfAB2[!sample, ]
```
```{r}
# Hold-out evaluation of the logistic regression.
# `model` was fitted on all of dfAB2, which contains the test rows. The same
# specification is refitted on the training rows only, so the predictions
# below are made for data the model has not seen.
holdout_model <- update(model, data = train)

# Predicted probability of the non-reference outcome ("Female" is the
# reference level of Gender_P).
predictions <- predict(holdout_model, newdata = test, type = "response")
predictions

# Fix the class labels explicitly so predictions and observations always share
# the same levels, even when one class is never predicted or an unused
# "No gender" level is still attached to the factor.
# NOTE(review): any Gender_P value other than these two becomes NA here.
class_levels <- c("Female", "Male")
actual <- factor(test$Gender_P, levels = class_levels)
actual

# Turn probabilities into class labels with a 0.5 cut-off.
df_pred <- tibble(predictions, actual)
df_pred <- df_pred %>%
  mutate(predictions = if_else(predictions < 0.5, "Female", "Male"))
df_pred$predictions <- factor(df_pred$predictions, levels = class_levels)

# Confusion matrix of predicted against observed labels.
cm <- caret::confusionMatrix(df_pred$predictions, df_pred$actual)
cm
```
```{r}
# Default diagnostic plots for the fitted logistic regression.
plot(x = model)
```
### including not assigned gender
```{r}
# Random split of the full long data set (rows without an assigned gender
# included) into training and test rows; each row goes to the training set
# with probability 0.8.
# The seed keeps the split identical between knits.
set.seed(5678)
sample2 <- sample(
  c(TRUE, FALSE),
  nrow(dfAB),
  replace = TRUE,
  prob = c(0.8, 0.2)
)
train2 <- dfAB[sample2, ]
test2 <- dfAB[!sample2, ]
```
```{r}
# Make "No gender" the reference category of the outcome.
# The relevel is applied to dfAB, the data the multinomial model in the next
# chunk is fitted on. It used to be applied to `train`, the split of the
# binary subset from which the "No gender" answers had been removed, so it had
# no effect on that model.
dfAB$Gender_P <- relevel(dfAB$Gender_P, ref = "No gender")
```
```{r}
# Multinomial logistic regression of the assigned gender on dominance version,
# participant gender, age and described person, fitted on all rows of dfAB.
# The `(1|CASE)` term was removed: nnet::multinom() has no lme4-style random
# effects, so `1 | CASE` is evaluated as a logical OR and adds a constant
# column instead of a per-participant intercept.
# NOTE(review): dfAB stacks the person-A and person-B rows of the same
# participants, so the rows are not independent and this model ignores that -
# decide whether a mixed model is needed.
multinom_model <- nnet::multinom(
  Gender_P ~ Dominance + Gender + Age + Person,
  data = dfAB
)
# Checking the model
summary(multinom_model)
# Exponentiated coefficients
exp(coef(multinom_model))
```
```{r}
# Hold-out evaluation of the multinomial model.
# multinom_model was fitted on all of dfAB, which contains the test2 rows. The
# same specification is refitted on train2 only, so the predicted classes
# below are for rows the model has not seen.
holdout_multinom <- update(multinom_model, data = train2)

predictions2 <- predict(holdout_multinom, newdata = test2, type = "class")
predictions2
actual2 <- as.factor(test2$Gender_P)
actual2

# Comparison of actual and predicted classes. The predicted factor is given
# the same levels, in the same order, as the observed one so the confusion
# matrix lines up class by class.
df_pred2 <- tibble(predictions2, actual2)
df_pred2$predictions2 <- factor(
  df_pred2$predictions2,
  levels = levels(df_pred2$actual2)
)
cm2 <- caret::confusionMatrix(df_pred2$predictions2, df_pred2$actual2)
cm2
```