# A3 machine learning
## Part 1 - Simulating data
### Packages
```{r}
pacman::p_load(msm, tidyverse, brms, grid, gridExtra, readxl, metafor, dplyr, magrittr, reshape, tidymodels)
```
### Load data
```{r}
data <- read_csv("Ass3_empiricalData1.csv")
```
-------
### Simulation for the informed dataset
(First a chunk for df 1 (the 6 effects from the meta-analysis plus 4 noise variables), then one for df 2 (a baseline with noise only).)
#Simulating data (df 1 = 6 variables informed by the meta-analysis + 4 pure-noise variables)
```{r}
# Define population size
n <- 100
trials <- 10
# Define effect sizes
#see table 2 from the article for the numbers
InformedEffectMean <- c(0.25, -0.55, 0.74, -1.26, 0.05, 1.89, 0, 0, 0, 0)
# Define individual variability from population and across trials and measurement error
IndividualSD <- 1
TrialSD <- 0.5
Error <- 0.2
# Conditions (note: these two draws are not used further below; the group
# difference enters through the +/- TrueEffect/2 term in the measurement step)
Schizophrenia <- rnorm(1, rnorm(1, 0.21, 0.5), 0.2)
Control <- rnorm(1, rnorm(1, -0.21, 0.5), 0.2)
# For each participant, identify the true effect size of each variable
for(i in seq(10)) {
temp_informed <- tibble(
ID = seq(n),
TrueEffect = rnorm(n, InformedEffectMean[i], IndividualSD),
Variable = paste0("v", i))
if(i == 1) {
d_informed_true <- temp_informed
} else {
d_informed_true <- rbind(d_informed_true, temp_informed)
}
}
# Create a tibble with one row per ID x Trial x Group combination
d_trial <- tibble(expand_grid(ID = seq(n), Trial = seq(trials), Group = c("Schizophrenia", "Control")))
d_informed <- merge(d_informed_true, d_trial)
for(i in seq(nrow(d_informed))){
d_informed$measurement[i] <- ifelse(d_informed$Group[i]=="Schizophrenia",
rnorm(1, rnorm(1, d_informed$TrueEffect[i]/2, TrialSD), Error),
rnorm(1, rnorm(1, -d_informed$TrueEffect[i]/2, TrialSD), Error))
}
d_informed_wide <- d_informed %>%
mutate(TrueEffect= NULL) %>%
pivot_wider(names_from = Variable,
values_from = measurement)
```
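A quick sanity check (an optional sketch, not part of the original pipeline): the Schizophrenia-minus-Control difference per variable in the simulated measurements should roughly track `InformedEffectMean` for v1-v6 and hover around zero for v7-v10.
```{r}
# Optional sanity check: mean measurement per variable and group
d_informed %>%
  group_by(Variable, Group) %>%
  summarise(mean_measurement = mean(measurement), .groups = "drop") %>%
  pivot_wider(names_from = Group, values_from = mean_measurement) %>%
  mutate(Difference = Schizophrenia - Control)
```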
### Simulation for the skeptic dataset
(df 2 = baseline dataset including only 10 noise variables)
```{r}
#... and now we do the same for the second data set (baseline dataset including only 10 noise variables)
skeptic_EffectMean <- rep(0,10)
for(i in seq(10)) {
temp_skeptic <- tibble(
ID = seq(n),
TrueEffect = rnorm(n, skeptic_EffectMean[i], IndividualSD),
Variable = paste0("v", i))
if(i == 1) {
d_skeptic_true <- temp_skeptic
} else {
d_skeptic_true <- rbind(d_skeptic_true, temp_skeptic)
}
}
d_skeptic <- merge(d_skeptic_true, d_trial)
for(i in seq(nrow(d_skeptic))){
d_skeptic$measurement[i] <- ifelse(d_skeptic$Group[i]=="Schizophrenia",
rnorm(1, rnorm(1, d_skeptic$TrueEffect[i]/2, TrialSD), Error),
rnorm(1, rnorm(1, -d_skeptic$TrueEffect[i]/2, TrialSD), Error))
}
d_skeptic_wide <- d_skeptic %>%
mutate(TrueEffect= NULL) %>%
pivot_wider(names_from = Variable,
values_from = measurement)
```
------------------------------------------------------------------------------------------------------
## Part 2 - ML pipeline on simulated data
### Starting with the informed dataset
#first: convert ID and Trial to factors so they won't be scaled and centered in the preprocessing step
```{r}
d_informed_wide <- d_informed_wide %>%
mutate(ID = as.factor(ID)) %>%
mutate(Trial = as.factor(Trial))
d_skeptic_wide <- d_skeptic_wide %>%
mutate(ID = as.factor(ID)) %>%
mutate(Trial = as.factor(Trial))
```
i) create a data budget (e.g. balanced training and test sets);
```{r}
TestID <- sample(seq(n), 20) # hold out 20 of the 100 IDs (an 80/20 participant-level split)
#IDs not in TestID go to the training set; the rest go to the test set
train_informed <- d_informed_wide %>% subset(!(ID %in% TestID))
test_informed <- d_informed_wide %>% subset((ID %in% TestID))
train_skeptic <- d_skeptic_wide %>% subset(!(ID %in% TestID))
test_skeptic<- d_skeptic_wide %>% subset((ID %in% TestID))
```
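A small optional check (added for illustration) that the participant-level split keeps both groups in both sets and yields the expected row counts (80 vs. 20 IDs x 10 trials per group).
```{r}
# Optional check of the data budget
train_informed %>% count(Group)
test_informed %>% count(Group)
```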
ii) pre-process the data (e.g. scaling the features);
```{r}
#Informed
scaled_informed <- train_informed %>%
recipe(Group ~ .) %>% #define the outcome
step_scale(all_numeric()) %>% #scales all the numeric values
step_center(all_numeric()) %>% #centers all the numeric values
prep(training = train_informed, retain = TRUE)
#creating new scaled dataframes
train_informed_scaled <- juice(scaled_informed)
test_informed_scaled <- bake(scaled_informed, new_data = test_informed)
```
```{r}
#Skeptic
scaled_skeptic <- train_skeptic %>%
recipe(Group ~ .) %>% #define the outcome
step_scale(all_numeric()) %>% #scales all the numeric values
step_center(all_numeric()) %>% #centers all the numeric values
prep(training = train_skeptic, retain = TRUE)
#creating new scaled dataframes
train_skeptic_scaled <- juice(scaled_skeptic)
test_skeptic_scaled <- bake(scaled_skeptic, new_data = test_skeptic)
```
iii) fit and assess a classification algorithm on the training data (e.g. Bayesian multilevel logistic regression);
```{r}
#create formula for the informed dataset
f1 <- bf(Group ~ 1 + v1 + v2 + v3 + v4 + v5 + v6 + v7 + v8 + v9 + v10)
#create formula for the skeptic dataset
s_f1 <- bf(Group ~ 1 + v1 + v2 + v3 + v4 + v5 + v6 + v7 + v8 + v9 + v10)
```
```{r}
#get priors for the informed model
get_prior(f1,
train_informed_scaled,
family = bernoulli)
#get priors for the skeptic model
get_prior(s_f1,
train_skeptic_scaled,
family = bernoulli)
```
#setting priors
```{r}
#informed
f1_prior <- c(
prior(normal(0, 1), class = Intercept),
prior(normal(0, 0.3), class = b)
)
# Skeptic
s_f1_prior <- c(
prior(normal(0, 1), class = Intercept),
prior(normal(0, 0.3), class = b)
)
```
#fit the models
```{r}
fitted_f1_prior <- brm(
f1,
train_informed_scaled,
family = bernoulli,
prior = f1_prior,
sample_prior = T,
iter = 4000,
warmup = 2000,
cores = 4,
backend ="cmdstanr",
threads = threading(2),
refresh=0,
chains = 4,
control = list(
adapt_delta = 0.999,
max_treedepth = 20))
#skeptic
fitted_s_f1_prior <-
brm(
s_f1,
train_skeptic_scaled,
family = bernoulli,
prior = s_f1_prior,
sample_prior = T,
iter = 4000,
warmup = 2000,
cores = 4,
backend ="cmdstanr",
threads = threading(2),
refresh=0,
chains = 4,
control = list(
adapt_delta = 0.999,
max_treedepth = 20))
```
#posterior predictive checks
```{r}
#informed
pp_check(fitted_f1_prior, ndraws = 100)
#skeptic
pp_check(fitted_s_f1_prior, ndraws = 100)
```
#Parameter recovery
```{r}
#informed
print(fitted_f1_prior)
#skeptic
print(fitted_s_f1_prior)
```
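To make the parameter-recovery check concrete, the posterior slopes can be set side by side with the effect sizes used in the simulation (an illustrative sketch; because the predictors were standardised and each true effect was split as +/- TrueEffect/2 across groups, only the sign and relative ordering of the estimates are expected to match).
```{r}
# Sketch: compare estimated slopes with the simulated effect means
recovery_informed <- tibble(
  Variable = paste0("v", 1:10),
  SimulatedEffect = InformedEffectMean,
  EstimatedSlope = fixef(fitted_f1_prior)[paste0("v", 1:10), "Estimate"])
recovery_informed
```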
#Prior-posterior update checks
```{r}
#informed
variables(fitted_f1_prior)
Posterior_f1 <- as_draws_df(fitted_f1_prior)
ggplot(Posterior_f1) +
geom_density(aes(prior_Intercept), fill="chartreuse2", color="black",alpha=0.6) +
geom_density(aes(b_Intercept), fill="deeppink", color="black",alpha=0.6) +
xlab('Intercept') +
theme_classic()+
ggtitle("intercept for study effect size")+
theme(plot.title = element_text(size = 10, face = "bold"))
#skeptic
variables(fitted_s_f1_prior)
Posterior_s_f1 <- as_draws_df(fitted_s_f1_prior)
ggplot(Posterior_s_f1) +
geom_density(aes(prior_Intercept), fill="chartreuse2", color="black",alpha=0.6) +
geom_density(aes(b_Intercept), fill="deeppink", color="black",alpha=0.6) +
xlab('Intercept') +
theme_classic()+
ggtitle("intercept for study effect size")+
theme(plot.title = element_text(size = 10, face = "bold"))
#green = prior
#pink = posterior
```
iv) assess performance on the test set;
```{r}
PerformanceProb <- tibble(expand_grid(
Sample = seq(4000),
Model =c("Our_model"),
Setup= c("informed", "skeptic"),
Type= c("training", "test"))
)
# informed
test_informed_scaled$PredictionsPerc0 <- predict(fitted_f1_prior, newdata = test_informed_scaled, allow_new_levels = T)[,1]
test_informed_scaled$Predictions0[test_informed_scaled$PredictionsPerc0 > 0.5] <- "Schizophrenia"
test_informed_scaled$Predictions0[test_informed_scaled$PredictionsPerc0 <= 0.5] <- "Control"
train_informed_scaled$PredictionsPerc0 <- predict(fitted_f1_prior)[,1]
train_informed_scaled$Predictions0[train_informed_scaled$PredictionsPerc0 > 0.5] <- "Schizophrenia"
train_informed_scaled$Predictions0[train_informed_scaled$PredictionsPerc0 <= 0.5] <- "Control"
# skeptic
test_skeptic_scaled$PredictionsPerc1 <- predict(fitted_s_f1_prior, newdata = test_skeptic_scaled, allow_new_levels = T)[,1]
test_skeptic_scaled$Predictions1[test_skeptic_scaled$PredictionsPerc1 > 0.5] <- "Schizophrenia"
test_skeptic_scaled$Predictions1[test_skeptic_scaled$PredictionsPerc1 <= 0.5] <- "Control"
train_skeptic_scaled$PredictionsPerc1 <- predict(fitted_s_f1_prior)[,1]
train_skeptic_scaled$Predictions1[train_skeptic_scaled$PredictionsPerc1 > 0.5] <- "Schizophrenia"
train_skeptic_scaled$Predictions1[train_skeptic_scaled$PredictionsPerc1 <= 0.5] <- "Control"
# informed
train0 <- inv_logit_scaled(posterior_linpred(fitted_f1_prior,
summary = F))
test0 <- inv_logit_scaled(posterior_linpred(fitted_f1_prior,
summary = F,
newdata = test_informed_scaled,
allow_new_levels = T ))
# skeptic
train1 <- inv_logit_scaled(posterior_linpred(fitted_s_f1_prior,
summary = F))
test1 <- inv_logit_scaled(posterior_linpred(fitted_s_f1_prior,
summary = F,
newdata = test_skeptic_scaled,
allow_new_levels = T ))
# informed
test_informed_scaled <- test_informed_scaled %>%
mutate(Group = as.factor(Group),
Predictions0 = as.factor((Predictions0)))
train_informed_scaled <- train_informed_scaled %>%
mutate(Group = as.factor(Group),
Predictions0 = as.factor((Predictions0)))
# skeptic
test_skeptic_scaled <- test_skeptic_scaled %>%
mutate(Group = as.factor(Group),
Predictions1 = as.factor(Predictions1))
train_skeptic_scaled <- train_skeptic_scaled %>%
mutate(Group = as.factor(Group),
Predictions1 = as.factor(Predictions1))
for (i in seq(4000)){
train_informed_scaled$Predictions0 <- as.factor(ifelse(train0[i,] > 0.5, "Schizophrenia", "Control"))
test_informed_scaled$Predictions0 <- as.factor(ifelse(test0[i,] > 0.5, "Schizophrenia", "Control"))
PerformanceProb$Accuracy[PerformanceProb$Sample==i & PerformanceProb$Model == "Our_model" & PerformanceProb$Setup == "informed" & PerformanceProb$Type =="training"] <-
accuracy(train_informed_scaled, truth = Group, estimate = Predictions0)[, ".estimate"]
PerformanceProb$Accuracy[PerformanceProb$Sample==i & PerformanceProb$Model == "Our_model" & PerformanceProb$Setup == "informed" & PerformanceProb$Type =="test"] <-
accuracy(test_informed_scaled, truth = Group, estimate = Predictions0)[, ".estimate"]
train_skeptic_scaled$Predictions1 <- as.factor(ifelse(train1[i,] > 0.5, "Schizophrenia", "Control"))
test_skeptic_scaled$Predictions1 <- as.factor(ifelse(test1[i,] > 0.5, "Schizophrenia", "Control"))
PerformanceProb$Accuracy[PerformanceProb$Sample == i & PerformanceProb$Model == "Our_model" & PerformanceProb$Setup == "skeptic" & PerformanceProb$Type =="training"] <-
accuracy(train_skeptic_scaled, truth = Group, estimate = Predictions1)[, ".estimate"]
PerformanceProb$Accuracy[PerformanceProb$Sample == i & PerformanceProb$Model == "Our_model" & PerformanceProb$Setup == "skeptic" & PerformanceProb$Type =="test"] <-
accuracy(test_skeptic_scaled, truth = Group, estimate = Predictions1)[, ".estimate"]
}
```
#### Assessing average performance on the training and test sets in the informed simulation
```{r}
conf_mat(
train_informed_scaled,
truth = Group,
estimate = Predictions0,
dnn = c("Prediction", "truth"))
metrics(
train_informed_scaled,
truth = Group,
estimate = Predictions0)
```
```{r}
conf_mat(
test_informed_scaled,
truth = Group,
estimate = Predictions0,
dnn = c("Prediction", "truth"))
metrics(
test_informed_scaled,
truth = Group,
estimate = Predictions0)
```
#### Assessing average performance on the training and test sets in the skeptic simulation
```{r}
conf_mat(
train_skeptic_scaled,
truth = Group,
estimate = Predictions1,
dnn = c("Prediction", "truth"))
metrics(
train_skeptic_scaled,
truth = Group,
estimate = Predictions1)
```
```{r}
conf_mat(
test_skeptic_scaled,
truth = Group,
estimate = Predictions1,
dnn = c("Prediction", "truth"))
metrics(
test_skeptic_scaled,
truth = Group,
estimate = Predictions1)
```
# Plotting the accuracy
```{r}
ggplot(PerformanceProb) +
geom_point(aes(x = Setup, y = as.numeric(Accuracy), colour = Type)) +
geom_hline(yintercept = 0.5, colour = "purple", linetype = "dashed") +
theme_minimal() +
ylab("Accuracy") +
xlab("Setup") +
ggtitle("Accuracy for the informed and skeptic setups")
```
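The per-draw accuracies can also be summarised numerically (an optional sketch), which makes it easier to compare the informed and skeptic setups and to spot overfitting as a training-test gap.
```{r}
# Optional summary: mean accuracy and 95% interval across posterior draws
PerformanceProb %>%
  mutate(Accuracy = as.numeric(Accuracy)) %>%
  group_by(Setup, Type) %>%
  summarise(MeanAccuracy = mean(Accuracy),
            Lower = quantile(Accuracy, 0.025),
            Upper = quantile(Accuracy, 0.975),
            .groups = "drop")
```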
v) discuss whether performance is as expected and feature importance is as expected.
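As a rough handle on feature importance in the simulated setups (an illustrative sketch, not part of the original pipeline), the posterior slopes of the two models can be plotted with their credible intervals: in the informed setup v1-v6 should stand out from zero while v7-v10 should not, and in the skeptic setup all slopes should sit near zero.
```{r}
# Sketch: posterior slopes as a crude feature-importance measure
bind_rows(
  as_tibble(fixef(fitted_f1_prior), rownames = "Parameter") %>% mutate(Setup = "informed"),
  as_tibble(fixef(fitted_s_f1_prior), rownames = "Parameter") %>% mutate(Setup = "skeptic")) %>%
  filter(Parameter != "Intercept") %>%
  ggplot(aes(x = Parameter, y = Estimate, ymin = Q2.5, ymax = Q97.5, colour = Setup)) +
  geom_pointrange(position = position_dodge(width = 0.5)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  coord_flip() +
  theme_minimal() +
  ggtitle("Posterior slopes per predictor")
```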
## Part 3 - Applying the ML pipeline to empirical data
Download the empirical dataset from brightspace and apply your ML pipeline to the new data, adjusting where needed. Warning: in the simulated dataset we only had 10 features, now you have many more! Such is the life of the ML practitioner. Consider the impact a higher number of features will have on your ML inference, and decide whether you need to cut down the number of features before running the pipeline (or alternatively expand the pipeline to add feature selection).
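One way to gauge the problem before committing to PCA (a sketch under assumptions, using the empirical data loaded at the top of the document and assuming a recent recipes version with `all_numeric_predictors()`; the pipeline below uses PCA instead, and the 0.9 correlation threshold is an arbitrary illustration): count the numeric candidate features and see how many survive a simple filter that drops near-zero-variance and highly correlated predictors.
```{r}
# Optional sketch: how many numeric features are there, and how many remain
# after a simple filter-based reduction? (Alternative to the PCA used below.)
data %>% select(where(is.numeric)) %>% ncol()
filter_rec <- recipe(Diagnosis ~ ., data = data) %>%
  step_nzv(all_numeric_predictors()) %>%   # drop near-zero-variance features
  step_corr(all_numeric_predictors(), threshold = 0.9) %>%   # drop one of each highly correlated pair
  prep()
ncol(bake(filter_rec, new_data = NULL))
```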
### Load in empirical data and clean it
```{r}
data <- read_csv("Ass3_empiricalData1.csv")
data <- data %>%
select(-Language, -NewID, -Corpus) %>%
mutate(Diagnosis = as.factor(Diagnosis),
PatID = as.factor(PatID))
```
```{r}
head(data)
```
### Data budgeting
```{r}
set.seed(222)
data_split <- initial_split(data, prop = 4/5, strata = Gender)
train_edata <- training(data_split)
test_edata <- testing(data_split)
train_edata <- train_edata %>%
select(-Gender, -Trial)
test_edata <- test_edata %>%
select(-Gender, -Trial)
```
### Data preprocessing
```{r}
rec_train <- train_edata %>%
recipe(Diagnosis ~ .) %>%
step_scale(all_numeric()) %>%
step_center(all_numeric()) %>%
prep(training = train_edata, retain = TRUE)
train_scaled <- juice(rec_train)
test_scaled <- bake(rec_train, new_data = test_edata)
```
### Principal component analysis
```{r}
#packages
pacman::p_load(tidytext)
#recipe for PCA incl. prep
pca_rec <- recipe(Diagnosis~., data = train_scaled) %>%
update_role(PatID, new_role = "id") %>%
step_pca(all_numeric(), id = "pca")%>%
prep()
#table where the values are "loadings", i.e. how much each feature contributes to the principal components
tidied_pca <- tidy(pca_rec, 1)
#data frames with the five principal components (step_pca keeps 5 components by default)
pca_bake <- bake(pca_rec, train_scaled)
pca_b_test <- bake(pca_rec, test_scaled)
#plots from Riccardo
tidied_pca %>%
filter(component %in% paste0("PC", 1:5)) %>%
mutate(component = fct_inorder(component)) %>%
ggplot(aes(value, terms, fill = terms)) +
geom_col(show.legend = FALSE) +
facet_wrap(~component, nrow = 1) +
labs(y = NULL)
tidied_pca %>%
filter(component %in% paste0("PC", 1:15)) %>%
group_by(component) %>%
top_n(15, abs(value)) %>%
ungroup() %>%
mutate(terms = reorder_within(terms, abs(value), component)) %>%
ggplot(aes(abs(value), terms, fill = value > 0)) +
geom_col() +
facet_wrap(~component, scales = "free_y") +
scale_y_reordered() +
labs(
x = "Absolute value of contribution",
y = NULL, fill = "Positive?"
)
```
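Before deciding how many components to carry into the model, it can help to look at how much variance each one explains (an optional sketch; it assumes the `tidy()` method for `step_pca` accepts `type = "variance"`, as in recent versions of recipes).
```{r}
# Optional sketch: percent of variance explained per principal component
tidy(pca_rec, id = "pca", type = "variance") %>%
  filter(terms == "percent variance") %>%
  ggplot(aes(x = component, y = value)) +
  geom_col() +
  theme_minimal() +
  labs(x = "Principal component", y = "Percent of variance explained")
```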
### Feature importance (used to select components)
```{r}
pacman::p_load(DALEX, DALEXtra, kernlab, randomForest, xgboost, knitr, dotwhisker, caret)
LogisticRegression_edata <- logistic_reg() %>%
set_mode("classification") %>%
set_engine("glm") %>%
fit(Diagnosis ~ . , data = pca_bake)
explainer_lm <- explain_tidymodels(
LogisticRegression_edata,
data = pca_bake,
y = as.numeric(pca_bake$Diagnosis) -1,
label = "logReg",
verbose = FALSE)
explainer_lm %>%
model_parts() %>%
plot(show_boxplots = FALSE) +
ggtitle("Feature Importance", "")
```
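To connect the importance plot to the formula below (which keeps PC1-PC3), the permutation importances can also be ranked numerically (an illustrative sketch; it assumes the `model_parts()` output contains `variable` and `dropout_loss` columns, as in current DALEX versions).
```{r}
# Optional sketch: rank components by mean permutation importance
importance_lm <- model_parts(explainer_lm)
as_tibble(importance_lm) %>%
  filter(!variable %in% c("_full_model_", "_baseline_")) %>%
  group_by(variable) %>%
  summarise(mean_dropout_loss = mean(dropout_loss), .groups = "drop") %>%
  arrange(desc(mean_dropout_loss))
```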
### Making the formula
```{r}
emp_f <- bf(Diagnosis ~ 1 + PC1 + PC2 + PC3 + (1 | PatID))
```
### Get prior
```{r}
get_prior(emp_f,
pca_bake,
family = bernoulli)
```
### Set priors
```{r}
emp_f_priors <- c(
prior(normal(0, 1), class = Intercept),
prior(normal(0, 0.3), class = b),
prior(normal(0, 1), class = sd)
)
```
### Building the prior-only model
```{r}
emp_f_prior <-
brm(
emp_f,
pca_bake,
family = bernoulli,
prior = emp_f_priors,
sample_prior = "only",
iter = 4000,
warmup = 2000,
cores = 4,
refresh=0,
chains = 4,
control = list(
adapt_delta = 0.999,
max_treedepth = 20))
```
### Prior predictive checks
```{r}
pp_check(emp_f_prior, ndraws = 100)
```
### Fit the model
```{r}
fitted_emp_f_prior <-
brm(
emp_f,
pca_bake,
family = bernoulli,
prior = emp_f_priors,
sample_prior = T,
iter = 4000,
warmup = 2000,
cores = 4,
refresh=0,
chains = 4,
control = list(
adapt_delta = 0.999,
max_treedepth = 20))
```
### Posterior predictive checks
```{r}
pp_check(fitted_emp_f_prior, ndraws = 100)
```
### Prior posterior update checks
```{r}
variables(fitted_emp_f_prior)
Posterior_emp_f1 <- as_draws_df(fitted_emp_f_prior)
ggplot(Posterior_emp_f1) +
geom_density(aes(prior_Intercept), fill="chartreuse2", color="black",alpha=0.6) +
geom_density(aes(b_Intercept), fill="deeppink", color="black",alpha=0.6) +
xlab('Intercept') +
theme_classic()+
ggtitle("intercept")+
theme(plot.title = element_text(size = 10, face = "bold"))
```
### Performance check
```{r}
#Performance check
PerformanceProb2 <- tibble(expand_grid(
Sample = seq(4000),
Type= c("training", "test"))
)
pca_b_test <- pca_b_test %>%
mutate(Diagnosis = as.factor(Diagnosis))
pca_bake <- pca_bake %>%
mutate(Diagnosis = as.factor(Diagnosis))
pca_bake$PredictionsPerc2 <- predict(fitted_emp_f_prior)[,1]
pca_bake$Predictions2[pca_bake$PredictionsPerc2 > 0.5] <- "SCZ"
pca_bake$Predictions2[pca_bake$PredictionsPerc2 <= 0.5] <- "CT"
pca_b_test$PredictionsPerc2 <- predict(fitted_emp_f_prior, newdata = pca_b_test, allow_new_levels = T)[,1]
pca_b_test$Predictions2[pca_b_test$PredictionsPerc2 > 0.5] <- "SCZ"
pca_b_test$Predictions2[pca_b_test$PredictionsPerc2 <= 0.5] <- "CT"
train2 <- inv_logit_scaled(posterior_linpred(fitted_emp_f_prior,
summary = F))
test2 <- inv_logit_scaled(posterior_linpred(fitted_emp_f_prior,
summary = F,
newdata = pca_b_test,
allow_new_levels = T ))
# Loop calculating accuracy for each posterior draw
# (train_scaled/test_scaled have the same rows as pca_bake/pca_b_test, so the
# per-draw predictions line up with the Diagnosis column)
for (i in seq(4000)){
train_scaled$Predictions2 <- as.factor(ifelse(train2[i,] > 0.5, "SCZ", "CT"))
test_scaled$Predictions2 <- as.factor(ifelse(test2[i,] > 0.5, "SCZ", "CT"))
PerformanceProb2$Accuracy[PerformanceProb2$Sample == i & PerformanceProb2$Type =="training"] <-
accuracy(train_scaled, truth = Diagnosis, estimate = Predictions2)[, ".estimate"]
PerformanceProb2$Accuracy[PerformanceProb2$Sample == i & PerformanceProb2$Type =="test"] <-
accuracy(test_scaled, truth = Diagnosis, estimate = Predictions2)[, ".estimate"]
}
```
```{r}
#Predictions must be a factor for the next step
pca_b_test <- pca_b_test %>%
mutate(Diagnosis = as.factor(Diagnosis),
Predictions2 = as.factor(Predictions2))
pca_bake <- pca_bake %>%
mutate(Diagnosis = as.factor(Diagnosis),
Predictions2 = as.factor(Predictions2))
# Table showing predictions versus actual diagnosis
conf_mat(
pca_b_test,
truth = Diagnosis,
estimate = Predictions2,
dnn = c("Predictions", "Truth")
)
```
```{r}
# Table showing accuracy
metrics(pca_b_test,
truth = Diagnosis, estimate = Predictions2) %>%
knitr::kable()
```
```{r}
#Plotting the accuracy
ggplot(PerformanceProb2) +
geom_point(aes(x = Type, y = as.numeric(Accuracy))) +
geom_hline(yintercept = 0.5, colour = "purple", linetype = "dashed") +
theme_minimal() +
ylab("Accuracy") +
xlab("Type") +
ggtitle("Accuracy for the training and test sets")
```