# A3 machine learning ## part 1 - simulating data ### packages2 ```{r} pacman::p_load(msm, tidyverse, brms, grid, gridExtra, readxl, metafor, dplyr, magrittr, reshape, tidymodels) ``` ### (load data) ```{r} data <- read_csv("Ass3_empiricalData1.csv") ``` ------- ### Simulation for the informed dataset ( først en chunk for df 1 (6 fra ma og 4 noise) og for df 2(baseline kun med noise) #Simulating data (df 1 = with 6 from the meta-analysis, 4 with just random noise) ```{r} # Define population size n <- 100 trials <- 10 # Define effect sizes #see table 2 from the article for the numbers InformedEffectMean <- c(0.25, -0.55, 0.74, -1.26, 0.05, 1.89, 0, 0, 0, 0) # Define individual variability from population and across trials and measurement error IndividualSD <- 1 TrialSD <- 0.5 Error <- 0.2 # Conditions Schizophrenia <- rnorm(1, rnorm(1, 0.21, 0.5), 0.2) Control <- rnorm(1, rnorm(1, -0.21, 0.5), 0.2) # For each pair of participant, we need to identity the true effect size for(i in seq(10)) { temp_informed <- tibble( ID = seq(n), TrueEffect = rnorm(n, InformedEffectMean[i], IndividualSD), Variable = paste0("v", i)) if(i == 1) { d_informed_true <- temp_informed } else { d_informed_true <- rbind(d_informed_true, temp_informed) } } # Create tibble with one row per trial d_trial <- tibble(expand_grid(ID = seq(n), Trial = seq(trials), Group = c("Schizophrenia", "Control"))) d_informed <- merge(d_informed_true, d_trial) for(i in seq(nrow(d_informed))){ d_informed$measurement[i] <- ifelse(d_informed$Group[i]=="Schizophrenia", rnorm(1, rnorm(1, d_informed$TrueEffect[i]/2, TrialSD), Error), rnorm(1, rnorm(1, -d_informed$TrueEffect[i]/2, TrialSD), Error)) } d_informed_wide <- d_informed %>% mutate(TrueEffect= NULL) %>% pivot_wider(names_from = Variable, values_from = measurement) ``` ### Simulation for the skeptic dataset (df 2 = baseline dataset including only 10 noise variables) ```{r} #... and now we do the same for the second data set (baseline dataset including only 10 noise variables) skeptic_EffectMean <- rep(0,10) for(i in seq(10)) { temp_skeptic <- tibble( ID = seq(n), TrueEffect = rnorm(n, skeptic_EffectMean[i], IndividualSD), Variable = paste0("v", i)) if(i == 1) { d_informed_true <- temp_informed d_skeptic_true <- temp_skeptic } else { d_skeptic_true <- rbind(d_skeptic_true, temp_skeptic) } } d_skeptic <- merge(d_skeptic_true, d_trial) for(i in seq(nrow(d_skeptic))){ d_skeptic$measurement[i] <- ifelse(d_skeptic$Group[i]=="Schizophrenia", rnorm(1, rnorm(1, d_skeptic$TrueEffect[i]/2, TrialSD), Error), rnorm(1, rnorm(1, -d_skeptic$TrueEffect[i]/2, TrialSD), Error)) } d_skeptic_wide <- d_skeptic %>% mutate(TrueEffect= NULL) %>% pivot_wider(names_from = Variable, values_from = measurement) ``` ------------------------------------------------------------------------------------------------------ ## Part 2 - ML pipeline on simulated data ### Starting with the informed dataset #first: lets make ID and Trials as character - so they won't be scaled and centered in the next step ```{r} d_informed_wide <- d_informed_wide %>% mutate(ID = as.factor(ID)) %>% mutate(Trial = as.factor(Trial)) d_skeptic_wide <- d_skeptic_wide %>% mutate(ID = as.factor(ID)) %>% mutate(Trial = as.factor(Trial)) ``` i) create a data budget (e.g. balanced training and test sets); ```{r} TestID <- sample(seq(n), 20) #Using the ! to say what is not in the first one put in the second one train_informed <- d_informed_wide %>% subset(!(ID %in% TestID)) test_informed <- d_informed_wide %>% subset((ID %in% TestID)) train_skeptic <- d_skeptic_wide %>% subset(!(ID %in% TestID)) test_skeptic<- d_skeptic_wide %>% subset((ID %in% TestID)) ``` ii) pre-process the data (e.g. scaling the features); ```{r} #Informed scaled_informed <- train_informed %>% recipe(Group ~ .) %>% #define the outcome step_scale(all_numeric()) %>% #scales all the numeric values step_center(all_numeric()) %>% #centers all the numeric values prep(training = train_informed, retain = TRUE) #creating new scaled dataframes train_informed_scaled <- juice(scaled_informed) test_informed_scaled <- bake(scaled_informed, new_data = test_informed) ``` ```{r} #Skeptic scaled_skeptic <- train_skeptic %>% recipe(Group ~ .) %>% #define the outcome step_scale(all_numeric()) %>% #scales all the numeric values step_center(all_numeric()) %>% #centers all the numeric values prep(training = train_skeptic, retain = TRUE) #creating new scaled dataframes train_skeptic_scaled <- juice(scaled_skeptic) test_skeptic_scaled <- bake(scaled_skeptic, new_data = test_skeptic) ``` iii) fit and assess a classification algorithm on the training data (e.g. Bayesian multilevel logistic regression); ```{r} #create formula for informed f1 <- bf(Group ~ 1 + v1 + v2 + v3 + v4 + v5 +v6 +v7 +v8 +v9 +v10) #create formula for skeptic s_f1 <- bf(Group ~ 1 + v1 + v2 + v3 + v4 + v5 +v6 +v7 +v8 +v9 +v10) ``` ```{r} #get prios for informed get_prior(f1, train_informed_scaled, family = bernoulli) #get prios for skeptic get_prior(s_f1, train_skeptic_scaled, family = bernoulli) ``` #setting priors ```{r} #informed f1_prior <- c( prior(normal(0, 1), class = Intercept), prior(normal(0, 0.3), class = b) ) # Skeptic s_f1_prior <- c( prior(normal(0, 1), class = Intercept), prior(normal(0, 0.3), class = b) ) ``` #make model ```{r} fitted_f1_prior <- brm( f1, train_informed_scaled, family = bernoulli, prior = f1_prior, sample_prior = T, iter = 4000, warmup = 2000, cores = 4, backend ="cmdstanr", threads = threading(2), refresh=0, chains = 4, control = list( adapt_delta = 0.999, max_treedepth = 20)) #skeptic fitted_s_f1_prior <- brm( s_f1, train_skeptic_scaled, family = bernoulli, prior = s_f1_prior, sample_prior = T, iter = 4000, warmup = 2000, cores = 4, backend ="cmdstanr", threads = threading(2), refresh=0, chains = 4, control = list( adapt_delta = 0.999, max_treedepth = 20)) ``` #look at pp-check ```{r} #informed pp_check(fitted_f1_prior, ndraws = 100) #skeptic pp_check(fitted_s_f1_prior, ndraws = 100) ``` #Parameter recovery ```{r} #informed print(fitted_f1_prior) #skeptic print(fitted_s_f1_prior) ``` #Prior posterier update checks ```{r} #informed variables(fitted_f1_prior) Posterior_f1 <- as_draws_df(fitted_f1_prior) ggplot(Posterior_f1) + geom_density(aes(prior_Intercept), fill="chartreuse2", color="black",alpha=0.6) + geom_density(aes(b_Intercept), fill="deeppink", color="black",alpha=0.6) + xlab('Intercept') + theme_classic()+ ggtitle("intercept for study effect size")+ theme(plot.title = element_text(size = 10, face = "bold")) #skeptic variables(fitted_s_f1_prior) Posterior_s_f1 <- as_draws_df(fitted_s_f1_prior) ggplot(Posterior_s_f1) + geom_density(aes(prior_Intercept), fill="chartreuse2", color="black",alpha=0.6) + geom_density(aes(b_Intercept), fill="deeppink", color="black",alpha=0.6) + xlab('Intercept') + theme_classic()+ ggtitle("intercept for study effect size")+ theme(plot.title = element_text(size = 10, face = "bold")) #green = prior #pink = simulated data ``` iv) assess performance on the test set; ```{r} PerformanceProb <- tibble(expand_grid( Sample = seq(4000), Model =c("Our_model"), Setup= c("informed", "skeptic"), Type= c("training", "test")) ) # informed test_informed_scaled$PredictionsPerc0 <- predict(fitted_f1_prior, newdata = test_informed_scaled, allow_new_levels = T)[,1] test_informed_scaled$Predictions0[test_informed_scaled$PredictionsPerc0 > 0.5] <- "Schizophrenia" test_informed_scaled$Predictions0[test_informed_scaled$PredictionsPerc0 <= 0.5] <- "Control" train_informed_scaled$PredictionsPerc0 <- predict(fitted_f1_prior)[,1] train_informed_scaled$Predictions0[train_informed_scaled$PredictionsPerc0 > 0.5] <- "Schizophrenia" train_informed_scaled$Predictions0[train_informed_scaled$PredictionsPerc0 <= 0.5] <- "Control" # skeptic test_skeptic_scaled$PredictionsPerc1 <- predict(fitted_s_f1_prior, newdata = test_skeptic_scaled, allow_new_levels = T)[,1] test_skeptic_scaled$Predictions1[test_skeptic_scaled$PredictionsPerc1 > 0.5] <- "Schizophrenia" test_skeptic_scaled$Predictions1[test_skeptic_scaled$PredictionsPerc1 <= 0.5] <- "Control" train_skeptic_scaled$PredictionsPerc1 <- predict(fitted_s_f1_prior)[,1] train_skeptic_scaled$Predictions1[train_skeptic_scaled$PredictionsPerc1 > 0.5] <- "Schizophrenia" train_skeptic_scaled$Predictions1[train_skeptic_scaled$PredictionsPerc1 <= 0.5] <- "Control" # informed train0 <- inv_logit_scaled(posterior_linpred(fitted_f1_prior, summary = F)) test0 <- inv_logit_scaled(posterior_linpred(fitted_f1_prior, summary = F, newdata = test_informed_scaled, allow_new_levels = T )) # skeptic train1 <- inv_logit_scaled(posterior_linpred(fitted_s_f1_prior, summary = F)) test1 <- inv_logit_scaled(posterior_linpred(fitted_s_f1_prior, summary = F, newdata = test_skeptic_scaled, allow_new_levels = T )) # informed test_informed_scaled <- test_informed_scaled %>% mutate(Group = as.factor(Group), Predictions0 = as.factor((Predictions0))) train_informed_scaled <- train_informed_scaled %>% mutate(Group = as.factor(Group), Predictions0 = as.factor((Predictions0))) # skeptic test_skeptic_scaled <- test_skeptic_scaled %>% mutate(Group = as.factor(Group), Predictions1 = as.factor(Predictions1)) train_skeptic_scaled <- train_skeptic_scaled %>% mutate(Group = as.factor(Group), Predictions1 = as.factor(Predictions1)) for (i in seq(4000)){ train_informed_scaled$Predictions0 <- as.factor(ifelse(train0[i,] > 0.5, "Schizophrenia", "Control")) test_informed_scaled$Predictions0 <- as.factor(ifelse(test0[i,] > 0.5, "Schizophrenia", "Control")) PerformanceProb$Accuracy[PerformanceProb$Sample==i & PerformanceProb$Model == "Our_model" & PerformanceProb$Setup == "informed" & PerformanceProb$Type =="training"] <- accuracy(train_informed_scaled, truth = Group, estimate = Predictions0)[, ".estimate"] PerformanceProb$Accuracy[PerformanceProb$Sample==i & PerformanceProb$Model == "Our_model" & PerformanceProb$Setup == "informed" & PerformanceProb$Type =="test"] <- accuracy(test_informed_scaled, truth = Group, estimate = Predictions0)[, ".estimate"] train_skeptic_scaled$Predictions1 <- as.factor(ifelse(train1[i,] > 0.5, "Schizophrenia", "Control")) test_skeptic_scaled$Predictions1 <- as.factor(ifelse(test1[i,] > 0.5, "Schizophrenia", "Control")) PerformanceProb$Accuracy[PerformanceProb$Sample == i & PerformanceProb$Model == "Our_model" & PerformanceProb$Setup == "skeptic" & PerformanceProb$Type =="training"] <- accuracy(train_skeptic_scaled, truth = Group, estimate = Predictions1)[, ".estimate"] PerformanceProb$Accuracy[PerformanceProb$Sample == i & PerformanceProb$Model == "Our_model" & PerformanceProb$Setup == "skeptic" & PerformanceProb$Type =="test"] <- accuracy(test_skeptic_scaled, truth = Group, estimate = Predictions1)[, ".estimate"] } ``` #### Assesing average performance for the training and test set in the informed simualtion ```{r} conf_mat( train_informed_scaled, truth = Group, estimate = Predictions0, dnn = c("Prediction", "truth")) metrics( train_informed_scaled, truth = Group, estimate = Predictions0) ``` ```{r} conf_mat( test_informed_scaled, truth = Group, estimate = Predictions0, dnn = c("Prediction", "truth")) metrics( test_informed_scaled, truth = Group, estimate = Predictions0) ``` #### Assesing average performance for the training and test set in the skeptic simualtion ```{r} conf_mat( train_skeptic_scaled, truth = Group, estimate = Predictions0, dnn = c("Prediction", "truth")) metrics( train_skeptic_scaled, truth = Group, estimate = Predictions0) ``` ```{r} conf_mat( test_skeptic_scaled, truth = Group, estimate = Predictions0, dnn = c("Prediction", "truth")) metrics( test_skeptic_scaled, truth = Group, estimate = Predictions0) ``` # Plotting the accuracy ```{r} ggplot(PerformanceProb) + geom_point(aes(x = Setup, y = as.numeric(Accuracy), colour = Type)) + geom_abline(intercept = 0.5, slope = 0, col=c("Purple"), linetype = c("dashed")) + theme_minimal() + ylab("Accuracy") + xlab("Type") + theme_minimal() + ggtitle("Accuracy between informed and sceptic") ``` v) discuss whether performance is as expected and feature importance is as expected. ### Part III - Applying the ML pipeline to empirical data Download the empirical dataset from brightspace and apply your ML pipeline to the new data, adjusting where needed. Warning: in the simulated dataset we only had 10 features, now you have many more! Such is the life of the ML practitioner. Consider the impact a higher number of features will have on your ML inference, and decide whether you need to cut down the number of features before running the pipeline (or alternatively expand the pipeline to add feature selection). ### Load in empirical data and clean it ```{r} data <- read_csv("Ass3_empiricalData1.csv") data <- data %>% select(-Language) %>% select(-NewID) %>% select(-Corpus) %>% mutate(Diagnosis= as.factor(Diagnosis)) %>% mutate(PatID = as.factor(PatID)) ``` ```{r} head(data) ``` ### Data budgeting ```{r} set.seed(222) data_split <- initial_split(data, prop = 4/5, strata = Gender) train_edata <- training(data_split) test_edata <- testing(data_split) train_edata <- train_edata%>% select(-Gender) %>% select(-Trial) test_edata <- test_edata%>% select(-Gender) %>% select(-Trial) ### Data preprocessing rec_train <- train_edata %>% recipe(Diagnosis ~ .) %>% step_scale(all_numeric()) %>% step_center(all_numeric()) %>% prep(training = train_edata, retain = TRUE) train_scaled <- juice(rec_train) test_scaled <- bake(rec_train, new_data = test_edata) ``` ### Principal component analysis ```{r} #packages pacman::p_load(tidytext) install.packages('BiocManager') #recipe for PCA incl. prep pca_rec <- recipe(Diagnosis~., data = train_scaled) %>% update_role(PatID, new_role = "id") %>% step_pca(all_numeric(), id = "pca")%>% prep() #table where the values are "loadings" i.e. how much they count in the Principle components tidied_pca <- tidy(pca_rec, 1) #data frame where five principle components are included pca_bake <- bake(pca_rec, train_scaled) pca_b_test <- bake(pca_rec, test_scaled) #plots from Riccardo tidied_pca %>% filter(component %in% paste0("PC", 1:5)) %>% mutate(component = fct_inorder(component)) %>% ggplot(aes(value, terms, fill = terms)) + geom_col(show.legend = FALSE) + facet_wrap(~component, nrow = 1) + labs(y = NULL) tidied_pca %>% filter(component %in% paste0("PC", 1:15)) %>% group_by(component) %>% top_n(15, abs(value)) %>% ungroup() %>% mutate(terms = reorder_within(terms, abs(value), component)) %>% ggplot(aes(abs(value), terms, fill = value > 0)) + geom_col() + facet_wrap(~component, scales = "free_y") + scale_y_reordered() + labs( x = "Absolute value of contribution", y = NULL, fill = "Positive?" ) ``` ### Feature selection ```{r} pacman::p_load(DALEX, DALEXtra, kernlab, randomForest, xgboost, knitr, dotwhisker, caret) LogisticRegression_edata <- logistic_reg() %>% set_mode("classification") %>% set_engine("glm") %>% fit(Diagnosis ~ . , data = pca_bake) explainer_lm <- explain_tidymodels( LogisticRegression_edata, data = pca_bake, y = as.numeric(pca_bake$Diagnosis) -1, label = "logReg", verbose = FALSE) explainer_lm %>% model_parts() %>% plot(show_boxplots = FALSE) + ggtitle("Feature Importance", "") ``` ### Making the formula ```{r} emp_f <- bf(Diagnosis ~ 1 + PC1 + PC2 + PC3 + (1 | PatID)) ``` ### Get prior ```{r} get_prior(emp_f, pca_bake, family = bernoulli) ``` ### Set priors ```{r} emp_f_priors <- c( prior(normal(0, 1), class = Intercept), prior(normal(0, 0.3), class = b), prior(normal(0, 1), class = sd) ) ``` ### Building model ```{r} emp_f_prior <- brm( emp_f, pca_bake, family = bernoulli, prior = emp_f_priors, sample_prior = "only", iter = 4000, warmup = 2000, cores = 4, refresh=0, chains = 4, control = list( adapt_delta = 0.999, max_treedepth = 20)) ``` ### Prior predictive checks ```{r} pp_check(emp_f_prior, ndraws = 100) ``` ### Fit the model ```{r} fitted_emp_f_prior <- brm( emp_f, pca_bake, family = bernoulli, prior = emp_f_priors, sample_prior = T, iter = 4000, warmup = 2000, cores = 4, refresh=0, chains = 4, control = list( adapt_delta = 0.999, max_treedepth = 20)) ``` ### Posterior predictive checks ```{r} pp_check(fitted_emp_f_prior, ndraws = 100) ``` ### Prior posterior update checks ```{r} variables(fitted_emp_f_prior) Posterior_emp_f1 <- as_draws_df(fitted_emp_f_prior) ggplot(Posterior_emp_f1) + geom_density(aes(prior_Intercept), fill="chartreuse2", color="black",alpha=0.6) + geom_density(aes(b_Intercept), fill="deeppink", color="black",alpha=0.6) + xlab('Intercept') + theme_classic()+ ggtitle("intercept")+ theme(plot.title = element_text(size = 10, face = "bold")) ``` ### Performance check ```{r} #Performance check PerformanceProb2 <- tibble(expand_grid( Sample = seq(4000), Type= c("training", "test")) ) pca_b_test <- pca_b_test %>% mutate(Diagnosis = as.factor(Diagnosis)) pca_bake <- pca_bake %>% mutate(Diagnosis = as.factor(Diagnosis)) pca_bake$PredictionsPerc2 <- predict(fitted_emp_f_prior)[,1] pca_bake$Predictions2[pca_bake$PredictionsPerc2 > 0.5] <- "SCZ" pca_bake$Predictions2[pca_bake$PredictionsPerc2 <= 0.5] <- "CT" pca_b_test$PredictionsPerc2 <- predict(fitted_emp_f_prior, newdata = pca_b_test, allow_new_levels = T)[,1] pca_b_test$Predictions2[pca_b_test$PredictionsPerc2 >= 0.5] <- "SCZ" pca_b_test$Predictions2[pca_b_test$PredictionsPerc2 < 0.5] <- "CT" train2 <- inv_logit_scaled(posterior_linpred(fitted_emp_f_prior, summary = F)) test2 <- inv_logit_scaled(posterior_linpred(fitted_emp_f_prior, summary = F, newdata = pca_b_test, allow_new_levels = T )) # Loop calculating accuracy for (i in seq(4000)){ train_scaled$Predictions2 <- as.factor(ifelse(train2[i,] > 0.5, "SCZ", "CT")) test_scaled$Predictions2 <- as.factor(ifelse(test2[i,] > 0.5, "SCZ", "CT")) PerformanceProb2$Accuracy[PerformanceProb2$Sample == i & PerformanceProb2$Type =="training"] <- accuracy(train_scaled, truth = Diagnosis, estimate = Predictions2)[, ".estimate"] PerformanceProb2$Accuracy[PerformanceProb2$Sample == i & PerformanceProb2$Type =="test"] <- accuracy(test_scaled, truth = Diagnosis, estimate = Predictions2)[, ".estimate"] } ``` ```{r} #Predictions as factor is nesecary for the next step pca_b_test <- pca_b_test %>% mutate(Diagnosis = as.factor(Diagnosis), Predictions2 = as.factor(Predictions2)) pca_bake <- pca_bake %>% mutate(Diagnosis = as.factor(Diagnosis), Predictions2 = as.factor(Predictions2)) # Table showing predictions versus actual diagnosis conf_mat( pca_b_test, truth = Diagnosis, estimate = Predictions2, dnn = c("Predictions", "Truth") ) ``` ```{r} # Table showing accuracy metrics(pca_b_test, truth = Diagnosis, estimate = Predictions2) %>% knitr::kable() ``` ```{r} #Plotting the accuracy ggplot(PerformanceProb2) + geom_point(aes(x = Type, y = as.numeric(Accuracy))) + geom_abline(intercept = 0.5, slope = 0, col=c("Purple"), linetype = c("dashed")) + theme_minimal() + ylab("Accuracy") + xlab("Type") + theme_minimal() + ggtitle("Accuracy between test and train") ```