--- tags: Rscripts --- # ANCOVA Assumptions testing ### Author: Jocelyn P. Colella R code (figures embedded, summary results at the bottom of page) ``` # TESTING ASSUMPTIONS fOR ANCOVA # Following: https://www.datanovia.com/en/lessons/ancova-in-r/#:~:text=The%20Analysis%20of%20Covariance%20(ANCOVA,two%20or%20more%20independent%20groups.) #ASSUMPTIONS: # 1 - For each independent variable, the relationship between the dependent variable (y) and the covariate (x) is linear # 2 - VIOLATED - homogeneity of regression slopes # 3 - VIOLATED normality of residuals # 4 - VIOLATED homogeneity of variances (e.g., variance of residuals equal for all groups) # 5 - VIOLATED No outliers # ASSUMPTION 1 (NOT VIOLATED) - linear relationship between dependent and independent variables # To check this, make a scatter plot between covariate (weight) and outcome/dependent variable (EE, RQ, VO2, VCO2, etc) all_F_EEvWt <- ggplot(all_F,aes(x = weight, y = EE, color = experiment, fill = experiment)) + geom_smooth(method="lm") all_M_EEvWt <- ggplot(all_M,aes(x = weight, y = EE, color = experiment, fill = experiment)) + geom_smooth(method="lm") overall_MF_fig <- ggarrange(all_F_EEvWt, all_M_EEvWt, labels = c("Females (all exp)", "Males (all exp)"), ncol = 2, nrow = 1) overall_MF_fig ``` ``` # Create paneled figure by looping through all daytime/nighttime portions of each experimental group #Male & female dataframes to loop through and plot M_df_list = c("BL_M", "BL_day_M", "BL_night_M", "hot_M", "hot_day_M", "hot_night_M", "cold_M", "cold_day_M", "cold_night_M") F_df_list = c("BL_F", "BL_day_F", "BL_night_F", "hot_F", "hot_day_F", "hot_night_F", "cold_F", "cold_day_F", "cold_night_F") # Corresponding plot titles title_list = c("Baseline", "Baseline: daytime", "Baseline: nighttime", "Hot", "Hot: daytime", "Hot: nighttime", "Cold", "Cold: daytime", "Cold: nighttime") # Plot function myplot <- function(data, title){ ggplot(data,aes(x = weight, y = EE)) + #, color = experiment, fill = experiment)) + geom_smooth(method="lm", color="black", fill = "grey") + stat_poly_eq(formula = x~y, aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")), parse = TRUE, size = 3) + labs(title = title) } ################### # Generate MALE plots count = 1 M_plot_list <- list() for(i in F_df_list){ print(myplot(get(i), title_list[count])) M_plot_list[[count]] <- (myplot(get(i), title_list[count])) count = count + 1 } #Plot 9 paneled figure of lm plots M_panel_fig <- grid.arrange(M_plot_list[[1]], M_plot_list[[2]], M_plot_list[[3]], M_plot_list[[4]], M_plot_list[[5]], M_plot_list[[6]], M_plot_list[[7]], M_plot_list[[8]], M_plot_list[[9]], top = "Males", nrow = 3) M_panel_fig #EXPORT PANELED FIGURE TO PDF pdf("Males_EExWeight_2July2020.pdf", width = 8.5, height = 11) grid.arrange(M_plot_list[[1]], M_plot_list[[2]], M_plot_list[[3]], M_plot_list[[4]], M_plot_list[[5]], M_plot_list[[6]], M_plot_list[[7]], M_plot_list[[8]], M_plot_list[[9]], top = "Males", nrow = 3) dev.off() ``` ![](https://i.imgur.com/lsUd5qz.jpg) ``` #################### # Generate FEMALE plots count = 1 F_plot_list <- list() for(i in F_df_list){ print(myplot(get(i), title_list[count])) F_plot_list[[count]] <- (myplot(get(i), title_list[count])) count = count + 1 } #Plot 9 paneled figure of lm plots F_panel_fig <- grid.arrange(F_plot_list[[1]], F_plot_list[[2]], F_plot_list[[3]], F_plot_list[[4]], F_plot_list[[5]], F_plot_list[[6]], F_plot_list[[7]], F_plot_list[[8]], F_plot_list[[9]], top = "Females",nrow = 3) F_panel_fig #EXPORT PANELED FIGURE TO PDF pdf("Females_EExWeight_25June2020.pdf", width = 8.5, height = 11) grid.arrange(F_plot_list[[1]], F_plot_list[[2]], F_plot_list[[3]], F_plot_list[[4]], F_plot_list[[5]], F_plot_list[[6]], F_plot_list[[7]], F_plot_list[[8]], F_plot_list[[9]], top = "Females", nrow = 3) dev.off() ``` ![](https://i.imgur.com/Y5L6TYk.jpg) ``` # ASSUMPTION 2 (VIOLATED) - Homogeneity of regression slopes # Covariate is independent of the treatment effects (i.e. the covariate and independent variables are independent) # Check if there is a significant interaction between the covariate (weight) and the grouping (exp) variable allexp_df_list = c("all_F", "all_day_F", "all_night_F", "all_M", "all_day_M", "all_night_M") # Single example of this test all_M %>% anova_test(EE ~ experiment*weight) #ANOVA Table (type II tests) # Effect DFn DFd F p p<.05 ges #1 experiment 2 6464 732.125 2.51e-287 * 0.185 #2 weight 1 6464 98.697 4.30e-23 * 0.015 #3 experiment:weight 2 6464 11.653 8.87e-06 * 0.004 ### SIGNIFICANCE OF THE INTERACTION VIOLATES ASSUMPTION # Loop through all M/F datasets that contain all 3 experiments (NOTE: instead of printing we could have the loop write to a file) for(i in allexp_df_list){ print(i) print(get(i) %>% anova_test(EE ~ experiment*weight)) } # A significant p value indicates an assumption violation = e.g., # ASSUMPTION 3 (VIOLATED) - Normality of residuals # Single example of this test model <- lm(EE ~ weight + experiment, data = all_day_F) model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit) dim(model.metrics) subsamp <- model.metrics[sample(nrow(model.metrics), 4999), ] #shapiro_test can only examine <5000 samples, so randomly downsample shapiro_test(model.metrics$.resid) #test normality of residuals using model.metrics or subsamp (depending on size) #variable statistic p.value #1 subsamp$.resid 0.896 6.83e-50 **** Significance indicates that residuals are NON NORMALLY DISTRIBUTED #LOOP (NOTE: instead of printing we could have the loop write to a file) for(i in allexp_df_list){ print(i) model <- lm(EE ~ weight + experiment, data = get(i)) model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit) if (dim(model.metrics)[1] < 5000){ print(shapiro_test(model.metrics$.resid)) } else { subsamp <- model.metrics[sample(nrow(model.metrics), 4999), ] #shapiro_test can only examine <5000 samples, so randomly downsample print(shapiro_test(subsamp$.resid)) } } # ASSUMPTION 4 (VIOLATED) - homogeneity of variances (e.g., variance of residuals equal for all groups) model.metrics %>% levene_test(.resid ~ model.metrics$experiment) #LOOP (NOTE: instead of printing we could have the loop write to a file) for(i in allexp_df_list){ print(i) model <- lm(EE ~ weight + experiment, data = get(i)) model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit) print(model.metrics %>% levene_test(.resid ~ model.metrics$experiment)) } #Significant pvalues indicate a violation of this assumption - variance are not homogeneous # output will contain a warning message: "group coerced to factor" (that's ok because we want the diff experiments [BL, hot, cold] to be treated as groups/factors) # ASSUMPTION 5 (VIOLATED) - No outliers # Outliers can be identified by examining the standardized residual (or studentized residual) which is the residual divided by its estimated standard error # Standardizd residuals can be interpreted as teh number of standard errors away from the regression line # Check out residual values: model.metrics$.std.resid #Identify any that are more than 3 standard deviations from the mean model.metrics %>% filter(abs(.std.resid) > 3) %>% as.data.frame() for(i in allexp_df_list){ print(i) model <- lm(EE ~ weight + experiment, data = get(i)) model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit) print(model.metrics %>% filter(abs(.std.resid) > 3) %>% as.data.frame()) } ``` SUMMARY results for assumptions (3, 2, and 5) | data subset | Shapiro Wilks pval (#3) | anova_test interaction (exp/weight) pval (#2) | # of outliers (#5) | | -------- | -------- | -------- |-------- | | all_F | 6.83E-50 | 0.008 | 99 | all_day_F | 1.68E-43 | 0.022 | 56 | all_night_F | 1.08E-25 | 0.008 | 8 | all_M | 9.36E-49 | 0.004 | 80 | all_day_M | 9.34E-45 | 0.003 | 59 | all_night_M | 5.72E-22 | 0.007 | 0 # Remove outliers and rerun tests ``` # REMOVE Male outliers for then remak data subsets and check for outliers again model <- lm(EE ~ weight + experiment, data = all_M) model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit) summ <- model.metrics %>% filter(abs(.std.resid) > 3) %>% as.data.frame() M_OL_list <- c() for (outlier in 1:nrow(summ)){ this_weight <- summ[outlier, "weight"] this_EE <- summ[outlier, "EE"] print(which(all_M$weight == this_weight & all_M$EE == this_EE)) M_OL_list <- c(M_OL_list, which(all_M$weight == this_weight & all_M$EE == this_EE)) } #Remove outlier indices from male dataset all_noOL_M <- all_M[-c(M_OL_list),] ### Repeat for females model <- lm(EE ~ weight + experiment, data = all_F) model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit) summ <- model.metrics %>% filter(abs(.std.resid) > 3) %>% as.data.frame() summ F_OL_list <- c() for (outlier in 1:nrow(summ)){ this_weight <- summ[outlier, "weight"] this_EE <- summ[outlier, "EE"] print(which(all_F$weight == this_weight & all_F$EE == this_EE)) F_OL_list <- c(F_OL_list, which(all_F$weight == this_weight & all_F$EE == this_EE)) } print(F_OL_list) length(F_OL_list) #Remove outlier indices from male dataset all_noOL_F <- all_F[-c(F_OL_list),] ``` ## Rerun tests of ANCOVA assumption using no-outlier (noOL) datasets ``` ############################################################### ############################################################### ############################################################### # TEST ASSUMPTIONS fOR ANCOVA (https://www.datanovia.com/en/lessons/ancova-in-r/#:~:text=The%20Analysis%20of%20Covariance%20(ANCOVA,two%20or%20more%20independent%20groups.) # 1 - For each independent variable, the relationship between the dependent variable (y) and the covariate (x) is linear # 2 - VIOLATED - homogeneity of regression slopes # 3 - VIOLATED normality of residuals # 4 - VIOLATED homogeneity of variances (e.g., variance of residuals equal for all groups) # 5 - VIOLATED No outliers (handled above and run rerun below) # ASSUMPTION 1 (NOT VIOLATED) - linear relationship between dependent and independent variables #Nighttime baseline slop is OPPOSITE that of the hot and cold experiments # TO CHECK, make a scatter plot between covariate (weight) and outcome/dependent variable (EE, RQ, VO2, VCO2, etc) all_noOL_F_EEvWt <- ggplot(all_noOL_F,aes(x = weight, y = EE, color = experiment, fill = experiment)) + geom_smooth(method="lm") all_noOL_M_EEvWt <- ggplot(all_noOL_M,aes(x = weight, y = EE, color = experiment, fill = experiment)) + geom_smooth(method="lm") overall_noOL_MF_fig <- ggarrange(all_noOL_F_EEvWt, all_noOL_M_EEvWt, labels = c("Females (all exp)", "Males (all exp)"), ncol = 2, nrow = 1) overall_noOL_MF_fig ``` ![](https://i.imgur.com/6rKUOsg.jpg) ``` # Create paneled figure by looping through all daytime/nighttime portions of each experimental group #Male & female dataframes to loop through and plot M_df_list = c("BL_noOL_M", "BL_day_noOL_M", "BL_night_noOL_M", "hot_noOL_M", "hot_day_noOL_M", "hot_night_noOL_M", "cold_noOL_M", "cold_day_noOL_M", "cold_night_noOL_M") F_df_list = c("BL_noOL_F", "BL_day_noOL_F", "BL_night_noOL_F", "hot_noOL_F", "hot_day_noOL_F", "hot_night_noOL_F", "cold_noOL_F", "cold_day_noOL_F", "cold_night_noOL_F") # Corresponding plot titles title_list = c("Baseline", "Baseline: daytime", "Baseline: nighttime", "Hot", "Hot: daytime", "Hot: nighttime", "Cold", "Cold: daytime", "Cold: nighttime") # Plot function myplot <- function(data, title){ ggplot(data,aes(x = weight, y = EE)) + #, color = experiment, fill = experiment)) + geom_smooth(method="lm", color="black", fill = "grey") + stat_poly_eq(formula = x~y, aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")), parse = TRUE, size = 3) + labs(title = title) } ################### # Generate MALE plots count = 1 M_plot_list <- list() for(i in F_df_list){ print(myplot(get(i), title_list[count])) M_plot_list[[count]] <- (myplot(get(i), title_list[count])) count = count + 1 } #Plot 9 paneled figure of lm plots M_panel_fig <- grid.arrange(M_plot_list[[1]], M_plot_list[[2]], M_plot_list[[3]], M_plot_list[[4]], M_plot_list[[5]], M_plot_list[[6]], M_plot_list[[7]], M_plot_list[[8]], M_plot_list[[9]], top = "Males", nrow = 3) M_panel_fig #EXPORT PANELED FIGURE TO PDF pdf("Males_noOL_EExWeight_2July2020.pdf", width = 8.5, height = 11) grid.arrange(M_plot_list[[1]], M_plot_list[[2]], M_plot_list[[3]], M_plot_list[[4]], M_plot_list[[5]], M_plot_list[[6]], M_plot_list[[7]], M_plot_list[[8]], M_plot_list[[9]], top = "Males", nrow = 3) dev.off() ``` ![](https://i.imgur.com/0t8KK5U.jpg) ### Note: inverse relationship in previous plots PURELY driven by outliers ``` #################### # Generate FEMALE plots count = 1 F_plot_list <- list() for(i in F_df_list){ print(myplot(get(i), title_list[count])) F_plot_list[[count]] <- (myplot(get(i), title_list[count])) count = count + 1 } #Plot 9 paneled figure of lm plots F_panel_fig <- grid.arrange(F_plot_list[[1]], F_plot_list[[2]], F_plot_list[[3]], F_plot_list[[4]], F_plot_list[[5]], F_plot_list[[6]], F_plot_list[[7]], F_plot_list[[8]], F_plot_list[[9]], top = "Females",nrow = 3) F_panel_fig #EXPORT PANELED FIGURE TO PDF pdf("Females_noOL_EExWeight_2July2020.pdf", width = 8.5, height = 11) grid.arrange(F_plot_list[[1]], F_plot_list[[2]], F_plot_list[[3]], F_plot_list[[4]], F_plot_list[[5]], F_plot_list[[6]], F_plot_list[[7]], F_plot_list[[8]], F_plot_list[[9]], top = "Females", nrow = 3) dev.off() ``` ![](https://i.imgur.com/DPi2QFB.jpg) ### Note: inverse relationship in previous plots PURELY driven by outliers ``` # ASSUMPTION 2 (VIOLATED) - Homogeneity of regression slopes # Covariate is independent of the treatment effects (i.e. the covariate and independent variables are independent) # Check if there is a significant interaction between the covariate (weight) and the grouping (exp) variable allexp_df_list = c("all_noOL_F", "all_day_noOL_F", "all_night_noOL_F", "all_noOL_M", "all_day_noOL_M", "all_night_noOL_M") # Single example of this test all_noOL_M %>% anova_test(EE ~ experiment*weight) #ANOVA Table (type II tests) # Effect DFn DFd F p p<.05 ges #1 experiment 2 6464 732.125 2.51e-287 * 0.185 #2 weight 1 6464 98.697 4.30e-23 * 0.015 #3 experiment:weight 2 6464 11.653 8.87e-06 * 0.004 ### SIGNIFICANCE OF THE INTERACTION VIOLATES ASSUMPTION # Loop through all M/F datasets that contain all 3 experiments (NOTE: instead of printing we could have the loop write to a file) for(i in allexp_df_list){ print(i) print(get(i) %>% anova_test(EE ~ experiment*weight)) } # A significant p value indicates an assumption violation = e.g., # ASSUMPTION 3 (VIOLATED) - Normality of residuals # Single example of this test model <- lm(EE ~ weight + experiment, data = all_day_noOL_F) model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit) dim(model.metrics) #subsamp <- model.metrics[sample(nrow(model.metrics), 4999), ] #shapiro_test can only examine <5000 samples, so randomly downsample shapiro_test(model.metrics$.resid) #test normality of residuals using model.metrics or subsamp (depending on size) #variable statistic p.value #1 subsamp$.resid 0.896 6.83e-50 **** Significance indicates that residuals are NON NORMALLY DISTRIBUTED #LOOP (NOTE: instead of printing we could have the loop write to a file) for(i in allexp_df_list){ print(i) model <- lm(EE ~ weight + experiment, data = get(i)) model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit) if (dim(model.metrics)[1] < 5000){ print(shapiro_test(model.metrics$.resid)) } else { subsamp <- model.metrics[sample(nrow(model.metrics), 4999), ] #shapiro_test can only examine <5000 samples, so randomly downsample print(shapiro_test(subsamp$.resid)) } } # ASSUMPTION 4 (VIOLATED) - homogeneity of variances (e.g., variance of residuals equal for all groups) model.metrics %>% levene_test(.resid ~ model.metrics$experiment) #LOOP (NOTE: instead of printing we could have the loop write to a file) for(i in allexp_df_list){ print(i) model <- lm(EE ~ weight + experiment, data = get(i)) model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit) print(model.metrics %>% levene_test(.resid ~ model.metrics$experiment)) } #Significant pvalues indicate a violation of this assumption - variance are not homogeneous # output will contain a warning message: "group coerced to factor" (that's ok because we want the diff experiments [BL, hot, cold] to be treated as groups/factors) # ASSUMPTION 5 (VIOLATED) - No outliers # Outliers can be identified by examining the standardized residual (or studentized residual) which is the residual divided by its estimated standard error # Standardizd residuals can be interpreted as teh number of standard errors away from the regression line # Check out residual values: model.metrics$.std.resid #Identify any that are more than 3 standard deviations from the mean model.metrics %>% filter(abs(.std.resid) > 3) %>% as.data.frame() for(i in allexp_df_list){ print(i) model <- lm(EE ~ weight + experiment, data = get(i)) model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit) print(model.metrics %>% filter(abs(.std.resid) > 3) %>% as.data.frame()) } ``` SUMMARY of results ![](https://i.imgur.com/PdH5Vcw.jpg)