---
tags: Rscripts
---
# ANCOVA Assumptions testing
### Author: Jocelyn P. Colella
R code (figures embedded; summary results at the bottom of the page)
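The chunks below never call `library()` themselves, so a setup block is assumed to have been run first. A minimal sketch follows, with the packages inferred from the functions used in the code (an assumption, not stated in the original):

```
# Packages inferred from the function calls used below (assumed setup, not in the original script)
library(ggplot2)    # ggplot(), geom_smooth()
library(ggpubr)     # ggarrange()
library(ggpmisc)    # stat_poly_eq()
library(gridExtra)  # grid.arrange()
library(dplyr)      # %>%, select(), filter()
library(broom)      # augment()
library(rstatix)    # anova_test(), shapiro_test(), levene_test()
```
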
```
# TESTING ASSUMPTIONS FOR ANCOVA
# Following: https://www.datanovia.com/en/lessons/ancova-in-r/#:~:text=The%20Analysis%20of%20Covariance%20(ANCOVA,two%20or%20more%20independent%20groups.)
#ASSUMPTIONS:
# 1 - For each independent variable, the relationship between the dependent variable (y) and the covariate (x) is linear
# 2 - VIOLATED - Homogeneity of regression slopes
# 3 - VIOLATED - Normality of residuals
# 4 - VIOLATED - Homogeneity of variances (i.e., variance of residuals equal across all groups)
# 5 - VIOLATED - No outliers
# ASSUMPTION 1 (NOT VIOLATED) - linear relationship between dependent and independent variables
# To check this, make a scatter plot between covariate (weight) and outcome/dependent variable (EE, RQ, VO2, VCO2, etc)
all_F_EEvWt <- ggplot(all_F,aes(x = weight, y = EE, color = experiment, fill = experiment)) + geom_smooth(method="lm")
all_M_EEvWt <- ggplot(all_M,aes(x = weight, y = EE, color = experiment, fill = experiment)) + geom_smooth(method="lm")
overall_MF_fig <- ggarrange(all_F_EEvWt, all_M_EEvWt, labels = c("Females (all exp)", "Males (all exp)"), ncol = 2, nrow = 1)
overall_MF_fig
```
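
Assumption 1 is described as a scatter-plot check, but the chunk above only draws the fitted lm lines. A sketch of one way to overlay the raw points (assuming the same `all_F` data frame and columns used above):

```
# Optional sketch: add the raw points under the lm fit so the linearity check is a true scatter plot
# (assumes the same all_F data frame and columns as above)
all_F_EEvWt_pts <- ggplot(all_F, aes(x = weight, y = EE, color = experiment, fill = experiment)) +
  geom_point(alpha = 0.2, size = 0.5) +
  geom_smooth(method = "lm")
all_F_EEvWt_pts
```
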
```
# Create paneled figure by looping through all daytime/nighttime portions of each experimental group
#Male & female dataframes to loop through and plot
M_df_list = c("BL_M", "BL_day_M", "BL_night_M",
              "hot_M", "hot_day_M", "hot_night_M",
              "cold_M", "cold_day_M", "cold_night_M")
F_df_list = c("BL_F", "BL_day_F", "BL_night_F",
              "hot_F", "hot_day_F", "hot_night_F",
              "cold_F", "cold_day_F", "cold_night_F")
# Corresponding plot titles
title_list = c("Baseline", "Baseline: daytime", "Baseline: nighttime",
               "Hot", "Hot: daytime", "Hot: nighttime",
               "Cold", "Cold: daytime", "Cold: nighttime")
# Plot function
myplot <- function(data, title){
  ggplot(data, aes(x = weight, y = EE)) + #, color = experiment, fill = experiment)) +
    geom_smooth(method = "lm", color = "black", fill = "grey") +
    # formula = y ~ x so the displayed equation matches the lm fit drawn by geom_smooth
    stat_poly_eq(formula = y ~ x, aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
                 parse = TRUE, size = 3) +
    labs(title = title)
}
###################
# Generate MALE plots
count = 1
M_plot_list <- list()
for(i in M_df_list){   # loop over the MALE datasets
  print(myplot(get(i), title_list[count]))
  M_plot_list[[count]] <- myplot(get(i), title_list[count])
  count = count + 1
}
#Plot 9-paneled figure of lm plots
M_panel_fig <- grid.arrange(grobs = M_plot_list, top = "Males", nrow = 3)
M_panel_fig
#EXPORT PANELED FIGURE TO PDF
pdf("Males_EExWeight_2July2020.pdf", width = 8.5, height = 11)
grid.arrange(grobs = M_plot_list, top = "Males", nrow = 3)
dev.off()
```

```
####################
# Generate FEMALE plots
count = 1
F_plot_list <- list()
for(i in F_df_list){
  print(myplot(get(i), title_list[count]))
  F_plot_list[[count]] <- myplot(get(i), title_list[count])
  count = count + 1
}
#Plot 9-paneled figure of lm plots
F_panel_fig <- grid.arrange(grobs = F_plot_list, top = "Females", nrow = 3)
F_panel_fig
#EXPORT PANELED FIGURE TO PDF
pdf("Females_EExWeight_25June2020.pdf", width = 8.5, height = 11)
grid.arrange(grobs = F_plot_list, top = "Females", nrow = 3)
dev.off()
```

```
# ASSUMPTION 2 (VIOLATED) - Homogeneity of regression slopes
# The slope of the covariate (weight) should be the same in every treatment group,
# i.e., no interaction between the covariate and the grouping (experiment) variable
# Check whether the experiment:weight interaction is significant
allexp_df_list = c("all_F", "all_day_F", "all_night_F", "all_M", "all_day_M", "all_night_M")
# Single example of this test
all_M %>% anova_test(EE ~ experiment*weight)
#ANOVA Table (type II tests)
# Effect DFn DFd F p p<.05 ges
#1 experiment 2 6464 732.125 2.51e-287 * 0.185
#2 weight 1 6464 98.697 4.30e-23 * 0.015
#3 experiment:weight 2 6464 11.653 8.87e-06 * 0.004 ### SIGNIFICANCE OF THE INTERACTION VIOLATES ASSUMPTION
# Loop through all M/F datasets that contain all 3 experiments (NOTE: instead of printing we could have the loop write to a file)
for(i in allexp_df_list){
  print(i)
  print(get(i) %>% anova_test(EE ~ experiment*weight))
}
# A significant p-value for the experiment:weight interaction indicates a violation of this assumption
# ASSUMPTION 3 (VIOLATED) - Normality of residuals
# Single example of this test
model <- lm(EE ~ weight + experiment, data = all_day_F)
model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit)
dim(model.metrics)
subsamp <- model.metrics[sample(nrow(model.metrics), 4999), ] #shapiro_test can only handle <5000 samples, so randomly downsample
shapiro_test(subsamp$.resid) #test normality of residuals (use model.metrics$.resid directly if n < 5000)
#variable statistic p.value
#1 subsamp$.resid 0.896 6.83e-50 **** Significance indicates that residuals are NON NORMALLY DISTRIBUTED
#LOOP (NOTE: instead of printing we could have the loop write to a file)
for(i in allexp_df_list){
  print(i)
  model <- lm(EE ~ weight + experiment, data = get(i))
  model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit)
  if (dim(model.metrics)[1] < 5000){
    print(shapiro_test(model.metrics$.resid))
  } else {
    subsamp <- model.metrics[sample(nrow(model.metrics), 4999), ] #shapiro_test can only handle <5000 samples, so randomly downsample
    print(shapiro_test(subsamp$.resid))
  }
}
# ASSUMPTION 4 (VIOLATED) - Homogeneity of variances (i.e., variance of residuals equal across all groups)
model.metrics %>% levene_test(.resid ~ experiment)
#LOOP (NOTE: instead of printing we could have the loop write to a file)
for(i in allexp_df_list){
  print(i)
  model <- lm(EE ~ weight + experiment, data = get(i))
  model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit)
  print(model.metrics %>% levene_test(.resid ~ experiment))
}
#Significant p-values indicate a violation of this assumption - variances are not homogeneous
# The output will contain a warning ("group coerced to factor"); that is fine because we want the different experiments (BL, hot, cold) treated as groups/factors
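# (Assumed optional tweak, not in the original) converting the grouping column to a factor
# beforehand avoids the "group coerced to factor" warning without changing the result:
# model.metrics$experiment <- as.factor(model.metrics$experiment)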
# ASSUMPTION 5 (VIOLATED) - No outliers
# Outliers can be identified by examining the standardized residuals (or studentized residuals), i.e., the residual divided by its estimated standard error
# Standardized residuals can be interpreted as the number of standard errors away from the regression line
# Check out residual values:
model.metrics$.std.resid
#Identify observations with an absolute standardized residual greater than 3
model.metrics %>% filter(abs(.std.resid) > 3) %>% as.data.frame()
for(i in allexp_df_list){
  print(i)
  model <- lm(EE ~ weight + experiment, data = get(i))
  model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit)
  print(model.metrics %>% filter(abs(.std.resid) > 3) %>% as.data.frame())
}
```
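
Several loops above note that the results could be written to a file instead of printed. A minimal sketch of one way to collect the three checks summarized in the table below into a single data frame (assuming the same `allexp_df_list` data frames; the column names and output filename are illustrative):

```
# Sketch: collect the assumption checks for each data subset into one table
# (assumes the allexp_df_list data frames defined above; filename is illustrative)
assumption_summary <- data.frame()
for (i in allexp_df_list) {
  model <- lm(EE ~ weight + experiment, data = get(i))
  model.metrics <- augment(model)
  resid_vec <- model.metrics$.resid
  if (length(resid_vec) >= 5000) resid_vec <- sample(resid_vec, 4999)  # Shapiro-Wilk limit
  aov_tab <- as.data.frame(anova_test(get(i), EE ~ experiment*weight))
  assumption_summary <- rbind(assumption_summary, data.frame(
    data_subset   = i,
    shapiro_p     = shapiro_test(resid_vec)$p.value,                      # assumption 3
    interaction_p = aov_tab$p[aov_tab$Effect == "experiment:weight"],     # assumption 2
    n_outliers    = sum(abs(model.metrics$.std.resid) > 3)                # assumption 5
  ))
}
assumption_summary
write.csv(assumption_summary, "ANCOVA_assumption_summary.csv", row.names = FALSE)
```
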
SUMMARY of results for assumptions 3, 2, and 5

| data subset | Shapiro-Wilk p-value (#3) | experiment:weight interaction p-value (#2) | # of outliers (#5) |
| -------- | -------- | -------- | -------- |
| all_F | 6.83E-50 | 0.008 | 99 |
| all_day_F | 1.68E-43 | 0.022 | 56 |
| all_night_F | 1.08E-25 | 0.008 | 8 |
| all_M | 9.36E-49 | 0.004 | 80 |
| all_day_M | 9.34E-45 | 0.003 | 59 |
| all_night_M | 5.72E-22 | 0.007 | 0 |
# Remove outliers and rerun tests
```
# REMOVE male outliers, then remake the data subsets and check for outliers again
model <- lm(EE ~ weight + experiment, data = all_M)
model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit)
summ <- model.metrics %>% filter(abs(.std.resid) > 3) %>% as.data.frame()
M_OL_list <- c()
for (outlier in 1:nrow(summ)){
  this_weight <- summ[outlier, "weight"]
  this_EE <- summ[outlier, "EE"]
  print(which(all_M$weight == this_weight & all_M$EE == this_EE))
  M_OL_list <- c(M_OL_list, which(all_M$weight == this_weight & all_M$EE == this_EE))
}
#Remove outlier indices from male dataset
all_noOL_M <- all_M[-c(M_OL_list),]
### Repeat for females
model <- lm(EE ~ weight + experiment, data = all_F)
model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit)
summ <- model.metrics %>% filter(abs(.std.resid) > 3) %>% as.data.frame()
summ
F_OL_list <- c()
for (outlier in 1:nrow(summ)){
  this_weight <- summ[outlier, "weight"]
  this_EE <- summ[outlier, "EE"]
  print(which(all_F$weight == this_weight & all_F$EE == this_EE))
  F_OL_list <- c(F_OL_list, which(all_F$weight == this_weight & all_F$EE == this_EE))
}
print(F_OL_list)
length(F_OL_list)
#Remove outlier indices from female dataset
all_noOL_F <- all_F[-c(F_OL_list),]
```
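
The loops above locate each outlier by matching its weight and EE values back against the full data frame. If `lm()` drops no rows (no missing values in EE, weight, or experiment), `augment()` keeps rows in their original order, so the indices can also be taken directly from the standardized residuals. A sketch under that assumption:

```
# Alternative sketch (assumes lm() drops no rows, so augment() rows align one-to-one with all_M)
model <- lm(EE ~ weight + experiment, data = all_M)
M_OL_idx <- which(abs(augment(model)$.std.resid) > 3)
all_noOL_M_alt <- all_M[-M_OL_idx, ]
length(M_OL_idx)  # should match the number of outliers reported above
```
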
## Rerun tests of ANCOVA assumptions using no-outlier (noOL) datasets
```
###############################################################
###############################################################
###############################################################
# TEST ASSUMPTIONS FOR ANCOVA (https://www.datanovia.com/en/lessons/ancova-in-r/#:~:text=The%20Analysis%20of%20Covariance%20(ANCOVA,two%20or%20more%20independent%20groups.)
# 1 - For each independent variable, the relationship between the dependent variable (y) and the covariate (x) is linear
# 2 - VIOLATED - Homogeneity of regression slopes
# 3 - VIOLATED - Normality of residuals
# 4 - VIOLATED - Homogeneity of variances (i.e., variance of residuals equal across all groups)
# 5 - VIOLATED - No outliers (removed above and rechecked below)
# ASSUMPTION 1 (NOT VIOLATED) - linear relationship between dependent and independent variables
#Nighttime baseline slope is OPPOSITE that of the hot and cold experiments
# TO CHECK, make a scatter plot between covariate (weight) and outcome/dependent variable (EE, RQ, VO2, VCO2, etc)
all_noOL_F_EEvWt <- ggplot(all_noOL_F,aes(x = weight, y = EE, color = experiment, fill = experiment)) + geom_smooth(method="lm")
all_noOL_M_EEvWt <- ggplot(all_noOL_M,aes(x = weight, y = EE, color = experiment, fill = experiment)) + geom_smooth(method="lm")
overall_noOL_MF_fig <- ggarrange(all_noOL_F_EEvWt, all_noOL_M_EEvWt, labels = c("Females (all exp)", "Males (all exp)"), ncol = 2, nrow = 1)
overall_noOL_MF_fig
```

```
# Create paneled figure by looping through all daytime/nighttime portions of each experimental group
#Male & female dataframes to loop through and plot
M_df_list = c("BL_noOL_M", "BL_day_noOL_M", "BL_night_noOL_M",
              "hot_noOL_M", "hot_day_noOL_M", "hot_night_noOL_M",
              "cold_noOL_M", "cold_day_noOL_M", "cold_night_noOL_M")
F_df_list = c("BL_noOL_F", "BL_day_noOL_F", "BL_night_noOL_F",
              "hot_noOL_F", "hot_day_noOL_F", "hot_night_noOL_F",
              "cold_noOL_F", "cold_day_noOL_F", "cold_night_noOL_F")
# Corresponding plot titles
title_list = c("Baseline", "Baseline: daytime", "Baseline: nighttime",
               "Hot", "Hot: daytime", "Hot: nighttime",
               "Cold", "Cold: daytime", "Cold: nighttime")
# Plot function
myplot <- function(data, title){
  ggplot(data, aes(x = weight, y = EE)) + #, color = experiment, fill = experiment)) +
    geom_smooth(method = "lm", color = "black", fill = "grey") +
    # formula = y ~ x so the displayed equation matches the lm fit drawn by geom_smooth
    stat_poly_eq(formula = y ~ x, aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
                 parse = TRUE, size = 3) +
    labs(title = title)
}
###################
# Generate MALE plots
count = 1
M_plot_list <- list()
for(i in M_df_list){   # loop over the MALE no-outlier datasets
  print(myplot(get(i), title_list[count]))
  M_plot_list[[count]] <- myplot(get(i), title_list[count])
  count = count + 1
}
#Plot 9-paneled figure of lm plots
M_panel_fig <- grid.arrange(grobs = M_plot_list, top = "Males", nrow = 3)
M_panel_fig
#EXPORT PANELED FIGURE TO PDF
pdf("Males_noOL_EExWeight_2July2020.pdf", width = 8.5, height = 11)
grid.arrange(grobs = M_plot_list, top = "Males", nrow = 3)
dev.off()
```

### Note: the inverse relationship in the previous (with-outlier) plots was driven entirely by outliers
```
####################
# Generate FEMALE plots
count = 1
F_plot_list <- list()
for(i in F_df_list){
  print(myplot(get(i), title_list[count]))
  F_plot_list[[count]] <- myplot(get(i), title_list[count])
  count = count + 1
}
#Plot 9-paneled figure of lm plots
F_panel_fig <- grid.arrange(grobs = F_plot_list, top = "Females", nrow = 3)
F_panel_fig
#EXPORT PANELED FIGURE TO PDF
pdf("Females_noOL_EExWeight_2July2020.pdf", width = 8.5, height = 11)
grid.arrange(grobs = F_plot_list, top = "Females", nrow = 3)
dev.off()
```

### Note: the inverse relationship in the previous (with-outlier) plots was driven entirely by outliers
```
# ASSUMPTION 2 (VIOLATED) - Homogeneity of regression slopes
# The slope of the covariate (weight) should be the same in every treatment group,
# i.e., no interaction between the covariate and the grouping (experiment) variable
# Check whether the experiment:weight interaction is significant
allexp_df_list = c("all_noOL_F", "all_day_noOL_F", "all_night_noOL_F", "all_noOL_M", "all_day_noOL_M", "all_night_noOL_M")
# Single example of this test
all_noOL_M %>% anova_test(EE ~ experiment*weight)
# As before, a significant experiment:weight interaction violates this assumption
# Loop through all M/F datasets that contain all 3 experiments (NOTE: instead of printing we could have the loop write to a file)
for(i in allexp_df_list){
  print(i)
  print(get(i) %>% anova_test(EE ~ experiment*weight))
}
# A significant p-value for the experiment:weight interaction indicates a violation of this assumption
# ASSUMPTION 3 (VIOLATED) - Normality of residuals
# Single example of this test
model <- lm(EE ~ weight + experiment, data = all_day_noOL_F)
model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit)
dim(model.metrics)
#subsamp <- model.metrics[sample(nrow(model.metrics), 4999), ] #shapiro_test can only examine <5000 samples, so randomly downsample
shapiro_test(model.metrics$.resid) #test normality of residuals using model.metrics or subsamp (depending on size)
# Significance again indicates that residuals are NOT normally distributed
#LOOP (NOTE: instead of printing we could have the loop write to a file)
for(i in allexp_df_list){
  print(i)
  model <- lm(EE ~ weight + experiment, data = get(i))
  model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit)
  if (dim(model.metrics)[1] < 5000){
    print(shapiro_test(model.metrics$.resid))
  } else {
    subsamp <- model.metrics[sample(nrow(model.metrics), 4999), ] #shapiro_test can only handle <5000 samples, so randomly downsample
    print(shapiro_test(subsamp$.resid))
  }
}
# ASSUMPTION 4 (VIOLATED) - Homogeneity of variances (i.e., variance of residuals equal across all groups)
model.metrics %>% levene_test(.resid ~ experiment)
#LOOP (NOTE: instead of printing we could have the loop write to a file)
for(i in allexp_df_list){
  print(i)
  model <- lm(EE ~ weight + experiment, data = get(i))
  model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit)
  print(model.metrics %>% levene_test(.resid ~ experiment))
}
#Significant p-values indicate a violation of this assumption - variances are not homogeneous
# The output will contain a warning ("group coerced to factor"); that is fine because we want the different experiments (BL, hot, cold) treated as groups/factors
# ASSUMPTION 5 (VIOLATED) - No outliers
# Outliers can be identified by examining the standardized residuals (or studentized residuals), i.e., the residual divided by its estimated standard error
# Standardized residuals can be interpreted as the number of standard errors away from the regression line
# Check out residual values:
model.metrics$.std.resid
#Identify observations with an absolute standardized residual greater than 3
model.metrics %>% filter(abs(.std.resid) > 3) %>% as.data.frame()
for(i in allexp_df_list){
  print(i)
  model <- lm(EE ~ weight + experiment, data = get(i))
  model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit)
  print(model.metrics %>% filter(abs(.std.resid) > 3) %>% as.data.frame())
}
```
SUMMARY of results
