
ANCOVA Assumptions testing

Author: Jocelyn P. Colella

R code (figures embedded, summary results at the bottom of page)

# TESTING ASSUMPTIONS FOR ANCOVA
# Following: https://www.datanovia.com/en/lessons/ancova-in-r/#:~:text=The%20Analysis%20of%20Covariance%20(ANCOVA,two%20or%20more%20independent%20groups.)

# Required packages (the original script assumes these are already loaded)
library(ggplot2)   # plotting
library(ggpubr)    # ggarrange()
library(gridExtra) # grid.arrange()
library(ggpmisc)   # stat_poly_eq()
library(rstatix)   # anova_test(), shapiro_test(), levene_test()
library(broom)     # augment()
library(dplyr)     # %>%, select(), filter()

  #ASSUMPTIONS:
  # 1 - For each group of the independent variable, the relationship between the dependent variable (y) and the covariate (x) is linear
  # 2 - VIOLATED - homogeneity of regression slopes
  # 3 - VIOLATED normality of residuals
  # 4 - VIOLATED homogeneity of variances (i.e., variance of residuals is equal across groups)
  # 5 - VIOLATED No outliers


# ASSUMPTION 1 (NOT VIOLATED) - linear relationship between the dependent variable and the covariate
# To check this, make a scatter plot between covariate (weight) and outcome/dependent variable (EE, RQ, VO2, VCO2, etc)
all_F_EEvWt <- ggplot(all_F,aes(x = weight, y = EE, color = experiment, fill = experiment)) + geom_smooth(method="lm")
all_M_EEvWt <- ggplot(all_M,aes(x = weight, y = EE, color = experiment, fill = experiment)) + geom_smooth(method="lm")

overall_MF_fig <- ggarrange(all_F_EEvWt, all_M_EEvWt, labels = c("Females (all exp)", "Males (all exp)"), ncol = 2, nrow = 1)
overall_MF_fig
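# Sketch (not part of the original script): the lm line alone cannot reveal
# curvature, so overlaying the raw points and a loess smoother on the linear
# fit gives a clearer visual check of assumption 1. Shown for females; swap in
# all_M for males.
ggplot(all_F, aes(x = weight, y = EE, color = experiment)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_smooth(method = "loess", se = FALSE, linetype = "dashed")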
# Create paneled figure by looping through all daytime/nighttime portions of each experimental group
#Male & female dataframes to loop through and plot
M_df_list = c("BL_M", "BL_day_M", "BL_night_M",
              "hot_M",  "hot_day_M", "hot_night_M",
              "cold_M", "cold_day_M", "cold_night_M")
F_df_list = c("BL_F", "BL_day_F", "BL_night_F",
              "hot_F",  "hot_day_F", "hot_night_F",
              "cold_F", "cold_day_F", "cold_night_F")
# Corresponding plot titles
title_list = c("Baseline", "Baseline: daytime", "Baseline: nighttime",
                 "Hot",  "Hot: daytime", "Hot: nighttime",
                 "Cold", "Cold: daytime", "Cold: nighttime")

# Plot function
myplot <- function(data, title){
  ggplot(data,aes(x = weight, y = EE)) + #, color = experiment, fill = experiment)) +
    geom_smooth(method="lm", color="black", fill = "grey") +
    stat_poly_eq(formula = y ~ x, aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
                 parse = TRUE, size = 3) +
    labs(title = title)
}

###################
# Generate MALE plots
count = 1
M_plot_list <- list()
for(i in M_df_list){
  print(myplot(get(i), title_list[count]))
  M_plot_list[[count]] <- (myplot(get(i), title_list[count]))
  count = count + 1
}

#Plot 9 paneled figure of lm plots
M_panel_fig <- grid.arrange(M_plot_list[[1]], M_plot_list[[2]], M_plot_list[[3]], M_plot_list[[4]], M_plot_list[[5]], M_plot_list[[6]], M_plot_list[[7]], M_plot_list[[8]], M_plot_list[[9]],
             top = "Males", nrow = 3)
M_panel_fig

#EXPORT PANELED FIGURE TO PDF
pdf("Males_EExWeight_2July2020.pdf", width = 8.5, height = 11)
grid.arrange(M_plot_list[[1]], M_plot_list[[2]], M_plot_list[[3]], M_plot_list[[4]], M_plot_list[[5]], M_plot_list[[6]], M_plot_list[[7]], M_plot_list[[8]], M_plot_list[[9]],
             top = "Males",
             nrow = 3)
dev.off()

[Figure: males, 9-panel figure of EE vs. weight linear regressions (exported above to PDF)]

####################
# Generate FEMALE plots
count = 1
F_plot_list <- list()
for(i in F_df_list){
  print(myplot(get(i), title_list[count]))
  F_plot_list[[count]] <- (myplot(get(i), title_list[count]))
  count = count + 1
}
#Plot 9 paneled figure of lm plots
F_panel_fig <- grid.arrange(F_plot_list[[1]], F_plot_list[[2]], F_plot_list[[3]], F_plot_list[[4]], F_plot_list[[5]], F_plot_list[[6]], F_plot_list[[7]], F_plot_list[[8]], F_plot_list[[9]],
             top = "Females",nrow = 3)
F_panel_fig

#EXPORT PANELED FIGURE TO PDF
pdf("Females_EExWeight_25June2020.pdf", width = 8.5, height = 11)
grid.arrange(F_plot_list[[1]], F_plot_list[[2]], F_plot_list[[3]], F_plot_list[[4]], F_plot_list[[5]], F_plot_list[[6]], F_plot_list[[7]], F_plot_list[[8]], F_plot_list[[9]],
             top = "Females",
             nrow = 3)
dev.off()

[Figure: females, 9-panel figure of EE vs. weight linear regressions (exported above to PDF)]

# ASSUMPTION 2 (VIOLATED) - Homogeneity of regression slopes
# The regression slope of the covariate (weight) on the outcome should be the same in every experimental group
# Check if there is a significant interaction between the covariate (weight) and the grouping (exp) variable
allexp_df_list = c("all_F", "all_day_F", "all_night_F", "all_M", "all_day_M", "all_night_M")

# Single example of this test
all_M %>% anova_test(EE ~ experiment*weight)
#ANOVA Table (type II tests)
#             Effect DFn  DFd       F         p p<.05   ges
#1        experiment   2 6464 732.125 2.51e-287     * 0.185
#2            weight   1 6464  98.697  4.30e-23     * 0.015
#3 experiment:weight   2 6464  11.653  8.87e-06     * 0.004 ### SIGNIFICANCE OF THE INTERACTION VIOLATES THIS ASSUMPTION

# Loop through all M/F datasets that contain all 3 experiments (NOTE: instead of printing, the loop could write results to a file; see the sketch after this loop)
for(i in allexp_df_list){
  print(i)
  print(get(i) %>% anova_test(EE ~ experiment*weight))
}
  # A significant p-value for the experiment:weight interaction indicates that this assumption is violated
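# Sketch (assumption, not in the original workflow): one way to have the loop
# write its output to a file instead of printing to the console, using sink().
# The file name is a placeholder.
sink("ANCOVA_slope_tests.txt")
for(i in allexp_df_list){
  cat("\n====", i, "====\n")
  print(get(i) %>% anova_test(EE ~ experiment*weight))
}
sink()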


# ASSUMPTION 3 (VIOLATED) - Normality of residuals 
# Single example of this test
model <- lm(EE ~ weight + experiment, data = all_day_F)
model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit)
dim(model.metrics)
subsamp <- model.metrics[sample(nrow(model.metrics), 4999), ] #shapiro_test can only examine <5000 samples, so randomly downsample
shapiro_test(subsamp$.resid) #test normality of residuals (use model.metrics$.resid directly when n < 5000)
  #variable         statistic  p.value
  #1 subsamp$.resid     0.896 6.83e-50 **** Significance indicates that the residuals are NOT normally distributed

#LOOP (NOTE: instead of printing we could have the loop write to a file)
for(i in allexp_df_list){
  print(i)
  model <- lm(EE ~ weight + experiment, data = get(i))
  model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit)
  if (dim(model.metrics)[1] < 5000){
    print(shapiro_test(model.metrics$.resid))
  } else {
    subsamp <- model.metrics[sample(nrow(model.metrics), 4999), ] #shapiro_test can only examine <5000 samples, so randomly downsample
    print(shapiro_test(subsamp$.resid))
  }
}
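# Sketch (assumption, not part of the original analysis): for subsets with more
# than 5000 rows, a QQ plot of the residuals complements the downsampled
# Shapiro-Wilk test. ggqqplot() comes from the ggpubr package loaded above.
qq_model <- lm(EE ~ weight + experiment, data = all_M)   # any subset of interest
ggqqplot(augment(qq_model), ".resid", title = "QQ plot of model residuals")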


# ASSUMPTION 4 (VIOLATED) - homogeneity of variances (i.e., variance of residuals is equal across groups)
model.metrics %>% levene_test(.resid ~ experiment)

#LOOP (NOTE: instead of printing we could have the loop write to a file)
for(i in allexp_df_list){
  print(i)
  model <- lm(EE ~ weight + experiment, data = get(i))
  model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit)
  print(model.metrics %>% levene_test(.resid ~ experiment))
}
  #Significant p-values indicate a violation of this assumption - variances are not homogeneous
  # output will contain a warning message: "group coerced to factor" (that's ok because we want the different experiments [BL, hot, cold] to be treated as groups/factors)
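# Sketch (assumption, not in the original script): a residuals-vs-fitted plot
# is a quick visual companion to Levene's test for spotting heteroscedasticity.
# augment() is called without dropping .fitted here so it can be plotted.
metrics_full <- augment(lm(EE ~ weight + experiment, data = all_M))
ggplot(metrics_full, aes(x = .fitted, y = .resid)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title = "Residuals vs fitted (all_M)", x = "Fitted EE", y = "Residual")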

# ASSUMPTION 5 (VIOLATED) - No outliers
# Outliers can be identified by examining the standardized (or studentized) residual, which is the residual divided by its estimated standard error
# Standardized residuals can be interpreted as the number of standard errors away from the regression line
# Check out residual values:
model.metrics$.std.resid
#Identify any observations with an absolute standardized residual greater than 3
model.metrics %>%  filter(abs(.std.resid) > 3) %>% as.data.frame()

for(i in allexp_df_list){
  print(i)
  model <- lm(EE ~ weight + experiment, data = get(i))
  model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit)
  print(model.metrics %>% filter(abs(.std.resid) > 3) %>% as.data.frame())
}
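# Sketch (not in the original loop): tally the outliers per subset rather than
# printing every row; these counts correspond to the "# of outliers" column in
# the summary table below.
for(i in allexp_df_list){
  model <- lm(EE ~ weight + experiment, data = get(i))
  n_out <- sum(abs(augment(model)$.std.resid) > 3)
  cat(i, ":", n_out, "outliers\n")
}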

SUMMARY results for assumptions (3, 2, and 5)

| data subset | Shapiro-Wilk p-value (#3) | anova_test exp:weight interaction p-value (#2) | # of outliers (#5) |
|---|---|---|---|
| all_F | 6.83E-50 | 0.008 | 99 |
| all_day_F | 1.68E-43 | 0.022 | 56 |
| all_night_F | 1.08E-25 | 0.008 | 8 |
| all_M | 9.36E-49 | 0.004 | 80 |
| all_day_M | 9.34E-45 | 0.003 | 59 |
| all_night_M | 5.72E-22 | 0.007 | 0 |

Remove outliers and rerun tests

# REMOVE male outliers, then remake the data subsets and check for outliers again
model <- lm(EE ~ weight + experiment, data = all_M)
model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit)
summ <- model.metrics %>% filter(abs(.std.resid) > 3) %>% as.data.frame()

M_OL_list <- c()
for (outlier in 1:nrow(summ)){
  this_weight <- summ[outlier, "weight"]
  this_EE  <- summ[outlier, "EE"]
  print(which(all_M$weight == this_weight & all_M$EE == this_EE))
  M_OL_list <- c(M_OL_list, which(all_M$weight == this_weight & all_M$EE == this_EE))
}

#Remove outlier indices from male dataset
all_noOL_M <- all_M[-c(M_OL_list),]
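# Alternative sketch (an assumption, not the original approach): augment()
# returns rows in the same order as the input data when there are no missing
# values, so outliers can be dropped by logical indexing instead of matching on
# weight/EE values (which could mis-match if duplicate weight/EE pairs exist).
all_noOL_M_alt <- all_M[abs(model.metrics$.std.resid) <= 3, ]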


### Repeat for females
model <- lm(EE ~ weight + experiment, data = all_F)
model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit)
summ <- model.metrics %>% filter(abs(.std.resid) > 3) %>% as.data.frame()
summ
F_OL_list <- c()
for (outlier in 1:nrow(summ)){
  this_weight <- summ[outlier, "weight"]
  this_EE  <- summ[outlier, "EE"]
  print(which(all_F$weight == this_weight & all_F$EE == this_EE))
  F_OL_list <- c(F_OL_list, which(all_F$weight == this_weight & all_F$EE == this_EE))
}
print(F_OL_list)
length(F_OL_list)

#Remove outlier indices from female dataset
all_noOL_F <- all_F[-c(F_OL_list),]

Rerun tests of ANCOVA assumption using no-outlier (noOL) datasets

###############################################################
###############################################################
###############################################################
# TEST ASSUMPTIONS FOR ANCOVA (https://www.datanovia.com/en/lessons/ancova-in-r/#:~:text=The%20Analysis%20of%20Covariance%20(ANCOVA,two%20or%20more%20independent%20groups.)
  # 1 - For each group of the independent variable, the relationship between the dependent variable (y) and the covariate (x) is linear
  # 2 - VIOLATED - homogeneity of regression slopes
  # 3 - VIOLATED normality of residuals
  # 4 - VIOLATED homogeneity of variances (i.e., variance of residuals is equal across groups)
  # 5 - VIOLATED No outliers (handled above and rerun below)


# ASSUMPTION 1 (NOT VIOLATED) - linear relationship between the dependent variable and the covariate
  #Nighttime baseline slope is OPPOSITE that of the hot and cold experiments
# TO CHECK, make a scatter plot between covariate (weight) and outcome/dependent variable (EE, RQ, VO2, VCO2, etc)
all_noOL_F_EEvWt <- ggplot(all_noOL_F,aes(x = weight, y = EE, color = experiment, fill = experiment)) + geom_smooth(method="lm")
all_noOL_M_EEvWt <- ggplot(all_noOL_M,aes(x = weight, y = EE, color = experiment, fill = experiment)) + geom_smooth(method="lm")

overall_noOL_MF_fig <- ggarrange(all_noOL_F_EEvWt, all_noOL_M_EEvWt, labels = c("Females (all exp)", "Males (all exp)"), ncol = 2, nrow = 1)
overall_noOL_MF_fig
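# Sketch (assumption, not in the original script): stack the with-outlier and
# no-outlier overview figures to visualize how the fitted slopes change once
# outliers are removed (see the notes below the paneled figures).
ggarrange(overall_MF_fig, overall_noOL_MF_fig, nrow = 2,
          labels = c("All data", "Outliers removed"))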


# Create paneled figure by looping through all daytime/nighttime portions of each experimental group
#Male & female dataframes to loop through and plot
M_df_list = c("BL_noOL_M", "BL_day_noOL_M", "BL_night_noOL_M",
              "hot_noOL_M",  "hot_day_noOL_M", "hot_night_noOL_M",
              "cold_noOL_M", "cold_day_noOL_M", "cold_night_noOL_M")
F_df_list = c("BL_noOL_F", "BL_day_noOL_F", "BL_night_noOL_F",
              "hot_noOL_F",  "hot_day_noOL_F", "hot_night_noOL_F",
              "cold_noOL_F", "cold_day_noOL_F", "cold_night_noOL_F")
# Corresponding plot titles
title_list = c("Baseline", "Baseline: daytime", "Baseline: nighttime",
                 "Hot",  "Hot: daytime", "Hot: nighttime",
                 "Cold", "Cold: daytime", "Cold: nighttime")

# Plot function
myplot <- function(data, title){
  ggplot(data,aes(x = weight, y = EE)) + #, color = experiment, fill = experiment)) +
    geom_smooth(method="lm", color="black", fill = "grey") +
    stat_poly_eq(formula = y ~ x, aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
                 parse = TRUE, size = 3) +
    labs(title = title)
}

###################
# Generate MALE plots
count = 1
M_plot_list <- list()
for(i in M_df_list){
  print(myplot(get(i), title_list[count]))
  M_plot_list[[count]] <- (myplot(get(i), title_list[count]))
  count = count + 1
}

#Plot 9 paneled figure of lm plots
M_panel_fig <- grid.arrange(M_plot_list[[1]], M_plot_list[[2]], M_plot_list[[3]], M_plot_list[[4]], M_plot_list[[5]], M_plot_list[[6]], M_plot_list[[7]], M_plot_list[[8]], M_plot_list[[9]],
             top = "Males", nrow = 3)
M_panel_fig

#EXPORT PANELED FIGURE TO PDF
pdf("Males_noOL_EExWeight_2July2020.pdf", width = 8.5, height = 11)
grid.arrange(M_plot_list[[1]], M_plot_list[[2]], M_plot_list[[3]], M_plot_list[[4]], M_plot_list[[5]], M_plot_list[[6]], M_plot_list[[7]], M_plot_list[[8]], M_plot_list[[9]],
             top = "Males",
             nrow = 3)
dev.off()

Note: the inverse relationship seen in the earlier (with-outlier) plots was driven purely by outliers

####################
# Generate FEMALE plots
count = 1
F_plot_list <- list()
for(i in F_df_list){
  print(myplot(get(i), title_list[count]))
  F_plot_list[[count]] <- (myplot(get(i), title_list[count]))
  count = count + 1
}
#Plot 9 paneled figure of lm plots
F_panel_fig <- grid.arrange(F_plot_list[[1]], F_plot_list[[2]], F_plot_list[[3]], F_plot_list[[4]], F_plot_list[[5]], F_plot_list[[6]], F_plot_list[[7]], F_plot_list[[8]], F_plot_list[[9]],
             top = "Females",nrow = 3)
F_panel_fig

#EXPORT PANELED FIGURE TO PDF
pdf("Females_noOL_EExWeight_2July2020.pdf", width = 8.5, height = 11)
grid.arrange(F_plot_list[[1]], F_plot_list[[2]], F_plot_list[[3]], F_plot_list[[4]], F_plot_list[[5]], F_plot_list[[6]], F_plot_list[[7]], F_plot_list[[8]], F_plot_list[[9]],
             top = "Females",
             nrow = 3)
dev.off()

Note: the inverse relationship seen in the earlier (with-outlier) plots was driven purely by outliers

# ASSUMPTION 2 (VIOLATED) - Homogeneity of regression slopes
# The regression slope of the covariate (weight) on the outcome should be the same in every experimental group
# Check if there is a significant interaction between the covariate (weight) and the grouping (exp) variable
allexp_df_list = c("all_noOL_F", "all_day_noOL_F", "all_night_noOL_F", "all_noOL_M", "all_day_noOL_M", "all_night_noOL_M")

# Single example of this test
all_noOL_M %>% anova_test(EE ~ experiment*weight)
#ANOVA Table (type II tests)
#             Effect DFn  DFd       F         p p<.05   ges
#1        experiment   2 6464 732.125 2.51e-287     * 0.185
#2            weight   1 6464  98.697  4.30e-23     * 0.015
#3 experiment:weight   2 6464  11.653  8.87e-06     * 0.004 ### SIGNIFICANCE OF THE INTERACTION VIOLATES THIS ASSUMPTION

# Loop through all M/F datasets that contain all 3 experiments (NOTE: instead of printing we could have the loop write to a file)
for(i in allexp_df_list){
  print(i)
  print(get(i) %>% anova_test(EE ~ experiment*weight))
}
  # A significant p-value for the experiment:weight interaction indicates that this assumption is violated


# ASSUMPTION 3 (VIOLATED) - Normality of residuals 
# Single example of this test
model <- lm(EE ~ weight + experiment, data = all_day_noOL_F)
model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit)
dim(model.metrics)
#subsamp <- model.metrics[sample(nrow(model.metrics), 4999), ] #shapiro_test can only examine <5000 samples, so randomly downsample
shapiro_test(model.metrics$.resid) #test normality of residuals using model.metrics or subsamp (depending on size)
  #variable         statistic  p.value
  #1 subsamp$.resid     0.896 6.83e-50 **** Significance indicates that the residuals are NOT normally distributed

#LOOP (NOTE: instead of printing we could have the loop write to a file)
for(i in allexp_df_list){
  print(i)
  model <- lm(EE ~ weight + experiment, data = get(i))
  model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit)
  if (dim(model.metrics)[1] < 5000){
    print(shapiro_test(model.metrics$.resid))
  } else {
    subsamp <- model.metrics[sample(nrow(model.metrics), 4999), ] #shapiro_test can only examine <5000 samples, so randomly downsample
    print(shapiro_test(subsamp$.resid))
  }
}


# ASSUMPTION 4 (VIOLATED) - homogeneity of variances (i.e., variance of residuals is equal across groups)
model.metrics %>% levene_test(.resid ~ experiment)

#LOOP (NOTE: instead of printing we could have the loop write to a file)
for(i in allexp_df_list){
  print(i)
  model <- lm(EE ~ weight + experiment, data = get(i))
  model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit)
  print(model.metrics %>% levene_test(.resid ~ experiment))
}
  #Significant p-values indicate a violation of this assumption - variances are not homogeneous
  # output will contain a warning message: "group coerced to factor" (that's ok because we want the different experiments [BL, hot, cold] to be treated as groups/factors)

# ASSUMPTION 5 (VIOLATED) - No outliers
# Outliers can be identified by examining the standardized (or studentized) residual, which is the residual divided by its estimated standard error
# Standardized residuals can be interpreted as the number of standard errors away from the regression line
# Check out residual values:
model.metrics$.std.resid
#Identify any observations with an absolute standardized residual greater than 3
model.metrics %>%  filter(abs(.std.resid) > 3) %>% as.data.frame()

for(i in allexp_df_list){
  print(i)
  model <- lm(EE ~ weight + experiment, data = get(i))
  model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit)
  print(model.metrics %>% filter(abs(.std.resid) > 3) %>% as.data.frame())
}

SUMMARY of results