---
tags: Rscripts
---
# Remove Outliers from raw data
### Author: Jocelyn P. Colella
Script identifies outliers (>3 sd from the mean) and creates a new csv with outliers excluded
Also manually corrects mis-typed weight prior to all downstream analyses.
```
library(ggplot2)
library(tidyverse)
library(car)
library(lubridate)
library(ggpubr)
library(ggpmisc)
library(gridExtra)
library(rlist)
library(ggplot2)
library(broom)
library(nlme)
library(rstatix)
library(compositions)
library(dplyr)
### Input data available through Matt's dropbox: https://www.dropbox.com/sh/swb9lmthcxgn3qj/AADtwfBu-y2KQCeQC-9RjodDa?dl=0
### Download: analysis_data_final.csv -> RENAME: analysis_data_final_23June2020.csv
setwd("~/Documents/Jocie/projects/Pero_respo/Analyses/data/")
data <- read_csv("analysis_data_final_2July2020.csv",
col_types = cols(Sex = col_character(),
EE = col_double(),
H2Omg = col_double(),
RQ = col_double(),
Animal_ID = col_character(),
Deg_C = col_double(),
weight = col_double(),
experiment = col_character(),
StartTime = col_character(), #col_time(format = "%H:%M:%S"), - changed for easy use of lubridate
SD_VCO2 = col_double(),
SD_VO2 = col_double(),
SD_H2Omg = col_double(),
VO2 = col_double(),
VCO2 = col_double(),
StartDate = col_date(format = "%Y-%m-%d"),
hour = col_integer()))
data$time_in_S <- period_to_seconds(hms(data$StartTime)) #Total seconds to reach that point in the day
#Replace mis-typed weight (29.980) with correct weight (19.980)
data$weight[data$weight == 29.980] <- 19.980
# Establish when each interval/transition starts and stops
#Daytime interval: hrs:8:00-21:00
daytime_interval <- period_to_seconds(hms("09:00:00")):period_to_seconds(hms("20:00:00"))
#Night time: hrs 22:00-5:00
nighttime_interval <- c((period_to_seconds(hms("21:00:01")):period_to_seconds(hms("24:59:59"))), #evening portion of 'nighttime'
(period_to_seconds(hms("00:00:00")):period_to_seconds(hms("06:00:00")))) #morning portion of 'nightitme'
#Morning transition (t1): 6:00-9:00
t1_interval <- period_to_seconds(hms("06:00:00")):period_to_seconds(hms("09:00:00"))
#Evening transition (t2): 20-21:00
t2_interval <- period_to_seconds(hms("20:00:00")):period_to_seconds(hms("21:00:00"))
#### Create data subsets for males and females for each experiment (day, night, baseline [BL]) and each time block [e.g., t1/transition1, daytime, t2/transistion2, nighttime])
# all males and females across all 3 experiments (BL, hot, cold)
all_M = data[data$Sex == 'M', ]
all_F = data[data$Sex == 'F', ]
# IDENTIFY AND REMOVE OUTLIERS
### MALES ###
count = 1
dependent_variables = c("EE", "RQ", "VO2", "VCO2", "H2Omg")
varname_list = c("EE_OLs","RQ_OLs","VO2_OLs","VCO2_OLs","H2O_OLs")
for (DV in dependent_variables){
print(DV)
model <- lm(as.formula(paste0(DV, " ~ weight + experiment")), data = all_M)
model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit)
summ <- model.metrics %>% filter(abs(.std.resid) > 3) %>% as.data.frame()
OL_list = c()
masterlist = c()
for (each_outlier_row in 1:nrow(summ)){
this_weight <- summ[each_outlier_row, "weight"]
this_DV <- summ[each_outlier_row, DV]
OL_list <- c(OL_list, (which(all_M$weight == this_weight & all_M[DV] == this_DV)))
}
print(OL_list)
#Assign list of outliers to specified variable lists (OL_list_EE, RQ, VO2, VCO2, mgH2O)
assign(paste("OL_list_", DV, sep = ""), OL_list)
}
masterlist <- c(OL_list_EE, OL_list_RQ, OL_list_VO2, OL_list_VCO2, OL_list_H2Omg)
masterlist_noDup <- unique(masterlist)
all_noOL_M <- all_M[-c(masterlist_noDup),]
dim(all_M) #6470
dim(all_noOL_M) #6255
write.csv(all_noOL_M, "all_noOL_M.csv", row.names = FALSE)
#### FEMALES ####
count = 1
dependent_variables = c("EE", "RQ", "VO2", "VCO2", "H2Omg")
varname_list = c("EE_OLs","RQ_OLs","VO2_OLs","VCO2_OLs","H2O_OLs")
for (DV in dependent_variables){
print(DV)
model <- lm(as.formula(paste0(DV, " ~ weight + experiment")), data = all_F)
model.metrics <- augment(model) %>% select(-.hat, -.sigma, -.fitted, -.se.fit)
summ <- model.metrics %>% filter(abs(.std.resid) > 3) %>% as.data.frame()
OL_list = c()
masterlist = c()
for (each_outlier_row in 1:nrow(summ)){
this_weight <- summ[each_outlier_row, "weight"]
this_DV <- summ[each_outlier_row, DV]
OL_list <- c(OL_list, (which(all_F$weight == this_weight & all_F[DV] == this_DV)))
}
print(OL_list)
assign(paste("OL_list_", DV, sep = ""), OL_list)
}
masterlist <- c(OL_list_EE, OL_list_RQ, OL_list_VO2, OL_list_VCO2, OL_list_H2Omg)
masterlist_noDup <- unique(masterlist)
all_noOL_F <- all_F[-c(masterlist_noDup),]
write.csv(all_noOL_F, "all_noOL_F.csv", row.names=FALSE)
```