# Emily and Craig RNA-Seq
## RNA-Seq
The below script was used to run a differential gene expression analysis in R.
To run the below script, open this page in Edit mode (pencil in top left corner), then copy and paste *everything* below this sentence into an Rmd file in RStudio.
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install(version = "3.10")
BiocManager::install(c("GenomicFeatures", "AnnotationDbi","edgeR"))
BiocManager::install("baySeq")
BiocManager::install("ALL", "topGO")
BiocManager::install("Rgraphviz")
rm(list=ls())
library(edgeR)
#library(baySeq)
library(statmod)
library(RColorBrewer)
library(pheatmap)
library(tidyverse)
#library(biomaRt)
library(biomartr)
library(topGO)
library(ALL)
```
## R Markdown
```{r Create design matrix and fit model, echo = FALSE}
setwd("Albertson")
x <- read.csv("rnaRawCounts_full.csv", header=TRUE,sep=",", row.names = 1)
head(x)
cpm.reads <- cpm(x)
#write.csv(cpm.reads, file="All_samples_cpm.csv", row.names=TRUE)
#-----------------Create data frame with info from target (samples) ----------------------
sample <- colnames(x)
targets <- as.data.frame(sample) %>% mutate(
species = c(rep("MZ", times=11), rep("TRC", times= 11)),
treatment = c(rep("benthic", times = 5), rep("pelagic", times = 6), rep("benthic", times = 5), rep("pelagic", times = 6))) %>%
mutate(across(!sample, as.factor))
#design <- targets %>% select(species, treatment) %>%
# mutate(species.MZ = ifelse(species == "MZ", 1, 0),
# species.TRC = ifelse(species=="TRC", 1, 0),
# treatment.Pelagic = ifelse(treatment == "pelagic", 1, 0),
# TRC.pelagic = ifelse(species == "MZ" & "treatment"))
group <- factor(paste(targets$species, targets$treatment, sep="."))
targets$group <- group
species <- targets$species
treatment <- targets$treatment
#-------------Model 1: Treat all groups separately: normalize counts table, add groups, and fit model -----------------------
# design matrix for pairwise comparisons
design <- model.matrix(~0 + group)
# design matrix for cross-species comparisons
design2 <- model.matrix(~0 + species)
#design matrix for cross-treatment comparisons
design3 <- model.matrix(~0 + treatment)
y <- DGEList(counts=x, group=group)
#Make MDS plot to ensure samples group together
points <- c(rep(19, times=5), rep(18, times = 6), rep(15, times=5), rep(16, times=6))
colors <- c(rep("dark blue", times=5), rep("blue", times = 6), rep("dark green", times=5), rep("green", times=6))
plotMDS(y, col=colors, pch=points)
legend("top", legend=levels(group), pch=points[c(1,6,12,18)], col=colors[c(1,6,12,18)], ncol=1)
# Calculate normalization factors
# Estimate dispersion - either generally with estimateDisp() or specifically with estimateGLMTagwiseDisp()
# glmQLFit() will fit the negative binomial model
# glmQLFTest() will test for differential expression
# glmTreat() allows us to test thresholded hypotheses under the glm framework.
# See page 31 of edgeR manual
# May want to fit an additive model to test for main effects, and a separate interaction model to test for interaction effects
y.full <- y
keep <- filterByExpr(y, design = design) #Filter out lowly expressed genes that will mess with comparisons
y <- y[keep,,keep.lib.sizes=FALSE]
y <- calcNormFactors(y)
y1 <- estimateDisp(y, design, robust=TRUE) #Estimate dispersion for pairwise comparisons
y2 <- estimateDisp(y, design2, robust = TRUE) #Estimate dispersion for cross-species comparisons
y3 <- estimateDisp(y, design3, robust = TRUE)
#Plot biological coefficient of variation
#plotBCV(y)
fit1 <- glmQLFit(y1, design, robust=TRUE) #Fit the model for treatments
fit2 <- glmQLFit(y2, design2, robust=TRUE) #Fit the model for species
fit3 <- glmQLFit(y3, design3, robust=TRUE) #Fit the model for treatment
#plotQLDisp(fit2)
#Make design matrix for comparisons
comparisons1 <- makeContrasts(MZ.benthic_v_TRC.benthic = groupMZ.benthic - groupTRC.benthic,
MZ.pelagic_v_TRC.pelagic = groupMZ.pelagic - groupTRC.pelagic,
MZ.benthic_v_MZ.pelagic = groupMZ.benthic - groupMZ.pelagic,
TRC.benthic_v_TRC.pelagic = groupTRC.benthic - groupTRC.pelagic,
MZ.pelagic_v_TRC.benthic = groupMZ.pelagic - groupTRC.benthic, #DEGs in preferred environment
MZ.benthic_v_TRC.pelagic = groupMZ.benthic - groupTRC.pelagic, #DEGs in non-preferred environment
species_env = (groupMZ.benthic - groupMZ.pelagic) - (groupTRC.benthic - groupTRC.pelagic), #IDs genes that respond differently based on the species when the environment is accounted for.
env_species = (groupMZ.benthic - groupTRC.benthic) - (groupMZ.pelagic - groupTRC.pelagic), #IDs genes that respo1nd differently based on the environment when species is accounted for. #No DEGs
pref_v_nonpref = (groupMZ.pelagic+groupTRC.benthic) - (groupMZ.benthic+groupTRC.pelagic),
levels=design) #No DEGs
comparisons2 <- makeContrasts(MZ_v_TRC = speciesMZ - speciesTRC,
levels = design2)
comparisons3 <- makeContrasts(benthic_v_pelagic = treatmentbenthic - treatmentpelagic,
levels = design3)
#Combine all comparisons into a single F statistic and p-value and extract top tags
comps <- glmQLFTest(fit1, contrast=comparisons1)
topTags(comps, n = 17525)
##Make heatmap of all comparisons##
logcpm <- cpm(y, log=TRUE) #calculate counts per million
rownames(logcpm) <- rownames(y$counts) #save gene names as rownames
colnames(logcpm) <- paste(y$samples$group, sep="_") #name samples by group
o <- order(comps$table$PValue) #order differentially expressed genes by p value and save row index value
logcpm <- logcpm[o[1:300],] #extract counts per million of the highest DEGs specified above
#Create heat map
pheatmap(logcpm, cluster_col=FALSE, scale="row", show_rownames = FALSE)
```
```{r Pairwise comparisons for model 1, echo=FALSE}
#------------------------Perform pairwise comparisons------------------------
##Comparison 1: MZ vs TRC: benthic
#Perform differential expression analysis
qlf_MZ.benthic_v_TRC.benthic <- glmQLFTest(fit1, contrast=comparisons1[,"MZ.benthic_v_TRC.benthic"])
#Make MD plot of comparisons
#plotMD(qlf_MZ.benthic_v_TRC.benthic, main = "MZ.benthic vs TRC.benthic")
#Show number of genes differentially expressed (up and down)
summary(decideTestsDGE(qlf_MZ.benthic_v_TRC.benthic))
#Extract all differentially expressed genes at FDR < 0.05 differentially expressed genes (DEGs) - set n high so the limit is the p_value cutoff
top_DEGs1.1 <- topTags(qlf_MZ.benthic_v_TRC.benthic, n=17525)
#Save top DEGs in a csv file
#write.csv(top_DEGs1.1, file="MZ.benthic_v_TRC.benthic.csv", row.names=TRUE)
##Comparison 2: MZ vs TRC: pelagic
#Perform differential expression analysis
qlf_MZ.pelagic_v_TRC.pelagic <- glmQLFTest(fit1, contrast=comparisons1[,"MZ.pelagic_v_TRC.pelagic"])
#Make MD plot of comparisons
#plotMD(qlf_MZ.pelagic_v_TRC.pelagic, main = "MZ.pelagic vs TRC.pelagic")
#Show number of genes differentially expressed (up and down)
summary(decideTestsDGE(qlf_MZ.pelagic_v_TRC.pelagic))
#Extract all differentially expressed genes at FDR < 0.05 differentially expressed genes (DEGs) - set n high so the limit is the p_value cutoff
top_DGEs1.2 <- topTags(qlf_MZ.pelagic_v_TRC.pelagic, n=17525)
#Save top DEGs in a csv file
#write.csv(top_DGEs1.2, file="MZ.pelagic_v_TRC.pelagic.csv", row.names=TRUE)
##Comparison 3: MZ: benthic vs pelagic
#Perform differential expression analysis
qlf_MZ.benthic_v_MZ.pelagic <- glmQLFTest(fit1, contrast=comparisons1[,"MZ.benthic_v_MZ.pelagic"])
#Make MD plot of comparisons
#plotMD(qlf_MZ.benthic_v_MZ.pelagic, main = "MZ.benthic vs MZ.pelagic")
#Show number of genes differentially expressed (up and down)
summary(decideTestsDGE(qlf_MZ.benthic_v_MZ.pelagic))
#Extract all differentially expressed genes at FDR < 0.05 differentially expressed genes (DEGs) - set n high so the limit is the p_value cutoff
top_DGEs1.3 <- topTags(qlf_MZ.benthic_v_MZ.pelagic, n=17525)
#Save top DEGs in a csv file
#write.csv(top_DGEs1.3, file="MZ.benthic_v_MZ.pelagic.csv", row.names=TRUE)
##Comparison 4: TRC: benthic vs pelagic
#Perform differential expression analysis
qlf_TRC.benthic_v_TRC.pelagic <- glmQLFTest(fit1, contrast=comparisons1[,"TRC.benthic_v_TRC.pelagic"])
#Make MD plot of comparisons
#plotMD(qlf_TRC.benthic_v_TRC.pelagic, main = "TRC.benthic vs TRC.pelagic")
#Show number of genes differentially expressed (up and down)
summary(decideTestsDGE(qlf_TRC.benthic_v_TRC.pelagic))
#Extract all differentially expressed genes at FDR < 0.05 differentially expressed genes (DEGs) - set n high so the limit is the p_value cutoff
top_DGEs1.4 <- topTags(qlf_TRC.benthic_v_TRC.pelagic, n=17525)
#Save top DEGs in a csv file
#write.csv(top_DGEs1.4, file="TRC.benthic_v_TRC.pelagic.csv", row.names=TRUE)
##Comparison 5: DEGs between species in preferred environment
##MZ.pelagic vs TRC benthic
#Perform differential expression analysis
qlf_MZ.pelagic_v_TRC.benthic <- glmQLFTest(fit1, contrast=comparisons1[,"MZ.pelagic_v_TRC.benthic"])
#Make MD plot of comparisons
#plotMD(qlf_MZ.pelagic_v_TRC.benthic, main = "MZ pelagic vs TRC benthic")
#Show number of genes differentially expressed (up and down)
summary(decideTestsDGE(qlf_MZ.pelagic_v_TRC.benthic))
#Extract all differentially expressed genes at FDR < 0.05 differentially expressed genes (DEGs) - set n high so the limit is the p_value cutoff
top_DGEs1.5 <- topTags(qlf_MZ.pelagic_v_TRC.benthic, n=17525)
#Save top DEGs in a csv file
#write.csv(top_DGEs1.5, file="MZ.pelagic_v_TRC.benthic.csv", row.names=TRUE)
##Comparison 6: DEGs between species in non-preferred environment
##MZ benthic vs TRC pelagic
#Perform differential expression analysis
qlf_MZ.benthic_v_TRC.pelagic <- glmQLFTest(fit1, contrast=comparisons1[,"MZ.benthic_v_TRC.pelagic"])
#Make MD plot of comparisons
#plotMD(qlf_MZ.benthic_v_TRC.pelagic, main = "MZ benthic vs TRC pelagic")
#Show number of genes differentially expressed (up and down)
summary(decideTestsDGE(qlf_MZ.benthic_v_TRC.pelagic))
#Extract all differentially expressed genes at FDR < 0.05 differentially expressed genes (DEGs) - set n high so the limit is the p_value cutoff
top_DGEs1.6 <- topTags(qlf_MZ.benthic_v_TRC.pelagic, n=17525)
#Save top DEGs in a csv file
#write.csv(top_DGEs1.6, file="MZ.benthic_v_TRC.pelagic.csv", row.names=TRUE)
##Comparison 7: DEGs in preferred environment regardless of species
##MZ.pelagic+TRC.benthic vs MZ.benthic+TRC.pelagic
#No DEGs
#Perform differential expression analysis
qlf_pref_v_nonpref <- glmQLFTest(fit1, contrast=comparisons1[,"pref_v_nonpref"])
#Make MD plot of comparisons
#plotMD(qlf_MZ.pelagic_v_TRC.benthic, main = "MZ pelagic vs TRC benthic")
#Show number of genes differentially expressed (up and down)
summary(decideTestsDGE(qlf_pref_v_nonpref))
#Extract all differentially expressed genes at FDR < 0.05 differentially expressed genes (DEGs) - set n high so the limit is the p_value cutoff
top_DGEs1.7 <- topTags(qlf_pref_v_nonpref, n=17525)
#Save top DEGs in a csv file
#write.csv(top_DGEs1.5, file="MZ.pelagic_v_TRC.benthic.csv", row.names=TRUE)
# ##Comparison 7: DEGs that respond differently to the environment based on the species.
# #Of the genes that are differentially expressed between environments in a given species, which are different between species
# ## (MZ benthic - MZ pelagic) - (TRC benthic - TRC pelagic)
# #Perform differential expression analysis
# qlf_species <- glmQLFTest(fit, contrast=comparisons[,"MZ_v_TRC.benthic"])
# #Make MD plot of comparisons
# plotMD(qlf_species, main = "MZ vs TRC")
# #Show number of genes differentially expressed (up and down)
# summary(decideTestsDGE(qlf_species))
# #Extract all differentially expressed genes at FDR < 0.05 differentially expressed genes (DEGs) - set n high so the limit is the p_value cutoff
# top_DGEs1.7 <- topTags(qlf_species)
#
# #Save top DEGs in a csv file
# #write.csv(top_DGEs4, file="Top_tags_ALG6_vs_ALG6.UGGT1.csv", row.names=TRUE)
#
#
# ##Comparison 8: DEGs that differ between the species when we account for environment
# #Of the genes that are differentially expressed between species in a given environment, which are different between environments?
# ## (MZ benthic - TRC benthic) - (MZ pelagic - TRC pelagic)
# #Perform differential expression analysis
# qlf_environment <- glmQLFTest(fit, contrast=comparisons[,"benthic_v_pelagic.MZ"])
# #Make MD plot of comparisons
# plotMD(qlf_environment, main = "Benthic vs pelagic")
# #Show number of genes differentially expressed (up and down)
# summary(decideTestsDGE(qlf_environment))
# #Extract all differentially expressed genes at FDR < 0.05 differentially expressed genes (DEGs) - set n high so the limit is the p_value cutoff
# top_DGEs1.8 <- topTags(qlf_environment)
#
# #Save top DEGs in a csv file
#write.csv(top_DGEs4, file="Top_tags_ALG6_vs_ALG6.UGGT1.csv", row.names=TRUE)
```
```{r Pairwise comparisons for collapsed models, echo = FALSE}
#------------------------Perform pairwise comparisons------------------------
##Comparison 1:
#Perform differential expression analysis
qlf_speciesMZ_v_TRC <- glmQLFTest(fit2,contrast=comparisons2[,"MZ_v_TRC"] )
#Make MD plot of comparisons
#plotMD(qlf_speciesMZ_v_TRC, main = "MZ vs TRC")
#Show number of genes differentially expressed (up and down)
summary(decideTestsDGE(qlf_speciesMZ_v_TRC)) #17525 genes total
#Extract all differentially expressed genes at FDR < 0.05 differentially expressed genes (DEGs) - set n high so the limit is the p_value cutoff
top_DGEs2.1 <- topTags(qlf_speciesMZ_v_TRC, n = 17525)
#Save top DEGs in a csv file
#write.csv(top_DGEs2.1, file="MZ_v_TRC_all.csv", row.names=TRUE)
##Comparison 2:
#Perform differential expression analysis
qlf_environment_benthic_v_pelagic <- glmQLFTest(fit3,contrast=comparisons3[,"benthic_v_pelagic"] )
#Make MD plot of comparisons
#plotMD(qlf_environment_benthic_v_pelagic, main = "MZ vs TRC")
#Show number of genes differentially expressed (up and down)
summary(decideTestsDGE(qlf_environment_benthic_v_pelagic))
#Extract all differentially expressed genes at FDR < 0.05 differentially expressed genes (DEGs) - set n high so the limit is the p_value cutoff
top_DGEs2.2 <- topTags(qlf_environment_benthic_v_pelagic, n=17525)
#Save top DEGs in a csv file
#write.csv(top_DGEs1, file="Top_tags_ALG6_vs_ALG6.UGGT1.csv", row.names=TRUE)
```
# Enrichment analysis
## GO term overview
See the [GO overview](http://geneontology.org/docs/ontology-documentation/), including description of the three (separate) categories.
**Molecular function**: refers to the molecular-level activities performed by gene products (*e.g. catalytic activity*)
**Cellular component**: The locations in which a gene product performs a function, either cellular compartments (*e.g. mitochondrion*) or macromolecular complexes (*e.g. ribosome*). Broadly, refers to cellular anatomy, rather than a process.
**Biological process**: The larger biological processes accomplished by multiple molecular activities (*e.g. DNA repair (broad) or glucose transmembrane transport (specific)*). In other words, the specific molecular function will be under the umbrella of this.
```{r enrichment analysis setup}
#Set up for all scripts below
p <- 0.05
topDiffGenes <- function(allScore){
return(allScore < p)
}
####------------Species v species DGE for filtering-----------####
##Comparison 2.1: Species: MZ vs TRC (environment collapsed)
qlf_speciesMZ_v_TRC <- glmQLFTest(fit2,contrast=comparisons2[,"MZ_v_TRC"])
#Show number of genes differentially expressed (up and down)
summary(decideTestsDGE(qlf_speciesMZ_v_TRC))
DEGs_species <- as.data.frame(topTags(qlf_speciesMZ_v_TRC, n = 17525, p.value = p))
DEGs_species2 <- DEGs_species %>% rownames_to_column(var = "gene")
#Create vector for all genes
all_gene_names <- row.names(y$counts)
GO_all <- biomartr::getGO(organism = "Maylandia zebra",
genes = all_gene_names,
filters = "ensembl_gene_id") %>%
dplyr::rename(gene = ensembl_gene_id,
GO_term = goslim_goa_description,
GO_accession = goslim_goa_accession)
#Remove genes with no GO terms - zero of those
GO_all_noNA <- GO_all %>% filter(is.na(GO_term) == FALSE)
#allGenes_vec <- all_genes_preferred$FDR
#names(allGenes_vec_pref) <- all_genes_preferred$gene
#Count number of times each gene occurs ie number of associated GO terms
num.genes_all <- GO_all_noNA %>% group_by(gene) %>%
dplyr::summarize(num = n())
#Make list to populate with GO accession numbers and populate first element
all.genes_list <- list()
all.genes_list[[1]] <- c(GO_all_noNA$GO_accession[1:num.genes_all$num[1]])
#Loop over remainder of elements and store the GO terms in the list
##MZ
for(i in 2:nrow(num.genes_all)){
start <- sum(num.genes_all$num[1:i-1])+1 #Starting point for GO terms
end <- start + num.genes_all$num[i] - 1 #Ending point for go terms
all.genes_list[[i]] <- c(GO_all_noNA$GO_accession[start:end]) #Extract GO terms from index specified above
}
#Add gene names to list
names(all.genes_list) <- num.genes_all$gene
####---------Filter genes that are DE between species-----------####
#Filter out genes from full gene list that are DE between species
all_gene_names.filt <- all_gene_names[!all_gene_names %in% DEGs_species2$gene]
GO_all.filt <- GO_all_noNA %>% filter(gene %in% all_gene_names.filt)
num.genes_all.filt <- GO_all.filt %>% group_by(gene) %>%
dplyr::summarize(num = n())
#Make list to populate with GO accession numbers and populate first element
all.genes_list.filt <- list()
all.genes_list.filt[[1]] <- c(GO_all.filt$GO_accession[1:num.genes_all.filt$num[1]])
#Loop over remainder of elements and store the GO terms in the list
for(i in 2:nrow(num.genes_all.filt)){
start <- sum(num.genes_all.filt$num[1:i-1])+1 #Starting point for GO terms
end <- start + num.genes_all.filt$num[i] - 1 #Ending point for go terms
all.genes_list.filt[[i]] <- c(GO_all.filt$GO_accession[start:end]) #Extract GO terms from index specified above
}
#Add gene names to list
names(all.genes_list.filt) <- num.genes_all.filt$gene
```
## Enrichment analysis with topGO
Here's a useful website: http://avrilomics.blogspot.com/2015/07/using-topgo-to-test-for-go-term.html
```{r enrichment analysis for preferred environment}
##Comparison 5: DEGs in preferred environment
##MZ.pelagic vs TRC benthic
qlf_MZ.pelagic_v_TRC.benthic <- glmQLFTest(fit1, contrast=comparisons1[,"MZ.pelagic_v_TRC.benthic"])
#Show number of genes differentially expressed (up and down)
summary(decideTestsDGE(qlf_MZ.pelagic_v_TRC.benthic))
#Save all DEGs
DEGs.in.preferred.all <- as.data.frame(topTags(qlf_MZ.pelagic_v_TRC.benthic, n=17525, p.value = p)) %>%
rownames_to_column(var = "gene") %>%
mutate(species_env = ifelse(logFC > 0, "MZ_pelagic", "TRC_benthic"))
#Filter out genes that are DE between species
DEGs.in.preferred.all_filt <- DEGs.in.preferred.all %>%
anti_join(DEGs_species2, by = "gene") %>%
dplyr::select(gene, logFC, FDR, species_env)
####Get GO terms####
# #Change here to filtered dataset if wanting to ignore genes that are DE between species
# GO_preferred <- biomartr::getGO(organism = "Maylandia zebra",
# genes = DEGs.in.preferred.all_filt$gene,
# filters = "ensembl_gene_id")
#
# #Rename
# GO_preferred <- GO_preferred %>% dplyr::rename(gene = ensembl_gene_id, GO_term = goslim_goa_description, GO_accession = goslim_goa_accession)
#
# #Combine with dataframe of all (filtered) DEGs
# preferred_full.tbl <- GO_preferred %>% full_join(DEGs.in.preferred.all_filt, by = "gene") #%>%
# #dplyr::select(gene, GO_term, GO_accession, logFC, FDR, species_env)
#
# #% of genes for which there is at least one GO term
# GO_preferred %>% distinct(gene) %>%
# summarize(percent_genes_w_GO = round(n()/length(DEGs.in.preferred.all_filt$gene)*100, 1))
####--------------Separate by species_env----------------------####
#Separate out DEGs from analysis by species/environment
#Add "_all" to preferred_full.tbl to use all genes instead of filtered dataset
preferred_MZ <- DEGs.in.preferred.all_filt %>%
filter(species_env == "MZ_pelagic", FDR < p)
preferred_TRC <- DEGs.in.preferred.all_filt %>%
filter(species_env == "TRC_benthic", FDR < p)
#Make list of gene names for topGO object
MZ_DEG_names <- c(preferred_MZ$gene) #Convert to vector
MZ_geneList <- factor(as.integer(all_gene_names.filt %in% MZ_DEG_names))
names(MZ_geneList) <- all_gene_names.filt
TRC_DEG_names <- c(preferred_TRC$gene) #Convert to vector
TRC_geneList <- factor(as.integer(all_gene_names.filt %in% TRC_DEG_names))
names(TRC_geneList) <- all_gene_names.filt
#Create topGO object using whole list of genes
#Here, I've already filtered for DEGs, so I'm passing a binary vector to the function that marks genes as interesting (ie DE) or not, and then maps them to the GO terms.
GOdata_pref.MZ <- new("topGOdata",
description = "GO enrichment analysis for preferred environment",
ontology = "BP", #The 'ontology' argument can be 'BP' (biological process), 'MF' (molecular function), or 'CC' (cellular component)
allGenes = MZ_geneList, #this is the 'gene universe'
annot = annFUN.gene2GO, gene2GO = all.genes_list.filt)
GOdata_pref.TRC <- new("topGOdata",
description = "GO enrichment analysis for preferred environment",
ontology = "BP", #The 'ontology' argument can be 'BP' (biological process), 'MF' (molecular function), or 'CC' (cellular component)
allGenes = TRC_geneList, #this is the 'gene universe'
annot = annFUN.gene2GO, gene2GO = all.genes_list.filt)
#Run the KS test using a hybrid of the elim and weighted algorithm
#This is the default in topGO and seems like a solid default
result.Fis_pref.MZ <- runTest(GOdata_pref.MZ, algorithm = "weight01", statistic = "fisher")
result.Fis_pref.MZ
result.Fis_pref.TRC <- runTest(GOdata_pref.TRC, algorithm = "weight01", statistic = "fisher")
result.Fis_pref.TRC
table_pref.MZ <- GenTable(GOdata_pref.MZ,
Fis = result.Fis_pref.MZ, #Results from KS test
topNodes = length(result.Fis_pref.MZ@score), #Number of top GO terms to include in the table
numChar = 50 #Number of characters to truncate the GO term definition to
) %>% rename(P_value = Fis)
table_pref.TRC <- GenTable(GOdata_pref.TRC,
Fis = result.Fis_pref.TRC, #Results from KS test
topNodes = length(result.Fis_pref.TRC@score), #Number of top GO terms to include in the table
numChar = 50 #Number of characters to truncate the GO term definition to
) %>% rename(P_value = Fis)
# showSigOfNodes(GOdata_pref.all, score(resultKS_pref.all), firstSigNodes = 5,
# useInfo ='all',#additional info to be plotted to each node
# reverse=T, #graph it upside down or not; default is T
# wantedNodes = NULL, #can specify nodes you want to be able to find and they will be colored so you can see them easily
# type=NULL,#can change to plot pie charts supposedly but no options are listed to specify this
# showEdges=T,#default is T; if F removed arrow showing relationships
# sigForAll=T) #score/p-value of all nodes in the DAG is shown, otherwise only the score for the sigNodes; default is T
#Calculate number of significant GO terms for viz
(MZ.pref_sigNodes <- table_pref.MZ %>% filter(as.numeric(P_value) < p) %>%
summarize(n()))
(TRC.pref_sigNodes <- table_pref.TRC %>% filter(as.numeric(P_value) < p) %>%
summarize(n()))
printGraph(GOdata_pref.MZ, result.Fis_pref.MZ, firstSigNodes = 2, fn.prefix = "~/R/R_working_dir/Collaborations/Albertson/Figures/Preferred.env_MZ.filt_BP.fisher", useInfo = "all", pdfSW = TRUE)
printGraph(GOdata_pref.TRC, result.Fis_pref.TRC, firstSigNodes = 3, fn.prefix = "~/R/R_working_dir/Collaborations/Albertson/Figures/Preferred.env_TRC.filt_BP.fisher", useInfo = "all", pdfSW = TRUE)
```
```{r enrichment analysis for non-preferred environment}
##Comparison 5: DEGs in non-preferred environment
##MZ.pelagic vs TRC benthic
qlf_MZ.benthic_v_TRC.pelagic <- glmQLFTest(fit1, contrast=comparisons1[,"MZ.benthic_v_TRC.pelagic"])
#Show number of genes differentially expressed (up and down)
summary(decideTestsDGE(qlf_MZ.benthic_v_TRC.pelagic))
#Save DEGs
DEGs_nonpreferred <- as.data.frame(topTags(qlf_MZ.benthic_v_TRC.pelagic, n=17525, p.value = p)) %>%
rownames_to_column(var = "gene") %>%
mutate(species_env = ifelse(logFC > 0, "MZ_benthic", "TRC_pelagic"))
#Filter out genes that are DE between species
DEGs.in.nonpreferred.all_filt <- DEGs_nonpreferred %>%
anti_join(DEGs_species2, by = "gene") %>%
mutate(species_env = ifelse(logFC > 0, "MZ_benthic", "TRC_pelagic")) %>%
dplyr::select(gene, logFC, FDR, species_env)
####Get GO terms####
# GO_nonpreferred <- biomartr::getGO(organism = "Maylandia zebra",
# genes = DEGs.in.nonpreferred.all_filt$gene,
# filters = "ensembl_gene_id")
#
# GO_nonpreferred <- GO_nonpreferred %>% dplyr::rename(gene = ensembl_gene_id, GO_term = goslim_goa_description, GO_accession = goslim_goa_accession)
#
# #Combine with dataframe of all (filtered) DEGs
# nonpreferred_full.tbl <- GO_nonpreferred %>% full_join(DEGs_nonpreferred, by = "gene") %>%
# dplyr::select(gene, GO_term, GO_accession, logFC, FDR, species_env)
#
# #% of genes for which there is at least one GO term
# GO_nonpreferred %>% distinct(gene) %>%
# summarize(percent_genes_w_GO = round(n()/length(DEGs.in.nonpreferred.all_filt$gene)*100, 1))
####--------------Separate by species_env----------------------####
#Not sure if I'm going to use this ... update: I am
#Add "_all" to nonpreferred_full.tbl to use all genes rather than filtered dataset
nonpreferred_MZ <- DEGs.in.nonpreferred.all_filt %>%
filter(species_env == "MZ_benthic", FDR < p)
nonpreferred_TRC <- DEGs.in.nonpreferred.all_filt %>%
filter(species_env == "TRC_pelagic", FDR < p)
####Count GO terms####
# nonpreferred_MZ_GOterm.sum <- nonpreferred_MZ %>% group_by(GO_term) %>%
# summarize(MZ_benthic = n()) %>%
# arrange(desc(MZ_benthic))
#
# nonpreferred_TRC_GOterm.sum <- nonpreferred_TRC %>% group_by(GO_term) %>%
# summarize(TRC_pelagic = n()) %>%
# arrange(desc(TRC_pelagic))
#
# nonpreferred_all_GOterm.sum <- nonpreferred_MZ_GOterm.sum %>% full_join(nonpreferred_TRC_GOterm.sum, by = "GO_term")
#
# #Write output files
# write.csv(nonpreferred_all_GOterm.sum, file = "output/nonpreferred_MZ.benthic.v.TRC.pelagic_all_GOterm.sum.csv", row.names = FALSE)
#Remove genes with no GO terms
# nonpref_MZ_GO <- nonpreferred_MZ %>% filter(is.na(GO_term) == FALSE) %>%
# group_by(gene) %>%
# select(-c(GO_term, logFC, FDR, species_env))
#
# nonpref_TRC_GO <- nonpreferred_TRC %>% filter(is.na(GO_term) == FALSE) %>%
# group_by(gene) %>%
# select(-c(GO_term, logFC, FDR, species_env))
####make topGO object####
#Make vector of interesting genes for topGO object
MZ_DEG_names_nonpref <- c(nonpreferred_MZ$gene) #Convert to vector
MZ_geneList_nonpref <- factor(as.integer(all_gene_names.filt %in% MZ_DEG_names_nonpref))
names(MZ_geneList_nonpref) <- all_gene_names.filt
TRC_DEG_names_nonpref <- c(nonpreferred_TRC$gene) #Convert to vector
TRC_geneList_nonpref <- factor(as.integer(all_gene_names.filt %in% TRC_DEG_names_nonpref))
names(TRC_geneList_nonpref) <- all_gene_names.filt
#Create topGO object using whole list of genes
GOdata_nonpref.MZ <- new("topGOdata",
description = "GO enrichment analysis for nonnonpreferred environment",
ontology = "BP", #The 'ontology' argument can be 'BP' (biological process), 'MF' (molecular function), or 'CC' (cellular component)
allGenes = MZ_geneList_nonpref, #this is the 'gene universe'
annot = annFUN.gene2GO, gene2GO = all.genes_list.filt)
GOdata_nonpref.TRC <- new("topGOdata",
description = "GO enrichment analysis for nonnonpreferred environment",
ontology = "BP", #The 'ontology' argument can be 'BP' (biological process), 'MF' (molecular function), or 'CC' (cellular component)
allGenes = TRC_geneList_nonpref, #this is the 'gene universe'
annot = annFUN.gene2GO, gene2GO = all.genes_list.filt)
#Run the KS test using a hybrid of the elim and weighted algorithm
#This is the default in topGO and seems like a solid default
result.Fis_nonpref.MZ <- runTest(GOdata_nonpref.MZ, algorithm = "weight01", statistic = "fisher")
result.Fis_nonpref.MZ
result.Fis_nonpref.TRC <- runTest(GOdata_nonpref.TRC, algorithm = "weight01", statistic = "fisher")
result.Fis_nonpref.TRC
table_nonpref.MZ <- GenTable(GOdata_nonpref.MZ,
Fis = result.Fis_nonpref.MZ, #Results from Fis test
topNodes = length(result.Fis_nonpref.MZ@score), #Number of top GO terms to include in the table
numChar = 50 #Number of characters to truncate the GO term definition to
) %>% rename(P_value = Fis)
table_nonpref.TRC <- GenTable(GOdata_nonpref.TRC,
Fis = result.Fis_nonpref.TRC, #Results from Fis test
topNodes = length(result.Fis_nonpref.TRC@score), #Number of top GO terms to include in the table
numChar = 50 #Number of characters to truncate the GO term definition to
) %>% rename(P_value = Fis)
# showSigOfNodes(GOdata_nonpref.all, score(resultFis_nonpref.all), firstSigNodes = 5,
# useInfo ='all',#additional info to be plotted to each node
# reverse=T, #graph it upside down or not; default is T
# wantedNodes = NULL, #can specify nodes you want to be able to find and they will be colored so you can see them easily
# type=NULL,#can change to plot pie charts supposedly but no options are listed to specify this
# showEdges=T,#default is T; if F removed arrow showing relationships
# sigForAll=T) #score/p-value of all nodes in the DAG is shown, otherwise only the score for the sigNodes; default is T
#Calculate number of significant GO terms for viz
(MZ.nonpref_sigNodes <- table_nonpref.MZ %>% filter(as.numeric(P_value) < p) %>%
summarize(n()))
(TRC.nonpref_sigNodes <- table_nonpref.TRC %>% filter(as.numeric(P_value) < p) %>%
summarize(n()))
printGraph(GOdata_nonpref.MZ, result.Fis_nonpref.MZ, firstSigNodes = 0, fn.prefix = "~/R/R_working_dir/Collaborations/Albertson/Figures/nonpreferred.env_MZ.filt_BP.fisher", useInfo = "all", pdfSW = TRUE)
printGraph(GOdata_nonpref.TRC, result.Fis_nonpref.TRC, firstSigNodes = 1, fn.prefix = "~/R/R_working_dir/Collaborations/Albertson/Figures/nonpreferred.env_TRC.filt_BP.fisher", useInfo = "all", pdfSW = TRUE)
```
```{r enrichment analysis for pelagic environment}
#------------Using filtered dataset ie no species x species DEGs--------
##Comparison 5: DEGs in preferred environment
##MZ.pelagic vs TRC benthic
qlf_MZ.pelagic_v_TRC.pelagic <- glmQLFTest(fit1, contrast=comparisons1[,"MZ.pelagic_v_TRC.pelagic"])
#Show number of genes differentially expressed (up and down)
summary(decideTestsDGE(qlf_MZ.pelagic_v_TRC.pelagic))
#Make filtered dataset for GO term analysis
#Save DEGs
DEGs_pelagic <- as.data.frame(topTags(qlf_MZ.pelagic_v_TRC.pelagic, n=17525, p.value = p)) %>%
rownames_to_column(var = "gene") %>%
mutate(species_env = ifelse(logFC > 0, "MZ_pelagic", "TRC_pelagic"))
#Filter out genes that are DE between species
DEGs.in.pelagic.all_filt <- DEGs_pelagic %>%
anti_join(DEGs_species2, by = "gene") %>%
mutate(species_env = ifelse(logFC > 0, "MZ_pelagic", "TRC_pelagic")) %>%
dplyr::select(gene, logFC, FDR, species_env)
####Get GO terms####
# GO_pelagic <- biomartr::getGO(organism = "Maylandia zebra",
# genes = DEGs.in.pelagic.all_filt$gene,
# filters = "ensembl_gene_id")
#
# GO_pelagic <- GO_pelagic %>% dplyr::rename(gene = ensembl_gene_id, GO_term = goslim_goa_description, GO_accession = goslim_goa_accession)
#
# #Combine with dataframe of all (filtered) DEGs
# pelagic_full.tbl <- GO_pelagic %>% full_join(DEGs_pelagic, by = "gene") %>%
# dplyr::select(gene, GO_term, GO_accession, logFC, FDR, species_env)
#
# #% of genes for which there is at least one GO term
# GO_pelagic %>% distinct(gene) %>%
# summarize(percent_genes_w_GO = round(n()/length(DEGs.in.pelagic.all_filt$gene)*100, 1))
#--------------Separate by species_env----------------------
#Not sure if I'm going to use this ... UPDATE: I am
#Add "_all" to end of pelagic_full.tbl to use full dataset instead of filtered
pelagic_MZ <- DEGs.in.pelagic.all_filt %>%
filter(species_env == "MZ_pelagic", FDR < p)
pelagic_TRC <- DEGs.in.pelagic.all_filt %>%
filter(species_env == "TRC_pelagic", FDR < p)
#Count GO terms
# pelagic_MZ_GOterm.sum <- pelagic_MZ %>% group_by(GO_term) %>%
# summarize(MZ_pelagic = n()) %>%
# arrange(desc(MZ_pelagic))
#
# pelagic_TRC_GOterm.sum <- pelagic_TRC %>% group_by(GO_term) %>%
# summarize(TRC_benthic = n()) %>%
# arrange(desc(TRC_benthic))
#
# pelagic_all_GOterm.sum <- pelagic_MZ_GOterm.sum %>% full_join(pelagic_TRC_GOterm.sum, by = "GO_term")
#
# #Write output files
# write.csv(pelagic_all_GOterm.sum, file = "output/pelagic_MZ.pelagic.v.TRC.benthic_all_GOterm.sum.csv", row.names = FALSE)
#Make lists of interesting genes
MZ_DEG_names_pelagic <- c(pelagic_MZ$gene) #Convert to vector
MZ_geneList_pelagic <- factor(as.integer(all_gene_names.filt %in% MZ_DEG_names_pelagic))
names(MZ_geneList_pelagic) <- all_gene_names.filt
TRC_DEG_names_pelagic <- c(pelagic_TRC$gene) #Convert to vector
TRC_geneList_pelagic <- factor(as.integer(all_gene_names.filt %in% TRC_DEG_names_pelagic))
names(TRC_geneList_pelagic) <- all_gene_names.filt
#Create topGO object using lists of DEGs
GOdata_pelagic.MZ <- new("topGOdata",
description = "GO enrichment analysis for pelagic environment",
ontology = "BP", #The 'ontology' argument can be 'BP' (biological process), 'MF' (molecular function), or 'CC' (cellular component)
allGenes = MZ_geneList_pelagic, #this is the 'gene universe'
annot = annFUN.gene2GO, gene2GO = all.genes_list.filt)
GOdata_pelagic.TRC <- new("topGOdata",
description = "GO enrichment analysis for pelagic environment",
ontology = "BP", #The 'ontology' argument can be 'BP' (biological process), 'MF' (molecular function), or 'CC' (cellular component)
allGenes = TRC_geneList_pelagic, #this is the 'gene universe'
annot = annFUN.gene2GO, gene2GO = all.genes_list.filt)
#Run the KS test using a hybrid of the elim and weighted algorithm
#This is the default in topGO and seems like a solid default
result.Fis_pelagic.MZ <- runTest(GOdata_pelagic.MZ, algorithm = "weight01", statistic = "fisher")
result.Fis_pelagic.MZ
result.Fis_pelagic.TRC <- runTest(GOdata_pelagic.TRC, algorithm = "weight01", statistic = "fisher")
result.Fis_pelagic.TRC
table_pelagic.MZ <- GenTable(GOdata_pelagic.MZ,
Fis = result.Fis_pelagic.MZ, #Results from Fis test
topNodes = length(result.Fis_pelagic.MZ@score), #Number of top GO terms to include in the table
numChar = 50 #Number of characters to truncate the GO term definition to
) %>% rename(P_value = Fis)
table_pelagic.TRC <- GenTable(GOdata_pelagic.TRC,
Fis = result.Fis_pelagic.TRC, #Results from Fis test
topNodes = length(result.Fis_pelagic.TRC@score), #Number of top GO terms to include in the table
numChar = 50 #Number of characters to truncate the GO term definition to
) %>% rename(P_value = Fis)
# showSigOfNodes(GOdata_pelagic.all, score(resultFis_pelagic.all), firstSigNodes = 5,
# useInfo ='all',#additional info to be plotted to each node
# reverse=T, #graph it upside down or not; default is T
# wantedNodes = NULL, #can specify nodes you want to be able to find and they will be colored so you can see them easily
# type=NULL,#can change to plot pie charts supposedly but no options are listed to specify this
# showEdges=T,#default is T; if F removed arrow showing relationships
# sigForAll=T) #score/p-value of all nodes in the DAG is shown, otherwise only the score for the sigNodes; default is T
#Calculate number of significant GO terms for viz
(MZ.pelagic_sigNodes <- table_pelagic.MZ %>% filter(as.numeric(P_value) < p) %>%
summarize(n()))
(TRC.pelagic_sigNodes <- table_pelagic.TRC %>% filter(as.numeric(P_value) < p) %>%
summarize(n()))
printGraph(GOdata_pelagic.MZ, result.Fis_pelagic.MZ, firstSigNodes = 3, fn.prefix = "~/R/R_working_dir/Collaborations/Albertson/Figures/pelagic.env_MZ.filt_BP.fisher", useInfo = "all", pdfSW = TRUE)
printGraph(GOdata_pelagic.TRC, result.Fis_pelagic.TRC, firstSigNodes = 0, fn.prefix = "~/R/R_working_dir/Collaborations/Albertson/Figures/pelagic.env_TRC.filt_BP.fisher", useInfo = "all", pdfSW = TRUE)
```
```{r enrichment analysis for benthic environment}
#------------Using filtered dataset ie no species x species DEGs--------
##Comparison 5: DEGs in preferred environment
##MZ.benthic vs TRC benthic
qlf_MZ.benthic_v_TRC.benthic <- glmQLFTest(fit1, contrast=comparisons1[,"MZ.benthic_v_TRC.benthic"])
#Show number of genes differentially expressed (up and down)
summary(decideTestsDGE(qlf_MZ.benthic_v_TRC.benthic))
#Run GO analysis on filtered dataset
#Save DEGs
DEGs_benthic <- as.data.frame(topTags(qlf_MZ.benthic_v_TRC.benthic, n=17525, p.value = p)) %>%
rownames_to_column(var = "gene") %>%
mutate(species_env = ifelse(logFC > 0, "MZ_benthic", "TRC_benthic"))
#Filter out genes that are DE between species
DEGs.in.benthic.all_filt <- DEGs_benthic %>%
anti_join(DEGs_species2, by = "gene") %>%
mutate(species_env = ifelse(logFC > 0, "MZ_benthic", "TRC_benthic")) %>%
dplyr::select(gene, logFC, FDR, species_env)
####Get GO terms####
# GO_benthic <- biomartr::getGO(organism = "Maylandia zebra",
# genes = DEGs.in.benthic.all_filt$gene,
# filters = "ensembl_gene_id")
#
# GO_benthic <- GO_benthic %>% dplyr::rename(gene = ensembl_gene_id, GO_term = goslim_goa_description, GO_accession = goslim_goa_accession)
#
# #Combine with dataframe of all (filtered) DEGs
# benthic_full.tbl <- GO_benthic %>% full_join(DEGs_benthic, by = "gene") %>%
# dplyr::select(gene, GO_term, GO_accession, logFC, FDR, species_env)
#
# #% of genes for which there is at least one GO term
# GO_benthic %>% distinct(gene) %>%
# summarize(percent_genes_w_GO = round(n()/length(DEGs.in.benthic.all_filt$gene)*100, 1))
#--------------Separate by species_env----------------------
#Not sure if I'm going to use this ...
#Add "_all" at end of benthic_full.tbl if wanting to use full dataset
benthic_MZ <- DEGs.in.benthic.all_filt %>%
filter(species_env == "MZ_benthic", FDR < p)
benthic_TRC <- DEGs.in.benthic.all_filt %>%
filter(species_env == "TRC_benthic", FDR < p)
#Count GO terms
# benthic_MZ_GOterm.sum <- benthic_MZ %>% group_by(GO_term) %>%
# summarize(MZ_benthic = n()) %>%
# arrange(desc(MZ_benthic))
#
# benthic_TRC_GOterm.sum <- benthic_TRC %>% group_by(GO_term) %>%
# summarize(TRC_benthic = n()) %>%
# arrange(desc(TRC_benthic))
#
# benthic_all_GOterm.sum <- benthic_MZ_GOterm.sum %>% full_join(benthic_TRC_GOterm.sum, by = "GO_term")
#
# #Write output files
# write.csv(benthic_all_GOterm.sum, file = "output/benthic_MZ.benthic.v.TRC.benthic_all_GOterm.sum.csv", row.names = FALSE)
MZ_DEG_names_benthic <- c(benthic_MZ$gene) #Convert to vector
MZ_geneList_benthic <- factor(as.integer(all_gene_names.filt %in% MZ_DEG_names_benthic))
names(MZ_geneList_benthic) <- all_gene_names.filt
TRC_DEG_names_benthic <- c(benthic_TRC$gene) #Convert to vector
TRC_geneList_benthic <- factor(as.integer(all_gene_names.filt %in% TRC_DEG_names_benthic))
names(TRC_geneList_benthic) <- all_gene_names.filt
#Create topGO object
GOdata_benthic.MZ <- new("topGOdata",
description = "GO enrichment analysis for benthic environment",
ontology = "BP", #The 'ontology' argument can be 'BP' (biological process), 'MF' (molecular function), or 'CC' (cellular component)
allGenes = MZ_geneList_benthic, #this is the 'gene universe'
annot = annFUN.gene2GO, gene2GO = all.genes_list.filt)
GOdata_benthic.TRC <- new("topGOdata",
description = "GO enrichment analysis for benthic environment",
ontology = "BP", #The 'ontology' argument can be 'BP' (biological process), 'MF' (molecular function), or 'CC' (cellular component)
allGenes = TRC_geneList_benthic, #this is the 'gene universe'
annot = annFUN.gene2GO, gene2GO = all.genes_list.filt)
#Run the KS test using a hybrid of the elim and weighted algorithm
#This is the default in topGO and seems like a solid default
result.Fis_benthic.MZ <- runTest(GOdata_benthic.MZ, algorithm = "weight01", statistic = "fisher")
result.Fis_benthic.MZ
result.Fis_benthic.TRC <- runTest(GOdata_benthic.TRC, algorithm = "weight01", statistic = "fisher")
result.Fis_benthic.TRC
table_benthic.MZ <- GenTable(GOdata_benthic.MZ,
Fis = result.Fis_benthic.MZ, #Results from Fis test
topNodes = length(result.Fis_benthic.MZ@score), #Number of top GO terms to include in the table
numChar = 50 #Number of characters to truncate the GO term definition to
) %>% rename(P_value = Fis)
table_benthic.TRC <- GenTable(GOdata_benthic.TRC,
Fis = result.Fis_benthic.TRC, #Results from Fis test
topNodes = length(result.Fis_benthic.TRC@score), #Number of top GO terms to include in the table
numChar = 50 #Number of characters to truncate the GO term definition to
) %>% rename(P_value = Fis)
# showSigOfNodes(GOdata_benthic.all, score(resultFis_benthic.all), firstSigNodes = 5,
# useInfo ='all',#additional info to be plotted to each node
# reverse=T, #graph it upside down or not; default is T
# wantedNodes = NULL, #can specify nodes you want to be able to find and they will be colored so you can see them easily
# type=NULL,#can change to plot pie charts supposedly but no options are listed to specify this
# showEdges=T,#default is T; if F removed arrow showing relationships
# sigForAll=T) #score/p-value of all nodes in the DAG is shown, otherwise only the score for the sigNodes; default is T
(MZ.benthic_sigNodes <- table_benthic.MZ %>% filter(as.numeric(P_value) < p) %>%
summarize(n()))
(TRC.benthic_sigNodes <- table_benthic.TRC %>% filter(as.numeric(P_value) < p) %>%
summarize(n()))
printGraph(GOdata_benthic.MZ, result.Fis_benthic.MZ, firstSigNodes = 0, fn.prefix = "~/R/R_working_dir/Collaborations/Albertson/Figures/benthic.env_MZ.filt_BP.fisher", useInfo = "all", pdfSW = TRUE)
printGraph(GOdata_benthic.TRC, result.Fis_benthic.TRC, firstSigNodes = 0, fn.prefix = "~/R/R_working_dir/Collaborations/Albertson/Figures/benthic.env_TRC.filt_BP.fisher", useInfo = "all", pdfSW = TRUE)
```
```{r species vs species enrichment analysis}
qlf_speciesMZ_v_TRC <- glmQLFTest(fit2,contrast=comparisons2[,"MZ_v_TRC"] )
summary(decideTestsDGE(qlf_speciesMZ_v_TRC)) #17525 genes total
############Making topGO object for MBuna data####################
##Save all genes regardless of FDR
all_genes_species <- as.data.frame(topTags(qlf_speciesMZ_v_TRC, n=17525)) %>% #Save all genes regardless of FDR
rownames_to_column(var = "gene") %>%
mutate(species_env = ifelse(logFC > 0, "MZ", "TRC")) #%>%
head(all_genes_species)
####Extract GO terms for each gene####
# GO_species_all <- biomartr::getGO(organism = "Maylandia zebra",
# genes = all_genes_species$gene,
# filters = "ensembl_gene_id") %>%
# dplyr::rename(gene = ensembl_gene_id,
# GO_term = goslim_goa_description,
# GO_accession = goslim_goa_accession)
#% of genes for which there is at least one GO term
# GO_benthic_all %>% distinct(gene) %>%
# summarize(percent_genes_w_GO = round(n()/length(all_genes_benthic$gene)*100, 1))
# #Combine with dataframe of all DEGs
# species_full.tbl_all <- GO_species_all %>% full_join(all_genes_species, by = "gene") %>%
# dplyr::select(gene, GO_term, GO_accession, logFC, FDR, species_env)
#--------------Separate by species_env----------------------
#Not sure if I'm going to use this ... I AM
species_MZ <- all_genes_species %>%
filter(species_env == "MZ", FDR < p)
species_TRC <- all_genes_species %>%
filter(species_env == "TRC", FDR < p)
#Count GO terms
# species_MZ_GOterm.sum <- species_MZ %>% group_by(GO_term) %>%
# summarize(MZ_species = n()) %>%
# arrange(desc(MZ_species))
#
# species_TRC_GOterm.sum <- species_TRC %>% group_by(GO_term) %>%
# summarize(TRC_species = n()) %>%
# arrange(desc(TRC_species))
#
# species_all_GOterm.sum <- species_MZ_GOterm.sum %>% full_join(species_TRC_GOterm.sum, by = "GO_term")
#
# #Write output files
# write.csv(species_all_GOterm.sum, file = "output/species_MZ.species.v.TRC.species_all_GOterm.sum.csv", row.names = FALSE)
MZ_DEG_names_species <- c(species_MZ$gene) #Convert to vector
MZ_geneList_species <- factor(as.integer(all_gene_names %in% MZ_DEG_names_species))
names(MZ_geneList_species) <- all_gene_names
TRC_DEG_names_species <- c(species_TRC$gene) #Convert to vector
TRC_geneList_species <- factor(as.integer(all_gene_names %in% TRC_DEG_names_species))
names(TRC_geneList_species) <- all_gene_names
#Create topGO object using whole list of genes
GOdata_species.MZ <- new("topGOdata",
description = "GO enrichment analysis for species environment",
ontology = "BP", #The 'ontology' argument can be 'BP' (biological process), 'MF' (molecular function), or 'CC' (cellular component)
allGenes = MZ_geneList_species, #this is the 'gene universe'
annot = annFUN.gene2GO, gene2GO = all.genes_list)
GOdata_species.TRC <- new("topGOdata",
description = "GO enrichment analysis for species environment",
ontology = "BP", #The 'ontology' argument can be 'BP' (biological process), 'MF' (molecular function), or 'CC' (cellular component)
allGenes = TRC_geneList_species, #this is the 'gene universe'
annot = annFUN.gene2GO, gene2GO = all.genes_list)
#Run the KS test using a hybrid of the elim and weighted algorithm
#This is the default in topGO and seems like a solid default
result.Fis_species.MZ <- runTest(GOdata_species.MZ, algorithm = "weight01", statistic = "fisher")
result.Fis_species.MZ
result.Fis_species.TRC <- runTest(GOdata_species.TRC, algorithm = "weight01", statistic = "fisher")
result.Fis_species.TRC
table_species.MZ <- GenTable(GOdata_species.MZ,
Fis = result.Fis_species.MZ, #Results from Fis test
topNodes = length(result.Fis_species.MZ@score), #Number of top GO terms to include in the table
numChar = 50 #Number of characters to truncate the GO term definition to
) %>% rename(P_value = Fis)
table_species.TRC <- GenTable(GOdata_species.TRC,
Fis = result.Fis_species.TRC, #Results from Fis test
topNodes = length(result.Fis_species.TRC@score), #Number of top GO terms to include in the table
numChar = 50 #Number of characters to truncate the GO term definition to
) %>% rename(P_value = Fis)
# showSigOfNodes(GOdata_species.all, score(resultFis_species.all), firstSigNodes = 5,
# useInfo ='all',#additional info to be plotted to each node
# reverse=T, #graph it upside down or not; default is T
# wantedNodes = NULL, #can specify nodes you want to be able to find and they will be colored so you can see them easily
# type=NULL,#can change to plot pie charts supposedly but no options are listed to specify this
# showEdges=T,#default is T; if F removed arrow showing relationships
# sigForAll=T) #score/p-value of all nodes in the DAG is shown, otherwise only the score for the sigNodes; default is T
(MZ.species_sigNodes <- table_species.MZ %>% filter(as.numeric(P_value) < p) %>%
summarize(n()))
(TRC.species_sigNodes <- table_species.TRC %>% filter(as.numeric(P_value) < p) %>%
summarize(n()))
printGraph(GOdata_species.MZ, result.Fis_species.MZ, firstSigNodes = 5, fn.prefix = "~/R/R_working_dir/Collaborations/Albertson/Figures/species.collapsed_MZ_BP.fisher", useInfo = "all", pdfSW = TRUE)
printGraph(GOdata_species.TRC, result.Fis_species.TRC, firstSigNodes = 6, fn.prefix = "~/R/R_working_dir/Collaborations/Albertson/Figures/species.collapsed_TRC_BP.fisher", useInfo = "all", pdfSW = TRUE)
```
```{r enrichment analysis for MZ: benthic v pelagic}
#------------Using filtered dataset ie no species x species DEGs--------
##MZ.pelagic vs MZ.benthic
qlf_MZ.benthic_v_MZ.pelagic <- glmQLFTest(fit1, contrast=comparisons1[,"MZ.benthic_v_MZ.pelagic"])
#Show number of genes differentially expressed (up and down)
summary(decideTestsDGE(qlf_MZ.benthic_v_MZ.pelagic))
############Making topGO object for MBuna data####################
##Save all genes regardless of FDR
DEGs_MZ <- as.data.frame(topTags(qlf_MZ.benthic_v_MZ.pelagic, n=17525, p.value = p)) %>% #Save all genes regardless of FDR
rownames_to_column(var = "gene") %>%
mutate(species_env = ifelse(logFC > 0, "MZ_benthic", "MZ_pelagic")) #%>%
####Extract GO terms for each gene####
# GO_MZ_all <- biomartr::getGO(organism = "Maylandia zebra",
# genes = all_genes_MZ$gene,
# filters = "ensembl_gene_id") %>%
# dplyr::rename(gene = ensembl_gene_id,
# GO_term = goslim_goa_description,
# GO_accession = goslim_goa_accession)
#
# #% of genes for which there is at least one GO term
# # GO_MZ_all %>% distinct(gene) %>%
# # summarize(percent_genes_w_GO = round(n()/length(all_genes_MZ$gene)*100, 1))
#
# # #Combine with dataframe of all DEGs
# MZ_full.tbl_all <- GO_MZ_all %>% full_join(all_genes_MZ, by = "gene") %>%
# dplyr::select(gene, GO_term, GO_accession, logFC, FDR, species_env)
#--------------Separate by species_env----------------------
#Not sure if I'm going to use this ... UPDATE: I am
MZ_benthic <- DEGs_MZ %>%
filter(species_env == "MZ_benthic")
MZ_pelagic <- DEGs_MZ %>%
filter(species_env == "MZ_pelagic")
#Count GO terms
# MZ_benthic_GOterm.sum <- MZ_benthic %>% group_by(GO_term) %>%
# summarize(MZ_benthic = n()) %>%
# arrange(desc(MZ_benthic))
#
# MZ_pelagic_GOterm.sum <- MZ_pelagic %>% group_by(GO_term) %>%
# summarize(MZ_pelagic = n()) %>%
# arrange(desc(MZ_pelagic))
#
# MZ_all_GOterm.sum <- MZ_benthic_GOterm.sum %>% full_join(MZ_pelagic_GOterm.sum, by = "GO_term")
#
# #Write output files
# write.csv(MZ_all_GOterm.sum, file = "output/MZ.benthic.v.MZ.pelagic_all_GOterm.sum.csv", row.names = FALSE)
MZ_DEG_names_benthic <- c(MZ_benthic$gene) #Convert to vector
MZ_geneList_benthic <- factor(as.integer(all_gene_names %in% MZ_DEG_names_benthic))
names(MZ_geneList_benthic) <- all_gene_names
MZ_DEG_names_pelagic <- c(MZ_pelagic$gene) #Convert to vector
MZ_geneList_pelagic <- factor(as.integer(all_gene_names %in% MZ_DEG_names_pelagic))
names(MZ_geneList_pelagic) <- all_gene_names
#Create topGO object using lists of DEGs
GOdata_MZ.benthic <- new("topGOdata",
description = "GO enrichment analysis for pelagic environment",
ontology = "BP", #The 'ontology' argument can be 'BP' (biological process), 'MF' (molecular function), or 'CC' (cellular component)
allGenes = MZ_geneList_benthic, #this is the 'gene universe'
annot = annFUN.gene2GO, gene2GO = all.genes_list)
GOdata_MZ.pelagic <- new("topGOdata",
description = "GO enrichment analysis for pelagic environment",
ontology = "BP", #The 'ontology' argument can be 'BP' (biological process), 'MF' (molecular function), or 'CC' (cellular component)
allGenes = MZ_geneList_pelagic, #this is the 'gene universe'
annot = annFUN.gene2GO, gene2GO = all.genes_list)
#Run the KS test using a hybrid of the elim and weighted algorithm
#This is the default in topGO and seems like a solid default
result.Fis_MZ.benthic <- runTest(GOdata_MZ.benthic, algorithm = "weight01", statistic = "fisher")
result.Fis_MZ.benthic
result.Fis_MZ.pelagic <- runTest(GOdata_MZ.pelagic, algorithm = "weight01", statistic = "fisher")
result.Fis_MZ.pelagic
table_MZ.benthic <- GenTable(GOdata_MZ.benthic,
Fis = result.Fis_MZ.benthic, #Results from Fis test
topNodes = length(result.Fis_MZ.benthic@score), #Number of top GO terms to include in the table
numChar = 50 #Number of characters to truncate the GO term definition to
) %>% rename(P_value = Fis)
table_MZ.pelagic <- GenTable(GOdata_MZ.pelagic,
Fis = result.Fis_MZ.pelagic, #Results from Fis test
topNodes = length(result.Fis_MZ.pelagic@score), #Number of top GO terms to include in the table
numChar = 50 #Number of characters to truncate the GO term definition to
) %>% rename(P_value = Fis)
# showSigOfNodes(GOdata_pelagic.all, score(resultFis_pelagic.all), firstSigNodes = 5,
# useInfo ='all',#additional info to be plotted to each node
# reverse=T, #graph it upside down or not; default is T
# wantedNodes = NULL, #can specify nodes you want to be able to find and they will be colored so you can see them easily
# type=NULL,#can change to plot pie charts supposedly but no options are listed to specify this
# showEdges=T,#default is T; if F removed arrow showing relationships
# sigForAll=T) #score/p-value of all nodes in the DAG is shown, otherwise only the score for the sigNodes; default is T
(benthic.MZ_sigNodes <- table_MZ.benthic %>% filter(as.numeric(P_value) < p) %>%
summarize(n()))
(pelagic.MZ_sigNodes <- table_MZ.pelagic %>% filter(as.numeric(P_value) < p) %>%
summarize(n()))
printGraph(GOdata_MZ.benthic, result.Fis_MZ.benthic, firstSigNodes = 0, fn.prefix = "~/R/R_working_dir/Collaborations/Albertson/Figures/MZ.benthic_within.species_BP.fisher", useInfo = "all", pdfSW = TRUE)
printGraph(GOdata_MZ.pelagic, result.Fis_MZ.pelagic, firstSigNodes = 2, fn.prefix = "~/R/R_working_dir/Collaborations/Albertson/Figures/MZ.pelagic_within.species_BP.fisher", useInfo = "all", pdfSW = TRUE)
```