Emily and Craig RNA-Seq

# Emily and Craig RNA-Seq ## RNA-Seq The below script was used to run a differential gene expression analysis in R. To run the below script, open this page in Edit mode (pencil in top left corner), then copy and paste *everything* below this sentence into an Rmd file in RStudio. ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install(version = "3.10") BiocManager::install(c("GenomicFeatures", "AnnotationDbi","edgeR")) BiocManager::install("baySeq") BiocManager::install("ALL", "topGO") BiocManager::install("Rgraphviz") rm(list=ls()) library(edgeR) #library(baySeq) library(statmod) library(RColorBrewer) library(pheatmap) library(tidyverse) #library(biomaRt) library(biomartr) library(topGO) library(ALL) ``` ## R Markdown ```{r Create design matrix and fit model, echo = FALSE} setwd("Albertson") x <- read.csv("rnaRawCounts_full.csv", header=TRUE,sep=",", row.names = 1) head(x) cpm.reads <- cpm(x) #write.csv(cpm.reads, file="All_samples_cpm.csv", row.names=TRUE) #-----------------Create data frame with info from target (samples) ---------------------- sample <- colnames(x) targets <- as.data.frame(sample) %>% mutate( species = c(rep("MZ", times=11), rep("TRC", times= 11)), treatment = c(rep("benthic", times = 5), rep("pelagic", times = 6), rep("benthic", times = 5), rep("pelagic", times = 6))) %>% mutate(across(!sample, as.factor)) #design <- targets %>% select(species, treatment) %>% # mutate(species.MZ = ifelse(species == "MZ", 1, 0), # species.TRC = ifelse(species=="TRC", 1, 0), # treatment.Pelagic = ifelse(treatment == "pelagic", 1, 0), # TRC.pelagic = ifelse(species == "MZ" & "treatment")) group <- factor(paste(targets$species, targets$treatment, sep=".")) targets$group <- group species <- targets$species treatment <- targets$treatment #-------------Model 1: Treat all groups separately: normalize counts table, add groups, and fit model ----------------------- # design matrix for pairwise comparisons design <- model.matrix(~0 + group) # design matrix for cross-species comparisons design2 <- model.matrix(~0 + species) #design matrix for cross-treatment comparisons design3 <- model.matrix(~0 + treatment) y <- DGEList(counts=x, group=group) #Make MDS plot to ensure samples group together points <- c(rep(19, times=5), rep(18, times = 6), rep(15, times=5), rep(16, times=6)) colors <- c(rep("dark blue", times=5), rep("blue", times = 6), rep("dark green", times=5), rep("green", times=6)) plotMDS(y, col=colors, pch=points) legend("top", legend=levels(group), pch=points[c(1,6,12,18)], col=colors[c(1,6,12,18)], ncol=1) # Calculate normalization factors # Estimate dispersion - either generally with estimateDisp() or specifically with estimateGLMTagwiseDisp() # glmQLFit() will fit the negative binomial model # glmQLFTest() will test for differential expression # glmTreat() allows us to test thresholded hypotheses under the glm framework. # See page 31 of edgeR manual # May want to fit an additive model to test for main effects, and a separate interaction model to test for interaction effects y.full <- y keep <- filterByExpr(y, design = design) #Filter out lowly expressed genes that will mess with comparisons y <- y[keep,,keep.lib.sizes=FALSE] y <- calcNormFactors(y) y1 <- estimateDisp(y, design, robust=TRUE) #Estimate dispersion for pairwise comparisons y2 <- estimateDisp(y, design2, robust = TRUE) #Estimate dispersion for cross-species comparisons y3 <- estimateDisp(y, design3, robust = TRUE) #Plot biological coefficient of variation #plotBCV(y) fit1 <- glmQLFit(y1, design, robust=TRUE) #Fit the model for treatments fit2 <- glmQLFit(y2, design2, robust=TRUE) #Fit the model for species fit3 <- glmQLFit(y3, design3, robust=TRUE) #Fit the model for treatment #plotQLDisp(fit2) #Make design matrix for comparisons comparisons1 <- makeContrasts(MZ.benthic_v_TRC.benthic = groupMZ.benthic - groupTRC.benthic, MZ.pelagic_v_TRC.pelagic = groupMZ.pelagic - groupTRC.pelagic, MZ.benthic_v_MZ.pelagic = groupMZ.benthic - groupMZ.pelagic, TRC.benthic_v_TRC.pelagic = groupTRC.benthic - groupTRC.pelagic, MZ.pelagic_v_TRC.benthic = groupMZ.pelagic - groupTRC.benthic, #DEGs in preferred environment MZ.benthic_v_TRC.pelagic = groupMZ.benthic - groupTRC.pelagic, #DEGs in non-preferred environment species_env = (groupMZ.benthic - groupMZ.pelagic) - (groupTRC.benthic - groupTRC.pelagic), #IDs genes that respond differently based on the species when the environment is accounted for. env_species = (groupMZ.benthic - groupTRC.benthic) - (groupMZ.pelagic - groupTRC.pelagic), #IDs genes that respo1nd differently based on the environment when species is accounted for. #No DEGs pref_v_nonpref = (groupMZ.pelagic+groupTRC.benthic) - (groupMZ.benthic+groupTRC.pelagic), levels=design) #No DEGs comparisons2 <- makeContrasts(MZ_v_TRC = speciesMZ - speciesTRC, levels = design2) comparisons3 <- makeContrasts(benthic_v_pelagic = treatmentbenthic - treatmentpelagic, levels = design3) #Combine all comparisons into a single F statistic and p-value and extract top tags comps <- glmQLFTest(fit1, contrast=comparisons1) topTags(comps, n = 17525) ##Make heatmap of all comparisons## logcpm <- cpm(y, log=TRUE) #calculate counts per million rownames(logcpm) <- rownames(y$counts) #save gene names as rownames colnames(logcpm) <- paste(y$samples$group, sep="_") #name samples by group o <- order(comps$table$PValue) #order differentially expressed genes by p value and save row index value logcpm <- logcpm[o[1:300],] #extract counts per million of the highest DEGs specified above #Create heat map pheatmap(logcpm, cluster_col=FALSE, scale="row", show_rownames = FALSE) ``` ```{r Pairwise comparisons for model 1, echo=FALSE} #------------------------Perform pairwise comparisons------------------------ ##Comparison 1: MZ vs TRC: benthic #Perform differential expression analysis qlf_MZ.benthic_v_TRC.benthic <- glmQLFTest(fit1, contrast=comparisons1[,"MZ.benthic_v_TRC.benthic"]) #Make MD plot of comparisons #plotMD(qlf_MZ.benthic_v_TRC.benthic, main = "MZ.benthic vs TRC.benthic") #Show number of genes differentially expressed (up and down) summary(decideTestsDGE(qlf_MZ.benthic_v_TRC.benthic)) #Extract all differentially expressed genes at FDR < 0.05 differentially expressed genes (DEGs) - set n high so the limit is the p_value cutoff top_DEGs1.1 <- topTags(qlf_MZ.benthic_v_TRC.benthic, n=17525) #Save top DEGs in a csv file #write.csv(top_DEGs1.1, file="MZ.benthic_v_TRC.benthic.csv", row.names=TRUE) ##Comparison 2: MZ vs TRC: pelagic #Perform differential expression analysis qlf_MZ.pelagic_v_TRC.pelagic <- glmQLFTest(fit1, contrast=comparisons1[,"MZ.pelagic_v_TRC.pelagic"]) #Make MD plot of comparisons #plotMD(qlf_MZ.pelagic_v_TRC.pelagic, main = "MZ.pelagic vs TRC.pelagic") #Show number of genes differentially expressed (up and down) summary(decideTestsDGE(qlf_MZ.pelagic_v_TRC.pelagic)) #Extract all differentially expressed genes at FDR < 0.05 differentially expressed genes (DEGs) - set n high so the limit is the p_value cutoff top_DGEs1.2 <- topTags(qlf_MZ.pelagic_v_TRC.pelagic, n=17525) #Save top DEGs in a csv file #write.csv(top_DGEs1.2, file="MZ.pelagic_v_TRC.pelagic.csv", row.names=TRUE) ##Comparison 3: MZ: benthic vs pelagic #Perform differential expression analysis qlf_MZ.benthic_v_MZ.pelagic <- glmQLFTest(fit1, contrast=comparisons1[,"MZ.benthic_v_MZ.pelagic"]) #Make MD plot of comparisons #plotMD(qlf_MZ.benthic_v_MZ.pelagic, main = "MZ.benthic vs MZ.pelagic") #Show number of genes differentially expressed (up and down) summary(decideTestsDGE(qlf_MZ.benthic_v_MZ.pelagic)) #Extract all differentially expressed genes at FDR < 0.05 differentially expressed genes (DEGs) - set n high so the limit is the p_value cutoff top_DGEs1.3 <- topTags(qlf_MZ.benthic_v_MZ.pelagic, n=17525) #Save top DEGs in a csv file #write.csv(top_DGEs1.3, file="MZ.benthic_v_MZ.pelagic.csv", row.names=TRUE) ##Comparison 4: TRC: benthic vs pelagic #Perform differential expression analysis qlf_TRC.benthic_v_TRC.pelagic <- glmQLFTest(fit1, contrast=comparisons1[,"TRC.benthic_v_TRC.pelagic"]) #Make MD plot of comparisons #plotMD(qlf_TRC.benthic_v_TRC.pelagic, main = "TRC.benthic vs TRC.pelagic") #Show number of genes differentially expressed (up and down) summary(decideTestsDGE(qlf_TRC.benthic_v_TRC.pelagic)) #Extract all differentially expressed genes at FDR < 0.05 differentially expressed genes (DEGs) - set n high so the limit is the p_value cutoff top_DGEs1.4 <- topTags(qlf_TRC.benthic_v_TRC.pelagic, n=17525) #Save top DEGs in a csv file #write.csv(top_DGEs1.4, file="TRC.benthic_v_TRC.pelagic.csv", row.names=TRUE) ##Comparison 5: DEGs between species in preferred environment ##MZ.pelagic vs TRC benthic #Perform differential expression analysis qlf_MZ.pelagic_v_TRC.benthic <- glmQLFTest(fit1, contrast=comparisons1[,"MZ.pelagic_v_TRC.benthic"]) #Make MD plot of comparisons #plotMD(qlf_MZ.pelagic_v_TRC.benthic, main = "MZ pelagic vs TRC benthic") #Show number of genes differentially expressed (up and down) summary(decideTestsDGE(qlf_MZ.pelagic_v_TRC.benthic)) #Extract all differentially expressed genes at FDR < 0.05 differentially expressed genes (DEGs) - set n high so the limit is the p_value cutoff top_DGEs1.5 <- topTags(qlf_MZ.pelagic_v_TRC.benthic, n=17525) #Save top DEGs in a csv file #write.csv(top_DGEs1.5, file="MZ.pelagic_v_TRC.benthic.csv", row.names=TRUE) ##Comparison 6: DEGs between species in non-preferred environment ##MZ benthic vs TRC pelagic #Perform differential expression analysis qlf_MZ.benthic_v_TRC.pelagic <- glmQLFTest(fit1, contrast=comparisons1[,"MZ.benthic_v_TRC.pelagic"]) #Make MD plot of comparisons #plotMD(qlf_MZ.benthic_v_TRC.pelagic, main = "MZ benthic vs TRC pelagic") #Show number of genes differentially expressed (up and down) summary(decideTestsDGE(qlf_MZ.benthic_v_TRC.pelagic)) #Extract all differentially expressed genes at FDR < 0.05 differentially expressed genes (DEGs) - set n high so the limit is the p_value cutoff top_DGEs1.6 <- topTags(qlf_MZ.benthic_v_TRC.pelagic, n=17525) #Save top DEGs in a csv file #write.csv(top_DGEs1.6, file="MZ.benthic_v_TRC.pelagic.csv", row.names=TRUE) ##Comparison 7: DEGs in preferred environment regardless of species ##MZ.pelagic+TRC.benthic vs MZ.benthic+TRC.pelagic #No DEGs #Perform differential expression analysis qlf_pref_v_nonpref <- glmQLFTest(fit1, contrast=comparisons1[,"pref_v_nonpref"]) #Make MD plot of comparisons #plotMD(qlf_MZ.pelagic_v_TRC.benthic, main = "MZ pelagic vs TRC benthic") #Show number of genes differentially expressed (up and down) summary(decideTestsDGE(qlf_pref_v_nonpref)) #Extract all differentially expressed genes at FDR < 0.05 differentially expressed genes (DEGs) - set n high so the limit is the p_value cutoff top_DGEs1.7 <- topTags(qlf_pref_v_nonpref, n=17525) #Save top DEGs in a csv file #write.csv(top_DGEs1.5, file="MZ.pelagic_v_TRC.benthic.csv", row.names=TRUE) # ##Comparison 7: DEGs that respond differently to the environment based on the species. # #Of the genes that are differentially expressed between environments in a given species, which are different between species # ## (MZ benthic - MZ pelagic) - (TRC benthic - TRC pelagic) # #Perform differential expression analysis # qlf_species <- glmQLFTest(fit, contrast=comparisons[,"MZ_v_TRC.benthic"]) # #Make MD plot of comparisons # plotMD(qlf_species, main = "MZ vs TRC") # #Show number of genes differentially expressed (up and down) # summary(decideTestsDGE(qlf_species)) # #Extract all differentially expressed genes at FDR < 0.05 differentially expressed genes (DEGs) - set n high so the limit is the p_value cutoff # top_DGEs1.7 <- topTags(qlf_species) # # #Save top DEGs in a csv file # #write.csv(top_DGEs4, file="Top_tags_ALG6_vs_ALG6.UGGT1.csv", row.names=TRUE) # # # ##Comparison 8: DEGs that differ between the species when we account for environment # #Of the genes that are differentially expressed between species in a given environment, which are different between environments? # ## (MZ benthic - TRC benthic) - (MZ pelagic - TRC pelagic) # #Perform differential expression analysis # qlf_environment <- glmQLFTest(fit, contrast=comparisons[,"benthic_v_pelagic.MZ"]) # #Make MD plot of comparisons # plotMD(qlf_environment, main = "Benthic vs pelagic") # #Show number of genes differentially expressed (up and down) # summary(decideTestsDGE(qlf_environment)) # #Extract all differentially expressed genes at FDR < 0.05 differentially expressed genes (DEGs) - set n high so the limit is the p_value cutoff # top_DGEs1.8 <- topTags(qlf_environment) # # #Save top DEGs in a csv file #write.csv(top_DGEs4, file="Top_tags_ALG6_vs_ALG6.UGGT1.csv", row.names=TRUE) ``` ```{r Pairwise comparisons for collapsed models, echo = FALSE} #------------------------Perform pairwise comparisons------------------------ ##Comparison 1: #Perform differential expression analysis qlf_speciesMZ_v_TRC <- glmQLFTest(fit2,contrast=comparisons2[,"MZ_v_TRC"] ) #Make MD plot of comparisons #plotMD(qlf_speciesMZ_v_TRC, main = "MZ vs TRC") #Show number of genes differentially expressed (up and down) summary(decideTestsDGE(qlf_speciesMZ_v_TRC)) #17525 genes total #Extract all differentially expressed genes at FDR < 0.05 differentially expressed genes (DEGs) - set n high so the limit is the p_value cutoff top_DGEs2.1 <- topTags(qlf_speciesMZ_v_TRC, n = 17525) #Save top DEGs in a csv file #write.csv(top_DGEs2.1, file="MZ_v_TRC_all.csv", row.names=TRUE) ##Comparison 2: #Perform differential expression analysis qlf_environment_benthic_v_pelagic <- glmQLFTest(fit3,contrast=comparisons3[,"benthic_v_pelagic"] ) #Make MD plot of comparisons #plotMD(qlf_environment_benthic_v_pelagic, main = "MZ vs TRC") #Show number of genes differentially expressed (up and down) summary(decideTestsDGE(qlf_environment_benthic_v_pelagic)) #Extract all differentially expressed genes at FDR < 0.05 differentially expressed genes (DEGs) - set n high so the limit is the p_value cutoff top_DGEs2.2 <- topTags(qlf_environment_benthic_v_pelagic, n=17525) #Save top DEGs in a csv file #write.csv(top_DGEs1, file="Top_tags_ALG6_vs_ALG6.UGGT1.csv", row.names=TRUE) ``` # Enrichment analysis ## GO term overview See the [GO overview](http://geneontology.org/docs/ontology-documentation/), including description of the three (separate) categories. **Molecular function**: refers to the molecular-level activities performed by gene products (*e.g. catalytic activity*) **Cellular component**: The locations in which a gene product performs a function, either cellular compartments (*e.g. mitochondrion*) or macromolecular complexes (*e.g. ribosome*). Broadly, refers to cellular anatomy, rather than a process. **Biological process**: The larger biological processes accomplished by multiple molecular activities (*e.g. DNA repair (broad) or glucose transmembrane transport (specific)*). In other words, the specific molecular function will be under the umbrella of this. ```{r enrichment analysis setup} #Set up for all scripts below p <- 0.05 topDiffGenes <- function(allScore){ return(allScore < p) } ####------------Species v species DGE for filtering-----------#### ##Comparison 2.1: Species: MZ vs TRC (environment collapsed) qlf_speciesMZ_v_TRC <- glmQLFTest(fit2,contrast=comparisons2[,"MZ_v_TRC"]) #Show number of genes differentially expressed (up and down) summary(decideTestsDGE(qlf_speciesMZ_v_TRC)) DEGs_species <- as.data.frame(topTags(qlf_speciesMZ_v_TRC, n = 17525, p.value = p)) DEGs_species2 <- DEGs_species %>% rownames_to_column(var = "gene") #Create vector for all genes all_gene_names <- row.names(y$counts) GO_all <- biomartr::getGO(organism = "Maylandia zebra", genes = all_gene_names, filters = "ensembl_gene_id") %>% dplyr::rename(gene = ensembl_gene_id, GO_term = goslim_goa_description, GO_accession = goslim_goa_accession) #Remove genes with no GO terms - zero of those GO_all_noNA <- GO_all %>% filter(is.na(GO_term) == FALSE) #allGenes_vec <- all_genes_preferred$FDR #names(allGenes_vec_pref) <- all_genes_preferred$gene #Count number of times each gene occurs ie number of associated GO terms num.genes_all <- GO_all_noNA %>% group_by(gene) %>% dplyr::summarize(num = n()) #Make list to populate with GO accession numbers and populate first element all.genes_list <- list() all.genes_list[[1]] <- c(GO_all_noNA$GO_accession[1:num.genes_all$num[1]]) #Loop over remainder of elements and store the GO terms in the list ##MZ for(i in 2:nrow(num.genes_all)){ start <- sum(num.genes_all$num[1:i-1])+1 #Starting point for GO terms end <- start + num.genes_all$num[i] - 1 #Ending point for go terms all.genes_list[[i]] <- c(GO_all_noNA$GO_accession[start:end]) #Extract GO terms from index specified above } #Add gene names to list names(all.genes_list) <- num.genes_all$gene ####---------Filter genes that are DE between species-----------#### #Filter out genes from full gene list that are DE between species all_gene_names.filt <- all_gene_names[!all_gene_names %in% DEGs_species2$gene] GO_all.filt <- GO_all_noNA %>% filter(gene %in% all_gene_names.filt) num.genes_all.filt <- GO_all.filt %>% group_by(gene) %>% dplyr::summarize(num = n()) #Make list to populate with GO accession numbers and populate first element all.genes_list.filt <- list() all.genes_list.filt[[1]] <- c(GO_all.filt$GO_accession[1:num.genes_all.filt$num[1]]) #Loop over remainder of elements and store the GO terms in the list for(i in 2:nrow(num.genes_all.filt)){ start <- sum(num.genes_all.filt$num[1:i-1])+1 #Starting point for GO terms end <- start + num.genes_all.filt$num[i] - 1 #Ending point for go terms all.genes_list.filt[[i]] <- c(GO_all.filt$GO_accession[start:end]) #Extract GO terms from index specified above } #Add gene names to list names(all.genes_list.filt) <- num.genes_all.filt$gene ``` ## Enrichment analysis with topGO Here's a useful website: http://avrilomics.blogspot.com/2015/07/using-topgo-to-test-for-go-term.html ```{r enrichment analysis for preferred environment} ##Comparison 5: DEGs in preferred environment ##MZ.pelagic vs TRC benthic qlf_MZ.pelagic_v_TRC.benthic <- glmQLFTest(fit1, contrast=comparisons1[,"MZ.pelagic_v_TRC.benthic"]) #Show number of genes differentially expressed (up and down) summary(decideTestsDGE(qlf_MZ.pelagic_v_TRC.benthic)) #Save all DEGs DEGs.in.preferred.all <- as.data.frame(topTags(qlf_MZ.pelagic_v_TRC.benthic, n=17525, p.value = p)) %>% rownames_to_column(var = "gene") %>% mutate(species_env = ifelse(logFC > 0, "MZ_pelagic", "TRC_benthic")) #Filter out genes that are DE between species DEGs.in.preferred.all_filt <- DEGs.in.preferred.all %>% anti_join(DEGs_species2, by = "gene") %>% dplyr::select(gene, logFC, FDR, species_env) ####Get GO terms#### # #Change here to filtered dataset if wanting to ignore genes that are DE between species # GO_preferred <- biomartr::getGO(organism = "Maylandia zebra", # genes = DEGs.in.preferred.all_filt$gene, # filters = "ensembl_gene_id") # # #Rename # GO_preferred <- GO_preferred %>% dplyr::rename(gene = ensembl_gene_id, GO_term = goslim_goa_description, GO_accession = goslim_goa_accession) # # #Combine with dataframe of all (filtered) DEGs # preferred_full.tbl <- GO_preferred %>% full_join(DEGs.in.preferred.all_filt, by = "gene") #%>% # #dplyr::select(gene, GO_term, GO_accession, logFC, FDR, species_env) # # #% of genes for which there is at least one GO term # GO_preferred %>% distinct(gene) %>% # summarize(percent_genes_w_GO = round(n()/length(DEGs.in.preferred.all_filt$gene)*100, 1)) ####--------------Separate by species_env----------------------#### #Separate out DEGs from analysis by species/environment #Add "_all" to preferred_full.tbl to use all genes instead of filtered dataset preferred_MZ <- DEGs.in.preferred.all_filt %>% filter(species_env == "MZ_pelagic", FDR < p) preferred_TRC <- DEGs.in.preferred.all_filt %>% filter(species_env == "TRC_benthic", FDR < p) #Make list of gene names for topGO object MZ_DEG_names <- c(preferred_MZ$gene) #Convert to vector MZ_geneList <- factor(as.integer(all_gene_names.filt %in% MZ_DEG_names)) names(MZ_geneList) <- all_gene_names.filt TRC_DEG_names <- c(preferred_TRC$gene) #Convert to vector TRC_geneList <- factor(as.integer(all_gene_names.filt %in% TRC_DEG_names)) names(TRC_geneList) <- all_gene_names.filt #Create topGO object using whole list of genes #Here, I've already filtered for DEGs, so I'm passing a binary vector to the function that marks genes as interesting (ie DE) or not, and then maps them to the GO terms. GOdata_pref.MZ <- new("topGOdata", description = "GO enrichment analysis for preferred environment", ontology = "BP", #The 'ontology' argument can be 'BP' (biological process), 'MF' (molecular function), or 'CC' (cellular component) allGenes = MZ_geneList, #this is the 'gene universe' annot = annFUN.gene2GO, gene2GO = all.genes_list.filt) GOdata_pref.TRC <- new("topGOdata", description = "GO enrichment analysis for preferred environment", ontology = "BP", #The 'ontology' argument can be 'BP' (biological process), 'MF' (molecular function), or 'CC' (cellular component) allGenes = TRC_geneList, #this is the 'gene universe' annot = annFUN.gene2GO, gene2GO = all.genes_list.filt) #Run the KS test using a hybrid of the elim and weighted algorithm #This is the default in topGO and seems like a solid default result.Fis_pref.MZ <- runTest(GOdata_pref.MZ, algorithm = "weight01", statistic = "fisher") result.Fis_pref.MZ result.Fis_pref.TRC <- runTest(GOdata_pref.TRC, algorithm = "weight01", statistic = "fisher") result.Fis_pref.TRC table_pref.MZ <- GenTable(GOdata_pref.MZ, Fis = result.Fis_pref.MZ, #Results from KS test topNodes = length(result.Fis_pref.MZ@score), #Number of top GO terms to include in the table numChar = 50 #Number of characters to truncate the GO term definition to ) %>% rename(P_value = Fis) table_pref.TRC <- GenTable(GOdata_pref.TRC, Fis = result.Fis_pref.TRC, #Results from KS test topNodes = length(result.Fis_pref.TRC@score), #Number of top GO terms to include in the table numChar = 50 #Number of characters to truncate the GO term definition to ) %>% rename(P_value = Fis) # showSigOfNodes(GOdata_pref.all, score(resultKS_pref.all), firstSigNodes = 5, # useInfo ='all',#additional info to be plotted to each node # reverse=T, #graph it upside down or not; default is T # wantedNodes = NULL, #can specify nodes you want to be able to find and they will be colored so you can see them easily # type=NULL,#can change to plot pie charts supposedly but no options are listed to specify this # showEdges=T,#default is T; if F removed arrow showing relationships # sigForAll=T) #score/p-value of all nodes in the DAG is shown, otherwise only the score for the sigNodes; default is T #Calculate number of significant GO terms for viz (MZ.pref_sigNodes <- table_pref.MZ %>% filter(as.numeric(P_value) < p) %>% summarize(n())) (TRC.pref_sigNodes <- table_pref.TRC %>% filter(as.numeric(P_value) < p) %>% summarize(n())) printGraph(GOdata_pref.MZ, result.Fis_pref.MZ, firstSigNodes = 2, fn.prefix = "~/R/R_working_dir/Collaborations/Albertson/Figures/Preferred.env_MZ.filt_BP.fisher", useInfo = "all", pdfSW = TRUE) printGraph(GOdata_pref.TRC, result.Fis_pref.TRC, firstSigNodes = 3, fn.prefix = "~/R/R_working_dir/Collaborations/Albertson/Figures/Preferred.env_TRC.filt_BP.fisher", useInfo = "all", pdfSW = TRUE) ``` ```{r enrichment analysis for non-preferred environment} ##Comparison 5: DEGs in non-preferred environment ##MZ.pelagic vs TRC benthic qlf_MZ.benthic_v_TRC.pelagic <- glmQLFTest(fit1, contrast=comparisons1[,"MZ.benthic_v_TRC.pelagic"]) #Show number of genes differentially expressed (up and down) summary(decideTestsDGE(qlf_MZ.benthic_v_TRC.pelagic)) #Save DEGs DEGs_nonpreferred <- as.data.frame(topTags(qlf_MZ.benthic_v_TRC.pelagic, n=17525, p.value = p)) %>% rownames_to_column(var = "gene") %>% mutate(species_env = ifelse(logFC > 0, "MZ_benthic", "TRC_pelagic")) #Filter out genes that are DE between species DEGs.in.nonpreferred.all_filt <- DEGs_nonpreferred %>% anti_join(DEGs_species2, by = "gene") %>% mutate(species_env = ifelse(logFC > 0, "MZ_benthic", "TRC_pelagic")) %>% dplyr::select(gene, logFC, FDR, species_env) ####Get GO terms#### # GO_nonpreferred <- biomartr::getGO(organism = "Maylandia zebra", # genes = DEGs.in.nonpreferred.all_filt$gene, # filters = "ensembl_gene_id") # # GO_nonpreferred <- GO_nonpreferred %>% dplyr::rename(gene = ensembl_gene_id, GO_term = goslim_goa_description, GO_accession = goslim_goa_accession) # # #Combine with dataframe of all (filtered) DEGs # nonpreferred_full.tbl <- GO_nonpreferred %>% full_join(DEGs_nonpreferred, by = "gene") %>% # dplyr::select(gene, GO_term, GO_accession, logFC, FDR, species_env) # # #% of genes for which there is at least one GO term # GO_nonpreferred %>% distinct(gene) %>% # summarize(percent_genes_w_GO = round(n()/length(DEGs.in.nonpreferred.all_filt$gene)*100, 1)) ####--------------Separate by species_env----------------------#### #Not sure if I'm going to use this ... update: I am #Add "_all" to nonpreferred_full.tbl to use all genes rather than filtered dataset nonpreferred_MZ <- DEGs.in.nonpreferred.all_filt %>% filter(species_env == "MZ_benthic", FDR < p) nonpreferred_TRC <- DEGs.in.nonpreferred.all_filt %>% filter(species_env == "TRC_pelagic", FDR < p) ####Count GO terms#### # nonpreferred_MZ_GOterm.sum <- nonpreferred_MZ %>% group_by(GO_term) %>% # summarize(MZ_benthic = n()) %>% # arrange(desc(MZ_benthic)) # # nonpreferred_TRC_GOterm.sum <- nonpreferred_TRC %>% group_by(GO_term) %>% # summarize(TRC_pelagic = n()) %>% # arrange(desc(TRC_pelagic)) # # nonpreferred_all_GOterm.sum <- nonpreferred_MZ_GOterm.sum %>% full_join(nonpreferred_TRC_GOterm.sum, by = "GO_term") # # #Write output files # write.csv(nonpreferred_all_GOterm.sum, file = "output/nonpreferred_MZ.benthic.v.TRC.pelagic_all_GOterm.sum.csv", row.names = FALSE) #Remove genes with no GO terms # nonpref_MZ_GO <- nonpreferred_MZ %>% filter(is.na(GO_term) == FALSE) %>% # group_by(gene) %>% # select(-c(GO_term, logFC, FDR, species_env)) # # nonpref_TRC_GO <- nonpreferred_TRC %>% filter(is.na(GO_term) == FALSE) %>% # group_by(gene) %>% # select(-c(GO_term, logFC, FDR, species_env)) ####make topGO object#### #Make vector of interesting genes for topGO object MZ_DEG_names_nonpref <- c(nonpreferred_MZ$gene) #Convert to vector MZ_geneList_nonpref <- factor(as.integer(all_gene_names.filt %in% MZ_DEG_names_nonpref)) names(MZ_geneList_nonpref) <- all_gene_names.filt TRC_DEG_names_nonpref <- c(nonpreferred_TRC$gene) #Convert to vector TRC_geneList_nonpref <- factor(as.integer(all_gene_names.filt %in% TRC_DEG_names_nonpref)) names(TRC_geneList_nonpref) <- all_gene_names.filt #Create topGO object using whole list of genes GOdata_nonpref.MZ <- new("topGOdata", description = "GO enrichment analysis for nonnonpreferred environment", ontology = "BP", #The 'ontology' argument can be 'BP' (biological process), 'MF' (molecular function), or 'CC' (cellular component) allGenes = MZ_geneList_nonpref, #this is the 'gene universe' annot = annFUN.gene2GO, gene2GO = all.genes_list.filt) GOdata_nonpref.TRC <- new("topGOdata", description = "GO enrichment analysis for nonnonpreferred environment", ontology = "BP", #The 'ontology' argument can be 'BP' (biological process), 'MF' (molecular function), or 'CC' (cellular component) allGenes = TRC_geneList_nonpref, #this is the 'gene universe' annot = annFUN.gene2GO, gene2GO = all.genes_list.filt) #Run the KS test using a hybrid of the elim and weighted algorithm #This is the default in topGO and seems like a solid default result.Fis_nonpref.MZ <- runTest(GOdata_nonpref.MZ, algorithm = "weight01", statistic = "fisher") result.Fis_nonpref.MZ result.Fis_nonpref.TRC <- runTest(GOdata_nonpref.TRC, algorithm = "weight01", statistic = "fisher") result.Fis_nonpref.TRC table_nonpref.MZ <- GenTable(GOdata_nonpref.MZ, Fis = result.Fis_nonpref.MZ, #Results from Fis test topNodes = length(result.Fis_nonpref.MZ@score), #Number of top GO terms to include in the table numChar = 50 #Number of characters to truncate the GO term definition to ) %>% rename(P_value = Fis) table_nonpref.TRC <- GenTable(GOdata_nonpref.TRC, Fis = result.Fis_nonpref.TRC, #Results from Fis test topNodes = length(result.Fis_nonpref.TRC@score), #Number of top GO terms to include in the table numChar = 50 #Number of characters to truncate the GO term definition to ) %>% rename(P_value = Fis) # showSigOfNodes(GOdata_nonpref.all, score(resultFis_nonpref.all), firstSigNodes = 5, # useInfo ='all',#additional info to be plotted to each node # reverse=T, #graph it upside down or not; default is T # wantedNodes = NULL, #can specify nodes you want to be able to find and they will be colored so you can see them easily # type=NULL,#can change to plot pie charts supposedly but no options are listed to specify this # showEdges=T,#default is T; if F removed arrow showing relationships # sigForAll=T) #score/p-value of all nodes in the DAG is shown, otherwise only the score for the sigNodes; default is T #Calculate number of significant GO terms for viz (MZ.nonpref_sigNodes <- table_nonpref.MZ %>% filter(as.numeric(P_value) < p) %>% summarize(n())) (TRC.nonpref_sigNodes <- table_nonpref.TRC %>% filter(as.numeric(P_value) < p) %>% summarize(n())) printGraph(GOdata_nonpref.MZ, result.Fis_nonpref.MZ, firstSigNodes = 0, fn.prefix = "~/R/R_working_dir/Collaborations/Albertson/Figures/nonpreferred.env_MZ.filt_BP.fisher", useInfo = "all", pdfSW = TRUE) printGraph(GOdata_nonpref.TRC, result.Fis_nonpref.TRC, firstSigNodes = 1, fn.prefix = "~/R/R_working_dir/Collaborations/Albertson/Figures/nonpreferred.env_TRC.filt_BP.fisher", useInfo = "all", pdfSW = TRUE) ``` ```{r enrichment analysis for pelagic environment} #------------Using filtered dataset ie no species x species DEGs-------- ##Comparison 5: DEGs in preferred environment ##MZ.pelagic vs TRC benthic qlf_MZ.pelagic_v_TRC.pelagic <- glmQLFTest(fit1, contrast=comparisons1[,"MZ.pelagic_v_TRC.pelagic"]) #Show number of genes differentially expressed (up and down) summary(decideTestsDGE(qlf_MZ.pelagic_v_TRC.pelagic)) #Make filtered dataset for GO term analysis #Save DEGs DEGs_pelagic <- as.data.frame(topTags(qlf_MZ.pelagic_v_TRC.pelagic, n=17525, p.value = p)) %>% rownames_to_column(var = "gene") %>% mutate(species_env = ifelse(logFC > 0, "MZ_pelagic", "TRC_pelagic")) #Filter out genes that are DE between species DEGs.in.pelagic.all_filt <- DEGs_pelagic %>% anti_join(DEGs_species2, by = "gene") %>% mutate(species_env = ifelse(logFC > 0, "MZ_pelagic", "TRC_pelagic")) %>% dplyr::select(gene, logFC, FDR, species_env) ####Get GO terms#### # GO_pelagic <- biomartr::getGO(organism = "Maylandia zebra", # genes = DEGs.in.pelagic.all_filt$gene, # filters = "ensembl_gene_id") # # GO_pelagic <- GO_pelagic %>% dplyr::rename(gene = ensembl_gene_id, GO_term = goslim_goa_description, GO_accession = goslim_goa_accession) # # #Combine with dataframe of all (filtered) DEGs # pelagic_full.tbl <- GO_pelagic %>% full_join(DEGs_pelagic, by = "gene") %>% # dplyr::select(gene, GO_term, GO_accession, logFC, FDR, species_env) # # #% of genes for which there is at least one GO term # GO_pelagic %>% distinct(gene) %>% # summarize(percent_genes_w_GO = round(n()/length(DEGs.in.pelagic.all_filt$gene)*100, 1)) #--------------Separate by species_env---------------------- #Not sure if I'm going to use this ... UPDATE: I am #Add "_all" to end of pelagic_full.tbl to use full dataset instead of filtered pelagic_MZ <- DEGs.in.pelagic.all_filt %>% filter(species_env == "MZ_pelagic", FDR < p) pelagic_TRC <- DEGs.in.pelagic.all_filt %>% filter(species_env == "TRC_pelagic", FDR < p) #Count GO terms # pelagic_MZ_GOterm.sum <- pelagic_MZ %>% group_by(GO_term) %>% # summarize(MZ_pelagic = n()) %>% # arrange(desc(MZ_pelagic)) # # pelagic_TRC_GOterm.sum <- pelagic_TRC %>% group_by(GO_term) %>% # summarize(TRC_benthic = n()) %>% # arrange(desc(TRC_benthic)) # # pelagic_all_GOterm.sum <- pelagic_MZ_GOterm.sum %>% full_join(pelagic_TRC_GOterm.sum, by = "GO_term") # # #Write output files # write.csv(pelagic_all_GOterm.sum, file = "output/pelagic_MZ.pelagic.v.TRC.benthic_all_GOterm.sum.csv", row.names = FALSE) #Make lists of interesting genes MZ_DEG_names_pelagic <- c(pelagic_MZ$gene) #Convert to vector MZ_geneList_pelagic <- factor(as.integer(all_gene_names.filt %in% MZ_DEG_names_pelagic)) names(MZ_geneList_pelagic) <- all_gene_names.filt TRC_DEG_names_pelagic <- c(pelagic_TRC$gene) #Convert to vector TRC_geneList_pelagic <- factor(as.integer(all_gene_names.filt %in% TRC_DEG_names_pelagic)) names(TRC_geneList_pelagic) <- all_gene_names.filt #Create topGO object using lists of DEGs GOdata_pelagic.MZ <- new("topGOdata", description = "GO enrichment analysis for pelagic environment", ontology = "BP", #The 'ontology' argument can be 'BP' (biological process), 'MF' (molecular function), or 'CC' (cellular component) allGenes = MZ_geneList_pelagic, #this is the 'gene universe' annot = annFUN.gene2GO, gene2GO = all.genes_list.filt) GOdata_pelagic.TRC <- new("topGOdata", description = "GO enrichment analysis for pelagic environment", ontology = "BP", #The 'ontology' argument can be 'BP' (biological process), 'MF' (molecular function), or 'CC' (cellular component) allGenes = TRC_geneList_pelagic, #this is the 'gene universe' annot = annFUN.gene2GO, gene2GO = all.genes_list.filt) #Run the KS test using a hybrid of the elim and weighted algorithm #This is the default in topGO and seems like a solid default result.Fis_pelagic.MZ <- runTest(GOdata_pelagic.MZ, algorithm = "weight01", statistic = "fisher") result.Fis_pelagic.MZ result.Fis_pelagic.TRC <- runTest(GOdata_pelagic.TRC, algorithm = "weight01", statistic = "fisher") result.Fis_pelagic.TRC table_pelagic.MZ <- GenTable(GOdata_pelagic.MZ, Fis = result.Fis_pelagic.MZ, #Results from Fis test topNodes = length(result.Fis_pelagic.MZ@score), #Number of top GO terms to include in the table numChar = 50 #Number of characters to truncate the GO term definition to ) %>% rename(P_value = Fis) table_pelagic.TRC <- GenTable(GOdata_pelagic.TRC, Fis = result.Fis_pelagic.TRC, #Results from Fis test topNodes = length(result.Fis_pelagic.TRC@score), #Number of top GO terms to include in the table numChar = 50 #Number of characters to truncate the GO term definition to ) %>% rename(P_value = Fis) # showSigOfNodes(GOdata_pelagic.all, score(resultFis_pelagic.all), firstSigNodes = 5, # useInfo ='all',#additional info to be plotted to each node # reverse=T, #graph it upside down or not; default is T # wantedNodes = NULL, #can specify nodes you want to be able to find and they will be colored so you can see them easily # type=NULL,#can change to plot pie charts supposedly but no options are listed to specify this # showEdges=T,#default is T; if F removed arrow showing relationships # sigForAll=T) #score/p-value of all nodes in the DAG is shown, otherwise only the score for the sigNodes; default is T #Calculate number of significant GO terms for viz (MZ.pelagic_sigNodes <- table_pelagic.MZ %>% filter(as.numeric(P_value) < p) %>% summarize(n())) (TRC.pelagic_sigNodes <- table_pelagic.TRC %>% filter(as.numeric(P_value) < p) %>% summarize(n())) printGraph(GOdata_pelagic.MZ, result.Fis_pelagic.MZ, firstSigNodes = 3, fn.prefix = "~/R/R_working_dir/Collaborations/Albertson/Figures/pelagic.env_MZ.filt_BP.fisher", useInfo = "all", pdfSW = TRUE) printGraph(GOdata_pelagic.TRC, result.Fis_pelagic.TRC, firstSigNodes = 0, fn.prefix = "~/R/R_working_dir/Collaborations/Albertson/Figures/pelagic.env_TRC.filt_BP.fisher", useInfo = "all", pdfSW = TRUE) ``` ```{r enrichment analysis for benthic environment} #------------Using filtered dataset ie no species x species DEGs-------- ##Comparison 5: DEGs in preferred environment ##MZ.benthic vs TRC benthic qlf_MZ.benthic_v_TRC.benthic <- glmQLFTest(fit1, contrast=comparisons1[,"MZ.benthic_v_TRC.benthic"]) #Show number of genes differentially expressed (up and down) summary(decideTestsDGE(qlf_MZ.benthic_v_TRC.benthic)) #Run GO analysis on filtered dataset #Save DEGs DEGs_benthic <- as.data.frame(topTags(qlf_MZ.benthic_v_TRC.benthic, n=17525, p.value = p)) %>% rownames_to_column(var = "gene") %>% mutate(species_env = ifelse(logFC > 0, "MZ_benthic", "TRC_benthic")) #Filter out genes that are DE between species DEGs.in.benthic.all_filt <- DEGs_benthic %>% anti_join(DEGs_species2, by = "gene") %>% mutate(species_env = ifelse(logFC > 0, "MZ_benthic", "TRC_benthic")) %>% dplyr::select(gene, logFC, FDR, species_env) ####Get GO terms#### # GO_benthic <- biomartr::getGO(organism = "Maylandia zebra", # genes = DEGs.in.benthic.all_filt$gene, # filters = "ensembl_gene_id") # # GO_benthic <- GO_benthic %>% dplyr::rename(gene = ensembl_gene_id, GO_term = goslim_goa_description, GO_accession = goslim_goa_accession) # # #Combine with dataframe of all (filtered) DEGs # benthic_full.tbl <- GO_benthic %>% full_join(DEGs_benthic, by = "gene") %>% # dplyr::select(gene, GO_term, GO_accession, logFC, FDR, species_env) # # #% of genes for which there is at least one GO term # GO_benthic %>% distinct(gene) %>% # summarize(percent_genes_w_GO = round(n()/length(DEGs.in.benthic.all_filt$gene)*100, 1)) #--------------Separate by species_env---------------------- #Not sure if I'm going to use this ... #Add "_all" at end of benthic_full.tbl if wanting to use full dataset benthic_MZ <- DEGs.in.benthic.all_filt %>% filter(species_env == "MZ_benthic", FDR < p) benthic_TRC <- DEGs.in.benthic.all_filt %>% filter(species_env == "TRC_benthic", FDR < p) #Count GO terms # benthic_MZ_GOterm.sum <- benthic_MZ %>% group_by(GO_term) %>% # summarize(MZ_benthic = n()) %>% # arrange(desc(MZ_benthic)) # # benthic_TRC_GOterm.sum <- benthic_TRC %>% group_by(GO_term) %>% # summarize(TRC_benthic = n()) %>% # arrange(desc(TRC_benthic)) # # benthic_all_GOterm.sum <- benthic_MZ_GOterm.sum %>% full_join(benthic_TRC_GOterm.sum, by = "GO_term") # # #Write output files # write.csv(benthic_all_GOterm.sum, file = "output/benthic_MZ.benthic.v.TRC.benthic_all_GOterm.sum.csv", row.names = FALSE) MZ_DEG_names_benthic <- c(benthic_MZ$gene) #Convert to vector MZ_geneList_benthic <- factor(as.integer(all_gene_names.filt %in% MZ_DEG_names_benthic)) names(MZ_geneList_benthic) <- all_gene_names.filt TRC_DEG_names_benthic <- c(benthic_TRC$gene) #Convert to vector TRC_geneList_benthic <- factor(as.integer(all_gene_names.filt %in% TRC_DEG_names_benthic)) names(TRC_geneList_benthic) <- all_gene_names.filt #Create topGO object GOdata_benthic.MZ <- new("topGOdata", description = "GO enrichment analysis for benthic environment", ontology = "BP", #The 'ontology' argument can be 'BP' (biological process), 'MF' (molecular function), or 'CC' (cellular component) allGenes = MZ_geneList_benthic, #this is the 'gene universe' annot = annFUN.gene2GO, gene2GO = all.genes_list.filt) GOdata_benthic.TRC <- new("topGOdata", description = "GO enrichment analysis for benthic environment", ontology = "BP", #The 'ontology' argument can be 'BP' (biological process), 'MF' (molecular function), or 'CC' (cellular component) allGenes = TRC_geneList_benthic, #this is the 'gene universe' annot = annFUN.gene2GO, gene2GO = all.genes_list.filt) #Run the KS test using a hybrid of the elim and weighted algorithm #This is the default in topGO and seems like a solid default result.Fis_benthic.MZ <- runTest(GOdata_benthic.MZ, algorithm = "weight01", statistic = "fisher") result.Fis_benthic.MZ result.Fis_benthic.TRC <- runTest(GOdata_benthic.TRC, algorithm = "weight01", statistic = "fisher") result.Fis_benthic.TRC table_benthic.MZ <- GenTable(GOdata_benthic.MZ, Fis = result.Fis_benthic.MZ, #Results from Fis test topNodes = length(result.Fis_benthic.MZ@score), #Number of top GO terms to include in the table numChar = 50 #Number of characters to truncate the GO term definition to ) %>% rename(P_value = Fis) table_benthic.TRC <- GenTable(GOdata_benthic.TRC, Fis = result.Fis_benthic.TRC, #Results from Fis test topNodes = length(result.Fis_benthic.TRC@score), #Number of top GO terms to include in the table numChar = 50 #Number of characters to truncate the GO term definition to ) %>% rename(P_value = Fis) # showSigOfNodes(GOdata_benthic.all, score(resultFis_benthic.all), firstSigNodes = 5, # useInfo ='all',#additional info to be plotted to each node # reverse=T, #graph it upside down or not; default is T # wantedNodes = NULL, #can specify nodes you want to be able to find and they will be colored so you can see them easily # type=NULL,#can change to plot pie charts supposedly but no options are listed to specify this # showEdges=T,#default is T; if F removed arrow showing relationships # sigForAll=T) #score/p-value of all nodes in the DAG is shown, otherwise only the score for the sigNodes; default is T (MZ.benthic_sigNodes <- table_benthic.MZ %>% filter(as.numeric(P_value) < p) %>% summarize(n())) (TRC.benthic_sigNodes <- table_benthic.TRC %>% filter(as.numeric(P_value) < p) %>% summarize(n())) printGraph(GOdata_benthic.MZ, result.Fis_benthic.MZ, firstSigNodes = 0, fn.prefix = "~/R/R_working_dir/Collaborations/Albertson/Figures/benthic.env_MZ.filt_BP.fisher", useInfo = "all", pdfSW = TRUE) printGraph(GOdata_benthic.TRC, result.Fis_benthic.TRC, firstSigNodes = 0, fn.prefix = "~/R/R_working_dir/Collaborations/Albertson/Figures/benthic.env_TRC.filt_BP.fisher", useInfo = "all", pdfSW = TRUE) ``` ```{r species vs species enrichment analysis} qlf_speciesMZ_v_TRC <- glmQLFTest(fit2,contrast=comparisons2[,"MZ_v_TRC"] ) summary(decideTestsDGE(qlf_speciesMZ_v_TRC)) #17525 genes total ############Making topGO object for MBuna data#################### ##Save all genes regardless of FDR all_genes_species <- as.data.frame(topTags(qlf_speciesMZ_v_TRC, n=17525)) %>% #Save all genes regardless of FDR rownames_to_column(var = "gene") %>% mutate(species_env = ifelse(logFC > 0, "MZ", "TRC")) #%>% head(all_genes_species) ####Extract GO terms for each gene#### # GO_species_all <- biomartr::getGO(organism = "Maylandia zebra", # genes = all_genes_species$gene, # filters = "ensembl_gene_id") %>% # dplyr::rename(gene = ensembl_gene_id, # GO_term = goslim_goa_description, # GO_accession = goslim_goa_accession) #% of genes for which there is at least one GO term # GO_benthic_all %>% distinct(gene) %>% # summarize(percent_genes_w_GO = round(n()/length(all_genes_benthic$gene)*100, 1)) # #Combine with dataframe of all DEGs # species_full.tbl_all <- GO_species_all %>% full_join(all_genes_species, by = "gene") %>% # dplyr::select(gene, GO_term, GO_accession, logFC, FDR, species_env) #--------------Separate by species_env---------------------- #Not sure if I'm going to use this ... I AM species_MZ <- all_genes_species %>% filter(species_env == "MZ", FDR < p) species_TRC <- all_genes_species %>% filter(species_env == "TRC", FDR < p) #Count GO terms # species_MZ_GOterm.sum <- species_MZ %>% group_by(GO_term) %>% # summarize(MZ_species = n()) %>% # arrange(desc(MZ_species)) # # species_TRC_GOterm.sum <- species_TRC %>% group_by(GO_term) %>% # summarize(TRC_species = n()) %>% # arrange(desc(TRC_species)) # # species_all_GOterm.sum <- species_MZ_GOterm.sum %>% full_join(species_TRC_GOterm.sum, by = "GO_term") # # #Write output files # write.csv(species_all_GOterm.sum, file = "output/species_MZ.species.v.TRC.species_all_GOterm.sum.csv", row.names = FALSE) MZ_DEG_names_species <- c(species_MZ$gene) #Convert to vector MZ_geneList_species <- factor(as.integer(all_gene_names %in% MZ_DEG_names_species)) names(MZ_geneList_species) <- all_gene_names TRC_DEG_names_species <- c(species_TRC$gene) #Convert to vector TRC_geneList_species <- factor(as.integer(all_gene_names %in% TRC_DEG_names_species)) names(TRC_geneList_species) <- all_gene_names #Create topGO object using whole list of genes GOdata_species.MZ <- new("topGOdata", description = "GO enrichment analysis for species environment", ontology = "BP", #The 'ontology' argument can be 'BP' (biological process), 'MF' (molecular function), or 'CC' (cellular component) allGenes = MZ_geneList_species, #this is the 'gene universe' annot = annFUN.gene2GO, gene2GO = all.genes_list) GOdata_species.TRC <- new("topGOdata", description = "GO enrichment analysis for species environment", ontology = "BP", #The 'ontology' argument can be 'BP' (biological process), 'MF' (molecular function), or 'CC' (cellular component) allGenes = TRC_geneList_species, #this is the 'gene universe' annot = annFUN.gene2GO, gene2GO = all.genes_list) #Run the KS test using a hybrid of the elim and weighted algorithm #This is the default in topGO and seems like a solid default result.Fis_species.MZ <- runTest(GOdata_species.MZ, algorithm = "weight01", statistic = "fisher") result.Fis_species.MZ result.Fis_species.TRC <- runTest(GOdata_species.TRC, algorithm = "weight01", statistic = "fisher") result.Fis_species.TRC table_species.MZ <- GenTable(GOdata_species.MZ, Fis = result.Fis_species.MZ, #Results from Fis test topNodes = length(result.Fis_species.MZ@score), #Number of top GO terms to include in the table numChar = 50 #Number of characters to truncate the GO term definition to ) %>% rename(P_value = Fis) table_species.TRC <- GenTable(GOdata_species.TRC, Fis = result.Fis_species.TRC, #Results from Fis test topNodes = length(result.Fis_species.TRC@score), #Number of top GO terms to include in the table numChar = 50 #Number of characters to truncate the GO term definition to ) %>% rename(P_value = Fis) # showSigOfNodes(GOdata_species.all, score(resultFis_species.all), firstSigNodes = 5, # useInfo ='all',#additional info to be plotted to each node # reverse=T, #graph it upside down or not; default is T # wantedNodes = NULL, #can specify nodes you want to be able to find and they will be colored so you can see them easily # type=NULL,#can change to plot pie charts supposedly but no options are listed to specify this # showEdges=T,#default is T; if F removed arrow showing relationships # sigForAll=T) #score/p-value of all nodes in the DAG is shown, otherwise only the score for the sigNodes; default is T (MZ.species_sigNodes <- table_species.MZ %>% filter(as.numeric(P_value) < p) %>% summarize(n())) (TRC.species_sigNodes <- table_species.TRC %>% filter(as.numeric(P_value) < p) %>% summarize(n())) printGraph(GOdata_species.MZ, result.Fis_species.MZ, firstSigNodes = 5, fn.prefix = "~/R/R_working_dir/Collaborations/Albertson/Figures/species.collapsed_MZ_BP.fisher", useInfo = "all", pdfSW = TRUE) printGraph(GOdata_species.TRC, result.Fis_species.TRC, firstSigNodes = 6, fn.prefix = "~/R/R_working_dir/Collaborations/Albertson/Figures/species.collapsed_TRC_BP.fisher", useInfo = "all", pdfSW = TRUE) ``` ```{r enrichment analysis for MZ: benthic v pelagic} #------------Using filtered dataset ie no species x species DEGs-------- ##MZ.pelagic vs MZ.benthic qlf_MZ.benthic_v_MZ.pelagic <- glmQLFTest(fit1, contrast=comparisons1[,"MZ.benthic_v_MZ.pelagic"]) #Show number of genes differentially expressed (up and down) summary(decideTestsDGE(qlf_MZ.benthic_v_MZ.pelagic)) ############Making topGO object for MBuna data#################### ##Save all genes regardless of FDR DEGs_MZ <- as.data.frame(topTags(qlf_MZ.benthic_v_MZ.pelagic, n=17525, p.value = p)) %>% #Save all genes regardless of FDR rownames_to_column(var = "gene") %>% mutate(species_env = ifelse(logFC > 0, "MZ_benthic", "MZ_pelagic")) #%>% ####Extract GO terms for each gene#### # GO_MZ_all <- biomartr::getGO(organism = "Maylandia zebra", # genes = all_genes_MZ$gene, # filters = "ensembl_gene_id") %>% # dplyr::rename(gene = ensembl_gene_id, # GO_term = goslim_goa_description, # GO_accession = goslim_goa_accession) # # #% of genes for which there is at least one GO term # # GO_MZ_all %>% distinct(gene) %>% # # summarize(percent_genes_w_GO = round(n()/length(all_genes_MZ$gene)*100, 1)) # # # #Combine with dataframe of all DEGs # MZ_full.tbl_all <- GO_MZ_all %>% full_join(all_genes_MZ, by = "gene") %>% # dplyr::select(gene, GO_term, GO_accession, logFC, FDR, species_env) #--------------Separate by species_env---------------------- #Not sure if I'm going to use this ... UPDATE: I am MZ_benthic <- DEGs_MZ %>% filter(species_env == "MZ_benthic") MZ_pelagic <- DEGs_MZ %>% filter(species_env == "MZ_pelagic") #Count GO terms # MZ_benthic_GOterm.sum <- MZ_benthic %>% group_by(GO_term) %>% # summarize(MZ_benthic = n()) %>% # arrange(desc(MZ_benthic)) # # MZ_pelagic_GOterm.sum <- MZ_pelagic %>% group_by(GO_term) %>% # summarize(MZ_pelagic = n()) %>% # arrange(desc(MZ_pelagic)) # # MZ_all_GOterm.sum <- MZ_benthic_GOterm.sum %>% full_join(MZ_pelagic_GOterm.sum, by = "GO_term") # # #Write output files # write.csv(MZ_all_GOterm.sum, file = "output/MZ.benthic.v.MZ.pelagic_all_GOterm.sum.csv", row.names = FALSE) MZ_DEG_names_benthic <- c(MZ_benthic$gene) #Convert to vector MZ_geneList_benthic <- factor(as.integer(all_gene_names %in% MZ_DEG_names_benthic)) names(MZ_geneList_benthic) <- all_gene_names MZ_DEG_names_pelagic <- c(MZ_pelagic$gene) #Convert to vector MZ_geneList_pelagic <- factor(as.integer(all_gene_names %in% MZ_DEG_names_pelagic)) names(MZ_geneList_pelagic) <- all_gene_names #Create topGO object using lists of DEGs GOdata_MZ.benthic <- new("topGOdata", description = "GO enrichment analysis for pelagic environment", ontology = "BP", #The 'ontology' argument can be 'BP' (biological process), 'MF' (molecular function), or 'CC' (cellular component) allGenes = MZ_geneList_benthic, #this is the 'gene universe' annot = annFUN.gene2GO, gene2GO = all.genes_list) GOdata_MZ.pelagic <- new("topGOdata", description = "GO enrichment analysis for pelagic environment", ontology = "BP", #The 'ontology' argument can be 'BP' (biological process), 'MF' (molecular function), or 'CC' (cellular component) allGenes = MZ_geneList_pelagic, #this is the 'gene universe' annot = annFUN.gene2GO, gene2GO = all.genes_list) #Run the KS test using a hybrid of the elim and weighted algorithm #This is the default in topGO and seems like a solid default result.Fis_MZ.benthic <- runTest(GOdata_MZ.benthic, algorithm = "weight01", statistic = "fisher") result.Fis_MZ.benthic result.Fis_MZ.pelagic <- runTest(GOdata_MZ.pelagic, algorithm = "weight01", statistic = "fisher") result.Fis_MZ.pelagic table_MZ.benthic <- GenTable(GOdata_MZ.benthic, Fis = result.Fis_MZ.benthic, #Results from Fis test topNodes = length(result.Fis_MZ.benthic@score), #Number of top GO terms to include in the table numChar = 50 #Number of characters to truncate the GO term definition to ) %>% rename(P_value = Fis) table_MZ.pelagic <- GenTable(GOdata_MZ.pelagic, Fis = result.Fis_MZ.pelagic, #Results from Fis test topNodes = length(result.Fis_MZ.pelagic@score), #Number of top GO terms to include in the table numChar = 50 #Number of characters to truncate the GO term definition to ) %>% rename(P_value = Fis) # showSigOfNodes(GOdata_pelagic.all, score(resultFis_pelagic.all), firstSigNodes = 5, # useInfo ='all',#additional info to be plotted to each node # reverse=T, #graph it upside down or not; default is T # wantedNodes = NULL, #can specify nodes you want to be able to find and they will be colored so you can see them easily # type=NULL,#can change to plot pie charts supposedly but no options are listed to specify this # showEdges=T,#default is T; if F removed arrow showing relationships # sigForAll=T) #score/p-value of all nodes in the DAG is shown, otherwise only the score for the sigNodes; default is T (benthic.MZ_sigNodes <- table_MZ.benthic %>% filter(as.numeric(P_value) < p) %>% summarize(n())) (pelagic.MZ_sigNodes <- table_MZ.pelagic %>% filter(as.numeric(P_value) < p) %>% summarize(n())) printGraph(GOdata_MZ.benthic, result.Fis_MZ.benthic, firstSigNodes = 0, fn.prefix = "~/R/R_working_dir/Collaborations/Albertson/Figures/MZ.benthic_within.species_BP.fisher", useInfo = "all", pdfSW = TRUE) printGraph(GOdata_MZ.pelagic, result.Fis_MZ.pelagic, firstSigNodes = 2, fn.prefix = "~/R/R_working_dir/Collaborations/Albertson/Figures/MZ.pelagic_within.species_BP.fisher", useInfo = "all", pdfSW = TRUE) ```