# DSP - Volcano plot ![](https://i.imgur.com/PaQOKF3.png) ![](https://i.imgur.com/kZomhjT.jpg) 簡介 ``` # Labeled Volcano Plot # Version 1.1 # # Produces a Labeled Volcano Plot # Supports: DSP-nCounter Protein, DSP-nCounter RNA, DSP-NGS CTA, DSP-NGS WTA (mouse & human) # Note: this script can be run only after a DSPDA 'Statistical Tests' is performed # Please do not use spaces, special characters, or numbers when naming factors # that are used in a statistical test ``` * ==請填入filename== * ==選檔案格式== ``` # User Options # ############################## de_results_filename <- "VOLCANO PLOT.txt" # volcano plot results (tab delimited) filename # Plot Parameters output_format <- "jpg" # options: png, jpg, tiff, svg, pdf, & bmp ``` * ==填入x軸左端的標式== * ==填入x軸右端的標式== * ==過thredshold的基因最多要顯示幾個基因名稱==或是==必須顯示的基因名稱== (二擇一) eg: 顯示SMA,CD34在圖上 gene_list <- c("SMA","CD34") ``` ######################## LABELING ######################## negative_label <- "Left Label" # Negative (left) label for Fold Change from volcano plot positive_label <- "Right Label" # Positive (right) label for Fold Change from volcano plot n_genes <- 25 # Number of top genes to label, gene_list overrides this variable if set gene_list <- NULL #c("IL2RG", "GLUL", "SPIB", "C2") # genes to label no matter where they are in the volcano plot. This overrides n_genes ``` * ***Thresholds*** 預設pvalue 0.05 FDR 0.01 (先改成NULL) 有需要自行修改 ``` ####################### THRESHOLDS ####################### pval_thresh <- 0.05 # P-value threshold, must set fdr_thresh to NULL to use fdr_thresh <- 0.01 # FDR threshold, default threshold over pval_thresh ``` * ***調整字型大小*** ``` ######################### FONTS ########################## font_size <- 8 # Font Size label_size <- 2 # Label Font Size font_family <- "sans" # options include: serif, sans, mono ``` * ***圖的大小*** ``` ####################### PLOT SIZE ######################## plot_width <- 6 # Plot Width in inches plot_height <- 4 # Plot Height in inches ``` * ***調整顏色*** ![](https://i.imgur.com/J63lz3w.png) ``` ####################### COLORING ######################### default_color <- "grey65" # Default point color for points not in target groups or # above thresholds # Color options for plot. Must have more colors than # number of target groups color_options <- c("#3A6CA1", "#FFD861", "#CF4244", "#47BAB4", "#474747", "#EB739E", "#318026", "#A66293", "#F28E2B", "#8F6954") ``` * ***圖片title*** * ***是否顯示figure legend*** (TRUE/FALSE) ``` # Optional Labels plot_title <- "Enter Title Here" # Volcano Plot Title show_legend <- TRUE # Show color legend on figure ``` * ==設定fold change thresholds== * ==是否顯示fold change thresholds== ``` # Optional thresholds fc_thresh <- 0.5 # Fold Change threshold, if set coloring options will change label_fc <- FALSE # Should genes below fc_thresh be labeled # Optional Color parameters fc_color <- "grey30" # Color of points below fc_thresh but above pval and/or # fdr thresholds. change to the same as default_color # if you don't want these targets called out # Optional Target labeling target_groups <- NULL #c("Hemostasis", "DNA Repair") # Specific target groups to color in the plot # If variable is set, genes are colored by given target # group; else genes colored if they are above thresholds ``` 以下都不要改 ``` ########################################################## #### End of User Inputs. PLEASE DO NOT CHANGE CODE BELOW HERE #### ########################################################## # MIT License # Copyright 2020 Nanostring Technologies, Inc. # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # Contact us: # NanoString Technologies, Inc. # 530 Fairview Avenue N # Seattle, WA 98109 # Tel: (888) 358-6266 ############################## ############################## # Execution Code # ############################## # Libraries: library(ggplot2) library(ggrepel) library(testthat) library(scales) library(stats) library(stringr) # main function with GeoMxSet wrapper main <- function(obj1, obj2, obj3, obj4){ if(class(obj1) == "NanoStringGeoMxSet"){ dataset <- exprs(obj1) segmentAnnotations <- pData(obj1) targetAnnotations <- fData(obj1) outputFolder <- obj3 }else{ dataset <- obj1 segmentAnnotations <- obj2 targetAnnotations <- obj3 outputFolder <- obj4 } volcanoPlot(dataset = dataset, segmentAnnotations = segmentAnnotations, targetAnnotations = targetAnnotations, outputFolder = outputFolder) } # volcano plot function volcanoPlot <- function(dataset, segmentAnnotations, targetAnnotations, outputFolder){ if(!file.exists(de_results_filename)){ fail(message="Given volcano plot results file does not exist. Please check file name and if the volcano plot file was uploaded to DSPDA") } # access volcano plot file: de_results <- read.table(de_results_filename, header=TRUE, sep="\t", stringsAsFactors=FALSE) #check if header is removed from volcano plot file if(!"target.name" %in% tolower(colnames(de_results))){ #if header isn't removed in de_results_file then remove header and set column names header <- grep(pattern="target name", x=tolower(de_results)) if(length(header) > 0){ colnames(de_results) <- de_results[header, ] de_results <- de_results[-c(1:header), ] colnames(de_results) <- gsub(pattern="\\W", replacement=".", colnames(de_results)) if(any(startsWith(colnames(de_results), prefix="."))){ starts_with_num <- which(startsWith(colnames(de_results), prefix=".")) colnames(de_results)[starts_with_num] <- paste0("X", colnames(de_results)[starts_with_num]) } }else{ fail(message="Too many rows were removed from VOLCANO PLOT.xlsx before running script. The row with Target Name in the first column must be kept.") } de_results$Log2 <- as.numeric(de_results$Log2) de_results$Pvalue <- as.numeric(de_results$Pvalue) } #ensure consistent capitalization with Target.Name column name colnames(de_results)[which(tolower(colnames(de_results)) == "target.name")] <- "Target.Name" # test for valid input variables testVariableFormats(de=de_results) # calculate FDR and add it as a column de_results$FDR <- p.adjust(de_results$Pvalue, method="fdr") # create volcano plot volcanoPlot_results <- plotVolcano(de=de_results) if(output_format == "svg"){ svg(filename=paste0(outputFolder, "/volcano_plot_", plot_title, ".", output_format), width=plot_width, height=plot_height) print(volcanoPlot_results$plot) dev.off() }else{ ggsave(filename=paste0("volcano_plot_", plot_title, ".", output_format), plot=volcanoPlot_results$plot, device=output_format, path=outputFolder, height=plot_height, width=plot_width) } write.table(x=volcanoPlot_results$gene_labels, file=file.path(outputFolder, paste0("labeled_genes_", plot_title, ".txt"), fsep=.Platform$file.sep), sep="\t", quote=FALSE, row.names=FALSE) } #' plotVolcano #' #' create volcano plot figure using user input variables and DE results from DSPDA #' @param de data frame of DE results from DSPDA #' @return ggFigure ggplot of volcano plot #' @export plotVolcano <- function(de){ # determine highest x point to make volcano plot equal on both sides maxFC <- max(abs(de$Log2)) maxPval <- min(de$Pvalue) # find closest FDR value to fdr_thresh and use that pvalue to add y axis cutoff line if(!is.null(fdr_thresh)){ fdr_pval <- mean(de$Pvalue[which(abs(de$FDR - fdr_thresh) == min(abs(de$FDR - fdr_thresh)))]) }else{ fdr_pval <- NULL } # create basic volcano plot with correct formatting ggFigure <- ggplot(de, aes(x=Log2, y=Pvalue))+ geom_point(color=default_color)+ labs(y="Pvalue", x=paste(negative_label, "<-", "log2(FC)", "->", positive_label, sep=" "), title=plot_title)+ theme_bw(base_size = font_size) + theme(text = element_text(family = font_family))+ scale_x_continuous(limits=c(-maxFC, maxFC)) # this makes for easier testing if not running in DSPDA, will flip yaxis of graph # scale_y_continuous(trans=change_axis_revlog_trans(base=10), # labels=function(x) format(x, trim=TRUE, digits=4, # scientific=ifelse(maxPval < 0.0001, TRUE, FALSE), # drop0trailing=TRUE)) if(show_legend == FALSE){ ggFigure <- ggFigure + theme(legend.position="none") } # subset de to only include genes either in specified target groups or above pval/fdr threshold if(!is.null(target_groups)){ probe_groups <- strsplit(de$Target.group.membership.s, split=", ") de_subset_list <- lapply(X=target_groups, FUN=function(x){ #return de rows with specified target group named in column return(cbind(de[grep(x=de$Target.group.membership.s, pattern=x),], x)) }) names(de_subset_list) <- target_groups gene_coloring <- as.data.frame(do.call(rbind, de_subset_list)) names(gene_coloring)[names(gene_coloring) == "x"] <- "Target_coloring" gene_coloring$Target_coloring <- str_wrap(gene_coloring$Target_coloring, width=45) color_label <- "Target Group\nMembership" }else{ color_options <- color_options[1:2] if(!is.null(fdr_thresh) & !is.null(pval_thresh)){ if(fdr_pval < pval_thresh){ gene_coloring <- de[which(de$Pvalue < pval_thresh),] label_thresh_low <- paste("pval <", pval_thresh) label_thresh_high <- paste("FDR <", fdr_thresh) high_thresh <- fdr_pval }else{ gene_coloring <- de[which(de$Pvalue < fdr_pval),] label_thresh_low <- paste("FDR <", fdr_thresh) label_thresh_high <- paste("pval <", pval_thresh) high_thresh <- pval_thresh } # label points as positive or negative FC gene_coloring$Target_coloring <- ifelse(test=gene_coloring$Log2 < 0, yes=negative_label, no=positive_label) # label points as above pval or FDR threshold gene_coloring$Target_coloring <- ifelse(test=gene_coloring$Pvalue >= high_thresh, yes=paste(label_thresh_low, gene_coloring$Target_coloring), no=paste(label_thresh_high, gene_coloring$Target_coloring)) # make color options have muted colors for higher threshold color_options <- c(color_options, muted(color_options, l=80)) names(color_options) <- c(paste(label_thresh_high, negative_label), paste(label_thresh_high, positive_label), paste(label_thresh_low, negative_label), paste(label_thresh_low, positive_label)) }else{ if(is.null(fdr_thresh)){ gene_coloring <- de[which(de$Pvalue < pval_thresh),] label_thresh <- paste("pval <", pval_thresh) }else{ gene_coloring <- de[which(de$FDR < fdr_thresh),] label_thresh <- paste("FDR <", fdr_thresh) } gene_coloring$Target_coloring <- ifelse(test=gene_coloring$Log2 < 0, yes=paste(label_thresh, negative_label), no=paste(label_thresh, positive_label)) names(color_options) <- c(paste(label_thresh, negative_label), paste(label_thresh, positive_label)) } color_label <- "Significance:" # color by fc_thresh if fc_thresh is not NULL if(!is.null(fc_thresh)){ gene_coloring$Target_coloring[abs(gene_coloring$Log2) < fc_thresh] <- paste("FC <", fc_thresh) color_options <- c(color_options, fc_color) names(color_options)[length(color_options)] <- paste("FC <", fc_thresh) } } color_options <- c(color_options, default_color) names(color_options)[length(color_options)] <- "Not Specified" # add coloring to ggplot ggFigure <- ggFigure + geom_point(data=gene_coloring, aes(x=Log2, y=Pvalue, color=Target_coloring))+ labs(color=color_label)+ scale_color_manual(values=color_options) # add threshold line values to y axis yaxis <- data.frame(brk=as.numeric(pretty_breaks(n=4)(0:max(de$X.log10.pvalue)))) yaxis$brk <- 10^-(yaxis$brk) # keep scientific notation if small enough pvalues when changing to character yaxis$label <- format(yaxis$brk, trim=TRUE, digits=4, scientific=ifelse(maxPval < 0.0001, TRUE, FALSE), drop0trailing=TRUE) # add threshold lines if thresholds are not NULL if(!is.null(fc_thresh)){ ggFigure <- ggFigure + geom_vline(xintercept=fc_thresh, linetype="dotted")+ geom_vline(xintercept=-fc_thresh, linetype="dotted")+ annotate("text", x=fc_thresh+0.35, y=1,family=font_family, size=font_size/3, label=paste0("FC=", round(2^fc_thresh, digits=2))) } if(!is.null(pval_thresh)){ ggFigure <- ggFigure + geom_hline(yintercept=pval_thresh, linetype="dotted") yaxis <- rbind(yaxis, c(pval_thresh, paste0('pval=',pval_thresh))) } if(!is.null(fdr_thresh)){ ggFigure <- ggFigure + geom_hline(yintercept=fdr_pval, linetype="dotted") yaxis <- rbind(yaxis, c(fdr_pval, paste0('FDR=',fdr_thresh))) } # order yaxis in increasing value yaxis$brk <- as.numeric(yaxis$brk) yaxis <- yaxis[order(yaxis$brk, decreasing=F),] # subset de to only contain genes to label on plot, either by user specified genes or top n_genes by pval if(!is.null(gene_list)){ gene_labels <- subset(de, subset=Target.Name %in% gene_list) }else{ # remove gene labels for genes below lowest pvalue cutoff gene_labels <- de[which(de$Pvalue < min(fdr_pval, pval_thresh, na.rm = T)),] # only label genes above fc_thresh if set by user else only look at pvalue if(!is.null(fc_thresh) & label_fc == FALSE){ gene_labels <- gene_labels[which(abs(gene_labels$Log2) > fc_thresh),] } # only keep top # of genes by pvalue gene_labels <- gene_labels[head(order(gene_labels$Pvalue, decreasing=FALSE), n=n_genes),] } # add gene labels to ggplot ggFigure <- ggFigure + geom_text_repel(data=gene_labels, aes(x=Log2, y=Pvalue, label=Target.Name), family=font_family, force=5, fontface="bold", min.segment.length=0.1, size=label_size) # add y axis labels ggFigure <- ggFigure + scale_y_continuous(trans=change_axis_revlog_trans(base=10), breaks=as.numeric(yaxis$brk), labels=yaxis$label) colnames(gene_labels) <- c("Target tag", "Target group memership/s", "Target Name", "Log2", "Pvalue", "Adjusted pvalue", "-log10 pvalue", "-log10 adjusted pvalue", "FDR") volcanoPlot <- list(ggFigure, gene_labels) names(volcanoPlot) <- c("plot", "gene_labels") return(volcanoPlot) } #' testAreColors #' #' checks if all colors in a vector are valid color names #' @param colors color names #' @return TRUE/FALSE statement on valid color status #' @export testAreColors <- function(colors) { return(sapply(colors, function(X) { tryCatch(is.matrix(col2rgb(X)), error=function(e) FALSE) })) } #' testIdenticalClass #' #' raises error if given object is not the assumed variable class #' @param object object to determine variable class #' @param object_name name of object for error message #' @param class_name expected variable class #' @return None, errors out if class is not expected #' @export testIdenticalClass <- function(object, object_name, class_name){ expect_identical(object=class(object), expected=class_name, label=paste(object_name, "variable must be", class_name,"\n You've supplied a", class(object))) } #' testVariableFormats #' #' check all user input variables for class and other validity markers #' @param de data frame of DE results from DSPDA #' @return None, errors out if input is not valid #' @export testVariableFormats <- function(de=de_results){ # check user input de file for number of columns and column name matching expected_col_names <- c("Target.tag", "Target.group.membership.s", "Target.Name", "Log2", "Pvalue", "Adjusted.pvalue", "X.log10.pvalue", "X.log10.adjusted.pvalue") expect_identical(object=ncol(de), expected=length(expected_col_names), label="Number of columns in given volcano plot tab delimited file do not match expected. Make sure file is tab delimited") expect_identical(object=colnames(de), expected=expected_col_names, label="Column names in given volcano plot tab delimited file do not match expected.") # check that all target_groups have at least one gene if not NULL if(!is.null(target_groups)){ invisible(lapply(X=target_groups, FUN=function(x){ genes_in_group <- grep(x=de$Target.group.membership.s, pattern=x) if(length(genes_in_group) == 0){ fail(message=paste(x, "is not a valid probe group. Please check spelling or remove from target_groups before continuing")) } })) } ############################### USER DEFINED VARIABLE CLASS CHECKS ############################### numeric_variables <- c("n_genes", "pval_thresh", "fdr_thresh", "fc_thresh", "font_size", "label_size", "plot_width", "plot_height") character_variables <- c("plot_title", "gene_list", "target_groups", "negative_label", "positive_label") logical_variables <- c("show_legend", "label_fc") for(v in 1:length(numeric_variables)){ if(!is.null(eval(parse(text=numeric_variables[v])))){ testIdenticalClass(object=eval(parse(text=numeric_variables[v])), object_name=numeric_variables[v], class_name="numeric") } } for(v in 1:length(character_variables)){ if(!is.null(eval(parse(text=character_variables[v])))){ testIdenticalClass(object=eval(parse(text=character_variables[v])), object_name=character_variables[v], class_name="character") } } for(v in 1:length(logical_variables)){ if(!is.null(eval(parse(text=logical_variables[v])))){ testIdenticalClass(object=eval(parse(text=logical_variables[v])), object_name=logical_variables[v], class_name="logical") } } # check that either n_genes or gene_list is not NULL if(is.null(gene_list) & is.null(n_genes)){ fail(message="Either n_genes or gene_list must not be NULL both n_genes and gene_list are currently NULL") } # check that either pval_thresh or fdr_thresh is not NULL if(is.null(pval_thresh) & is.null(fdr_thresh)){ fail(message="Either fdr_thresh or pval_thresh must not be NULL both fdr_thresh and pval_thresh are currently NULL") } # check that gene_list only contains genes in de results if(!is.null(gene_list)){ expect_true(object=all(gene_list %in% de$Target.Name), label="At least one gene in gene_list does not match genes in volcano plot file results") } ################################# USER DEFINED VARIABLE CHECKS ################################## allowed_fonts <- c("serif", "sans", "mono") # check that font_family is an allowable font if(!font_family %in% allowed_fonts){ fail(message=paste(font_family, "is not a valid font. Allowed fonts are", paste(allowed_fonts, collapse=", "))) } # check that default_color is an allowable color if(!testAreColors(colors=default_color)){ fail(message=paste(paste(default_color, collapse=", "), "is not a valid color")) } # check that fc_color is an allowable color if(!testAreColors(colors=fc_color)){ fail(message=paste(paste(fc_color, collapse=", "), "is not a valid color")) } # check that color_options are all allowable colors if(!all(testAreColors(colors=color_options))){ error_colors <- color_options[which(!testAreColors(colors=color_options))] fail(message=paste(paste(error_colors, collapse=", "), "is/are not valid color(s)")) } # check that enough colors are given for labeled target groups expect_gte(object=length(color_options), expected=max(length(target_groups), 2), label=paste("Not enough colors were given./n", max(length(target_groups), 2), "expected, only", length(color_options), "given")) expected_output_format <- c("png", "jpg", "tiff", "svg", "pdf", "bmp") if(!output_format %in% expected_output_format){ fail(message=paste("Output format not in expected list of formats.\n", output_format, "given\n expected", paste(expected_output_format, collapse=", "))) } } #' change_axis_revlog_trans #' #' reverse log transform axis; used to return pvalue rather than -log10(pvalue) on yaxis #' @param base base in which logs are computed #' @return revlog_trans reverse log transformation #' @export change_axis_revlog_trans <- function(base=exp(1)){ trans <- function(x) -log(x, base) inv <- function(x) base^(-x) revlog_trans <- trans_new(name=paste0("revlog-", base), transform=trans, inverse=inv, breaks=log_breaks(base=base), domain=c(1e-100, Inf)) return(revlog_trans) } ```