McKenzie enrichment

# McKenzie enrichment Data from: McKenzie, A.T., Wang, M., Hauberg, M.E. et al. Brain Cell Type Specific Gene Expression and Co-expression Network Architectures. Sci Rep 8, 8868 (2018). https://doi.org/10.1038/s41598-018-27293-5 >In this manuscript, we first systematically evaluate cell type-specific RNA expression patterns identified in five of these RNA-seq studies15,16,17,18,19. The six cell types that we set out to compare are: astrocytes, endothelial cells, microglia, neurons, oligodendrocytes, and oligodendrocyte precursor cells (OPCs). We defined three criteria of ascription of cell type-associated gene expression: specificity, which measures whether a gene is expressed in only one cell type; enrichment, which measures whether a gene tends to have higher expression in one cell type compared to all others; and absolute expression, which measures whether a gene tends to have high expression in a given cell type, irrespective of its expression levels in other cell types. We showed that cell type enrichment patterns in the brain have high overall conservation in pairwise and global comparisons based on all three measures. Next, we interrogated marker genes most specific to each of the cell types, and found that they were significantly associated with text mining results in PubMed (https://www.ncbi.nlm.nih.gov/pubmed) searches, and that their aggregate expression patterns were well-correlated with immunohistochemistry-level data from the same brain samples. As a resource to the community, we release these gene signatures and functions for leveraging them in an R package, BRETIGEA (BRain cEll Type specIfic Gene Expression Analysis), which we validated on a postmortem brain gene expression data set. Finally, we perform network analysis on the scRNA-seq data sets using the multiscale network modeling approach MEGENA, showing that there is conservation of modules both within and across cell types. > # code ```bash library(data.table) library(triwise) library(openxlsx) library(gdata) library(ggplot2) library(ggpubr) library(dplyr) library(clusterProfiler) library(org.Hs.eg.db) library(DOSE) #our results files ps0 <- list.files(path = "/gpfs/scratch/jrest/csea/Genelists", full.names = TRUE,include.dirs = FALSE)[-9] #filter protein structure results --> remove all length normalized. load("/gpfs/scratch/jrest/csea/Results/Results_PS_MCL1.5.rda") #mimic original list - to test Results2 <- unique(subset(Results,Padj<0.1)$Gene_Symbol) #444 Results2b <- unique(subset(Results,Padj<0.1)$all_Human_Symbols) Results2c <- unique(unlist(sapply(Results2b,function(dx) unlist(strsplit(dx,", "))))) #495 names(Results2c) <- NULL Results2d <- unique(c(Results2,Results2c)) #503 Results2e <- Results2d[Results2d != "-"] #502 #Now filter strName <- sapply(Results$OG.strName,function(dx) strsplit(dx,".",fixed=TRUE)[[1]][2]) Results1 <- Results Results1$strName <- strName keepStrs <- c("disop" ,"e3" , "e32" , "g8" , "h3" , "e8", "tme8" , "esa" , "h32" , "msa2" , "tm" , "tmh8", "tmx8" , "bsa" , "bsa2" , "bsap" , "c3" , "diso" , "esap" , "h8" , "hn8" [28] "msa" "msan" "msap" "s8" "sn8" "t8" "tmn" "tmp" "tmp2" [37] "tn8" "b8" "bn8") Results2 <- unique(subset(Results,Padj<0.1)$Gene_Symbol) #444 Results2b <- unique(subset(Results,Padj<0.1)$all_Human_Symbols) Results2c <- unique(unlist(sapply(Results2b,function(dx) unlist(strsplit(dx,", "))))) #495 names(Results2c) <- NULL Results2d <- unique(c(Results2,Results2c)) #503 Results2e <- Results2d[Results2d != "-"] #502 #allbgs created; skip and load below #to create background sets, used aggregateV4resultsFullResiduals1.5only.getAllHumanGenes.nowForProteins or equivelant allbgs <- list() #CNV background set load("/gpfs/scratch/jrest/csea/Results/Results_CNV_ALLMCL.rda") bg <- unique(Results$Gene_Symbol) bg1 <- unique(Results$all_Human_Symbols) bg2 <- unique(unlist(sapply(bg1,function(dx) unlist(strsplit(dx,", "))))) names(bg2) <- NULL allbgs[[ps0[1]]] <- bg2 #16158 #Cpg load("/gpfs/scratch/jrest/csea/Results/Results_CPG_MCL1.5.rda") bg1 <- unique(Results$all_Human_Symbols) bg2 <- unique(unlist(sapply(bg1,function(dx) unlist(strsplit(dx,", "))))) names(bg2) <- NULL allbgs[[ps0[3]]] <- bg2 #12938 #Gene structure load("/gpfs/scratch/jrest/csea/Results/Results_GS_MCL1.5.rda") #combined intron length bg1 <- unique(subset(Results,grepl("CombinedIntron_Length",OG.strName))$all_Human_Symbols) bg2 <- unique(unlist(sapply(bg1,function(dx) unlist(strsplit(dx,", "))))) names(bg2) <- NULL allbgs[[ps0[2]]] <- bg2 #16516 #first intron length bg1 <- unique(subset(Results,grepl("FirstIntron_Length",OG.strName))$all_Human_Symbols) bg2 <- unique(unlist(sapply(bg1,function(dx) unlist(strsplit(dx,", "))))) names(bg2) <- NULL allbgs[[ps0[4]]] <- bg2 #16516 #FirstPostCDSIntron_Length bg1 <- unique(subset(Results,grepl("FirstPostCDSIntron_Length",OG.strName))$all_Human_Symbols) bg2 <- unique(unlist(sapply(bg1,function(dx) unlist(strsplit(dx,", "))))) names(bg2) <- NULL allbgs[[ps0[5]]] <- bg2 #16285 #all Gene structure bg1 <- unique(Results$all_Human_Symbols) bg2 <- unique(unlist(sapply(bg1,function(dx) unlist(strsplit(dx,", "))))) names(bg2) <- NULL allbgs[[ps0[6]]] <- bg2 #16575 #Num_AltTranscripts bg1 <- unique(subset(Results,grepl("Num_AltTranscripts",OG.strName))$all_Human_Symbols) bg2 <- unique(unlist(sapply(bg1,function(dx) unlist(strsplit(dx,", "))))) names(bg2) <- NULL allbgs[[ps0[7]]] <- bg2 #16309 #Num_Exons bg1 <- unique(subset(Results,grepl("Num_Exons",OG.strName))$all_Human_Symbols) bg2 <- unique(unlist(sapply(bg1,function(dx) unlist(strsplit(dx,", "))))) names(bg2) <- NULL allbgs[[ps0[8]]] <- bg2 #16551 #PS background set - deprecated load("/gpfs/scratch/jrest/csea/str_OGv4.ocl1.5All2.rda") #ocl1.5All2 bg <- unique(ocl1.5All2$Gene_Symbol) bg1 <- unique(c(ocl1.5All2$all_Human_Symbols,ocl1.5All2$Gene_Symbol.x)) bg2 <- unique(unlist(sapply(bg1,function(dx) unlist(strsplit(dx,", "))))) names(bg2) <- NULL allbgs[[ps0[9]]] <- bg2 #18892 #PS background set - no logs load("/gpfs/projects/RestGroup/josh/phenomegenome/structure/Results_PS_nolog_MCL1.5.rda") #Results bg1 <- unique(Results$all_Human_Symbols) bg2 <- unique(unlist(sapply(bg1,function(dxx) { ps3 <- unlist(strsplit(dxx,", ")[[1]]) ps4 <- ps3[!(ps3=="-")] return(ps4) } ))) #18657 allbgs[[ps0[9]]] <- bg2 #18892 load("/gpfs/scratch/jrest/csea/Results/Results_TFBS_MCL1.5_withRemapSupport.rda") #TFBS background set (with remap) Positive bg <- unique(subset(tfbsResults, remap_support.y==1 & Slopes > 0)$Gene_Symbol) bg1 <- unique(subset(tfbsResults, remap_support.y==1 & Slopes > 0)$all_Human_Symbols) bg2 <- unique(unlist(sapply(bg1,function(dx) unlist(strsplit(dx,", "))))) names(bg2) <- NULL allbgs[[ps0[10]]] <- bg2 allbgs[[ps0[11]]] <- bg2 #17474 #TFBS background set (with remap) Negative and Positive bg1 <- unique(subset(tfbsResults, remap_support.y==0 & Slopes < 0)$all_Human_Symbols) bg2 <- unlist(sapply(bg1,function(dx) unlist(strsplit(dx,", ")))) bg3 <- unique(subset(tfbsResults, remap_support.y==1 & Slopes > 0)$all_Human_Symbols) bg4 <- unlist(sapply(bg3,function(dx) unlist(strsplit(dx,", ")))) bg5 <- unique(c(bg2,bg4)) names(bg5) <- NULL allbgs[[ps0[12]]] <- bg5 allbgs[[ps0[13]]] <- bg5 #17939 #positive TFs background set (NOT the target genes) bg1 <- unique(subset(tfbsResults, remap_support.y==1 & Slopes > 0)$TF_Name) #458 Results2b <- unique(subset(tfbsResults, remap_support.y==1 & Slopes > 0 & Padj<0.1 )$TF_Name) # 356 #all background set length(allbgs[[ps0[14]]]) #19399; with ps logs allbgs[[ps0[14]]] <- NULL concatbgs <- unique(unlist(allbgs)) allbgs[[ps0[14]]] <- concatbgs length(allbgs[[ps0[14]]]) #19192; with no ps logs save(allbgs,file="/gpfs/home/jrest/allbgs.nologs.rda") #/gpfs/home/jrest load("/gpfs/home/jrest/allbgs.nologs.rda") #load all background set names(allbgs) <- sapply(names(allbgs),function(dx) strsplit(strsplit(dx,"/",fixed=TRUE)[[1]][7],".xlsx",fixed=TRUE)[[1]][1]) names(allbgs)[[2]] <- "PS_HitGenes_0.1_nolog" save(allbgs,file="/gpfs/home/jrest/allbgs.nologs.rn.rda") #/gpfs/home/jrest #our results files ps0 <- list.files(path = "/gpfs/projects/RestGroup/csea/Genelists", full.names = TRUE,include.dirs = FALSE)[-9] ``` ### spec2 - msig cell type expression #read in csea msig tsv files from Priya ```R psSimp <- list.files(path = "/gpfs/scratch/jrest/csea/EnrichmentGeneTsvFiles", full.names = TRUE,include.dirs = FALSE) psSimpTsvs <- lapply(psSimp,function(dx)t(read.delim(dx,header=FALSE,row.names=1))) psSimpT <- data.table(do.call("rbind",psSimpTsvs)) #(1) Descartes dataset - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7780123/ #(2) Fan dataset - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6028726/ #(3) Manno dataset - https://www.ncbi.nlm.nih.gov/pmc/articles/pmid/27716510/ #(4) Zhong dataset - https://www.nature.com/articles/nature25980 psSimp2 <- lapply(unique(psSimpT$PMID),function(dx){ ps2 <- subset(psSimpT,PMID==dx)[,c(1,20)] ps5 <- apply(ps2,1,function(dxx) { ps3 <- unlist(strsplit(unlist(dxx[2]),",")) ps4 <- ps3[!(ps3=="")] return(data.frame(Celltype=unlist(dxx[1]),gene=ps4)) } ) return(do.call("rbind",ps5)) }) spec2 <- as.data.table(psSimp2[[1]]) #for datasets 1:4 save(spec2,file="/gpfs/scratch/jrest/csea/MSIG.spec2.rda") #load("/gpfs/scratch/jrest/csea/MSIG.spec2.rda") #msig tsv files ``` ### Not sure how this is different from above - seems the same #The Excel file contains the statistics from the enrichment test of MGI phenotype gene signatures in the cell type specific marker gene signatures. ```R psSimp <- list.files(path = "/gpfs/scratch/jrest/csea/EnrichmentGeneTsvFiles", full.names = TRUE,include.dirs = FALSE) psSimpTsvs <- read.xlsx("/gpfs/scratch/jrest/csea/file3_41598_2018_27293_MOESM4_ESM.xlsx",startRow=3) psSimpTsvs$phenotype.celltype <- paste(psSimpTsvs$Phenotype,psSimpTsvs$Celltype,sep=".") #psSimpTsvs$phenotype.celltype <- psSimpTsvs$Celltype psSimp2 <- lapply(unique(psSimpTsvs$Phenotype.description),function(dx){ ps2 <- subset(psSimpTsvs,Phenotype.description==dx)[,c(11,10)] ps5 <- apply(ps2,1,function(dxx) { ps3 <- unlist(strsplit(unlist(dxx[2]),";")) ps4 <- ps3[!(ps3=="")] return(data.frame(Celltype=unlist(dxx[1]),gene=ps4)) } ) return(do.call("rbind",ps5)) }) spec2 <- do.call("rbind",psSimp2) #spec2 <- as.data.table(psSimp2[[1]]) #for datasets 1:4 ``` ## code to just return the significance of the cell type enrichments, without plots ```bash #backgrounds load("/gpfs/home/jrest/allbgs.nologs.rn.rda") #allbgs ps0 <- list.files(path = "/gpfs/projects/RestGroup/csea/Genelists", full.names = TRUE,include.dirs = FALSE,pattern = "\\.xlsx$")[-9] #for(i in ps0[c(2,6:13)]){ #1,3,5,14 #allOurResults1 <- lapply(ps0[c(2,4,6:13)],function(i){ #now includes 4, FirstIntron_Length_HitGenes_0.1, and 14, all lapply(names(allbgs)[c(4,6:12)],function(i){ #i <- ps0[14] ps <- NULL print(i) ps <- read.xlsx(paste0("/gpfs/projects/RestGroup/csea/Genelists/",i,".xlsx"),startRow=2,colNames=FALSE)$X2 print(paste("new result type",i,"of length",length(ps))) ps <- unique(ps) print(paste("unique length",length(ps))) save(ps,file=paste0("/gpfs/projects/RestGroup/csea/ps/",i,".ps.rda")) }) ### lateral cerebellum #or, from a results file: #load("/gpfs/projects/RestGroup/josh/phenomegenome/structure/Results_PS_nolog_MCL1.5.rda") #lateral cerebellum Results load("/gpfs/projects/RestGroup/csea/lateral_cereb_results/Results_TFBS_MCL1.5_withRemapSupport.rda") #lateral cerebellum Results tfbsResults ## Note different name Results <- subset(tfbsResults, remap_support.y==1 & Padj<0.1 ) #TFBS_HitGenes_PosNeg_0.1 i <- "TFBS_HitGenes_PosNeg_0.1" Results <- subset(tfbsResults, remap_support.y==1 & Slopes > 0 & Padj<0.1 ) #TFBS_HitGenes_Pos_0.1 i <- "TFBS_HitGenes_Pos_0.1" #hippocampus load("/gpfs/projects/RestGroup/csea/hippo_results/Results_TFBS_Hippocampus_MCL1.5.rda") #Results tfbsResults <- Results Results <- subset(tfbsResults, remap_support.y==1 & Padj<0.1 ) #TFBS_HitGenes_PosNeg_0.1 i <- "TFBS_HitGenes_PosNeg_0.1" load("/gpfs/projects/RestGroup/csea/hippo_results/Results_TFBS_Hippocampus_MCL1.5.rda") #Results tfbsResults <- Results Results <- subset(tfbsResults, remap_support.y==1 & Slopes > 0 & Padj<0.1 ) #TFBS_HitGenes_Pos_0.1 i <- "TFBS_HitGenes_Pos_0.1" load("/gpfs/projects/RestGroup/csea/hippo_results/Results_PS_nolog_hippo_MCL1.5.rda") #hippocampus Results i <- "PS_HitGenes_0.1_nolog" load("/gpfs/projects/RestGroup/csea/hippo_results/Results_GS_Hippocampus_MCL1.5.rda") #hippocampus Results i <- "GS_HitGenes_0.1" load("/gpfs/projects/RestGroup/csea/hippo_results/Results_CPG_Hippocampus_MCL1.5.rda") #hippocampus Results i <- "CpG_HitGenes_0.1" load("/gpfs/projects/RestGroup/csea/hippo_results/Results_TFBS_Hippocampus_MCL1.5.rda") #hippocampus Results i <- "CpG_HitGenes_0.1" Results2b <- unique(subset(Results,Padj<0.1)$all_Human_Symbols) ps <- unique(unlist(sapply(Results2b,function(dxx) { ps3 <- unlist(strsplit(dxx,", ")[[1]]) ps4 <- ps3[!(ps3=="-")] return(ps4) } ))) #save(ps,file=paste0("/gpfs/projects/RestGroup/csea/ps/PS_HitGenes_0.1_nolog_hippo.ps.rda")) ###end or from a results file #now do the enrichments allOurResults1 <- lapply(names(allbgs)[c(2)],function(i){ #,4,6:11 # i <- "/gpfs/scratch/jrest/csea/Genelists/PS_HitGenes_0.1.xlsx" #Make a plot for each sheet print(paste("start",i)) load(paste0("/gpfs/projects/RestGroup/csea/ps/",i,".ps.rda")) myEnrichList <- lapply(c(1,2,3,4,5,6,7,8,9),function(j){ print(j) spec1b <- read.xlsx("/gpfs/projects/RestGroup/csea/file1_41598_2018_27293_MOESM2_ESM.xlsx",sheet=j,startRow=3,colNames=TRUE) #if(j %in% c(1,2,3,7,8,9)){ #myfolds <- seq(0,7,0.25)} #Grand Means: sheets 1,2,7,8 #if(j %in% c(4,5,6)){ #myfolds <- seq(100,999,50)} #Grand Ranks: sheet 4,5 #myEnrich <- lapply(myfolds,function(fold){ #if(j %in% c(1,2,3,7,8,9)){ #spec2 <- as.data.table(subset(spec1b,grand_mean > fold))} # sheets 1,2,7, 8 #if(j %in% c(4,5,6)){ #spec2 <- as.data.table(subset(spec1b,grand_rank < fold)) #} # sheet 4,5 spec2 <- as.data.table(spec1b) #or from msig tsv files #load("/gpfs/scratch/jrest/csea/MSIG.spec2.rda") #msig tsv files, spec2 a <- NULL a <- split(spec2, by="Celltype") gsets2 <- lapply(a,function(dx) dx[1:500,]$gene[dx[1:500,]$gene %in% allbgs[[i]]]) ###& gsets2 <- lapply(a,function(dx) dx[1:500,]$gene[dx[1:500,]$gene %in% bg1]) z <- NULL z <- testEnrichment(Goi=ps, gsets=gsets2, background=allbgs[[i]], minknown = 2, mindiffexp = 2,maxknown = 1000) ###& z <- testEnrichment(Goi=Results2b, gsets=gsets2, background=bg1, minknown = 2, mindiffexp = 2,maxknown = 1000) z2 <- NULL #if(dim(z)[1] >0 ){z$thresh <- fold #z2 <- subset(z,qval<0.05) #z2 <- subset(z,pval<0.05) z2 <- z #if(dim(z2)[1] >0 ){ z2$genes <- apply(z2,1,function(dx){ paste(ps[ps %in% gsets2[[unlist(dx[4])]] ],collapse=",") }) z2$expgb <- sapply(gsets2[z2$gsetid],function(dx){ paste(dx,collapse=",") }) #z2$go <- sapply(1:dim(z2)[[1]],function(dx) gost(query = strsplit(z2$genes[dx],split=",",fixed=TRUE)[[1]], # organism = "hsapiens", ordered_query = FALSE, # multi_query = FALSE, significant = TRUE, # measure_underrepresentation = FALSE, evcodes = FALSE, # custom_bg = strsplit(z2$expgb[dx],split=",",fixed=TRUE)[[1]], # as_short_link = FALSE)) z2$go <- sapply(1:dim(z2)[[1]],function(dx) unlist(as.data.frame(enrichGO(gene = strsplit(z2$genes[dx],split=",",fixed=TRUE)[[1]], universe = strsplit(z2$expgb[dx],split=",",fixed=TRUE)[[1]], OrgDb = org.Hs.eg.db, ont = "ALL", keyType = 'SYMBOL', pAdjustMethod = "BH", pvalueCutoff = 0.01, qvalueCutoff = 0.05))$ID) ) z2$DO <- sapply(1:dim(z2)[[1]],function(dx) unlist(as.data.frame(enrichDO(gene = mapIds(org.Hs.eg.db, strsplit(z2$genes[dx],split=",",fixed=TRUE)[[1]], 'ENTREZID', 'SYMBOL'), universe = mapIds(org.Hs.eg.db, strsplit(z2$expgb[dx],split=",",fixed=TRUE)[[1]], 'ENTREZID', 'SYMBOL'), ont = "DO" ))$ID) ) # }} #return(z2) # }) #lappyly over mfolds #myEnrich2 <- (do.call("rbind",myEnrich)) #if(dim(myEnrich2)[1] >0 ){ sheetids <- c("top_all_enrich","top_human_enrich","top_mouse_enrich","top_all_expression","top_human_expression","top_mouse_expression","top_all_specificity","top_human_specificity","top_mouse_specificty") z2$sheet <- sheetids[j] #} #return(myEnrich2) return(z2) }) #for each sheet j myEnrichList2 <- NULL myEnrichList2 <- do.call("rbind",myEnrichList) #if(dim(myEnrichList2)[1]>0){ myEnrichList2$info <- i # strsplit(strsplit(i,"/")[[1]][7],".xlsx")[[1]][1] #myEnrichList2$info <- "PS_HitGenes_0.1_nologs" #openxlsx::write.xlsx(myEnrichList2,file="/gpfs/projects/RestGroup/csea/hippcampus.cpg.mkenzie.011823.xlsx",overwrite=TRUE) print(paste("end",i)) return(myEnrichList2) }) allOurResults2 <- do.call("rbind",allOurResults1) #openxlsx::write.xlsx(myEnrichList2,file="/gpfs/projects/RestGroup/csea/McKenzieEnrichALLP.tfbsPos0.1.remap.hippo.xlsx",overwrite=TRUE) openxlsx::write.xlsx(allOurResults2,file="/gpfs/projects/RestGroup/csea/allMcKenzieEnrichALLP.1216.xlsx",overwrite=TRUE) save(myEnrichList2,file=paste0("mkenzieEnrich.",strsplit(i,"/")[[1]][7],".rda")) openxlsx::write.xlsx(myEnrichList2,file=paste0("mkenzieEnrich.",strsplit(i,"/")[[1]][7]),overwrite=TRUE) } #read in tsv files from Priya psSimp <- list.files(path = "/gpfs/projects/RestGroup/csea/EnrichmentGeneTsvFiles", full.names = TRUE,include.dirs = FALSE) psSimpTsvs <- lapply(psSimp,function(dx)t(read.delim(dx,header=FALSE,row.names=1))) psSimpT <- data.table(do.call("rbind",psSimpTsvs)) #(1) Descartes dataset - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7780123/ #(2) Fan dataset - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6028726/ #(3) Manno dataset - https://www.ncbi.nlm.nih.gov/pmc/articles/pmid/27716510/ #(4) Zhong dataset - https://www.nature.com/articles/nature25980 psSimp2 <- lapply(unique(psSimpT$PMID),function(dx){ ps2 <- subset(psSimpT,PMID==dx)[,c(1,20)] ps5 <- apply(ps2,1,function(dxx) { ps3 <- unlist(strsplit(unlist(dxx[2]),",")) ps4 <- ps3[!(ps3=="")] return(data.frame(Celltype=unlist(dxx[1]),gene=ps4)) } ) return(do.call("rbind",ps5)) }) studyNames <- c("Descartes","Fan","Manno","Zhong") #for(for(i in ps0[c(2,6:13)])){ #for each of OUR results allOurResults <- lapply(ps0[c(2,6:13)],function(i){ ps <- unique(read.xlsx(i,startRow=2,colNames=FALSE)$X2) myEnrich <- lapply(1:4,function(q){ #for each of priyas four studies spec2 <- as.data.table(psSimp2[[q]]) #for datasets 1:4 a <- NULL a <- split(spec2, by="Celltype") gsets2 <- lapply(a,function(dx) dx$gene[dx$gene %in% allbgs[[i]]]) z <- NULL z <- testEnrichment(Goi=ps, gsets=gsets2, background=allbgs[[i]], minknown = 2, mindiffexp = 2,maxknown = 1000) z2 <- NULL if(dim(z)[1] >0 ){z$thresh <- studyNames[q] #z2 <- subset(z,qval<0.05) #z2 <- subset(z,pval<0.05) z2 <- z if(dim(z2)[1] >0 ){ z2$genes <- apply(z2,1,function(dx){ paste(ps[ps %in% gsets2[[unlist(dx[4])]] ],collapse=",") }) }} return(z2) }) #for each of four studies myEnrich2 <- NULL myEnrich2 <- (do.call("rbind",myEnrich)) if(dim(myEnrich2)[1] >0 ){ myEnrich2$sheet <- i} #openxlsx::write.xlsx(myEnrich,file="/gpfs/projects/RestGroup/csea/EnrichmentGeneTsv.TFBS_HitGenes_PosNeg_0.1.hippo.remap.xlsx",overwrite=TRUE) #myEnrich2$sheet <- "PS_HitGenes_0.1_nologs" return(myEnrich2) }) myEnrichList2 <- do.call("rbind",allOurResults) #openxlsx::write.xlsx(myEnrich2,file="psNoLog.EnrichmentGeneTsvALLRESULTS.xlsx",overwrite=TRUE) #save(myEnrichList2,file=paste0("mkenzieEnrich.",strsplit(i,"/")[[1]][7],".rda")) openxlsx::write.xlsx(myEnrichList2,file="EnrichmentGeneTsvALLRESULTS.xlsx",overwrite=TRUE) ### ############ #for mckenzie enrichment of encrichments allOurResults <- lapply(ps0[c(2,6:13)],function(i){ ps <- unique(read.xlsx(i,startRow=2,colNames=FALSE)$X2) #myEnrich <- lapply(1:4,function(q){ #for each of priyas four studies #spec2 <- as.data.table(psSimp2[[q]]) #for datasets 1:4 spec2 <- as.data.table(spec2) a <- NULL a <- split(spec2, by="Celltype") gsets2 <- lapply(a,function(dx) dx$gene[dx$gene %in% allbgs[[i]]]) z <- NULL z <- testEnrichment(Goi=ps, gsets=gsets2, background=allbgs[[i]], minknown = 2, mindiffexp = 2,maxknown = 1000) z2 <- NULL if(dim(z)[1] >0 ){z$thresh <- i # studyNames[q] #z2 <- subset(z,qval<0.05) z2 <- subset(z,pval<0.05) if(dim(z2)[1] >0 ){ z2$genes <- apply(z2,1,function(dx){ paste(ps[ps %in% gsets2[[unlist(dx[4])]] ],collapse=",") }) }} return(z2) }) myEnrichList2 <- do.call("rbind",allOurResults) #save(myEnrichList2,file=paste0("mkenzieEnrich.",strsplit(i,"/")[[1]][7],".rda")) openxlsx::write.xlsx(myEnrichList2,file="EnrichmentOfEnrichment.xlsx",overwrite=TRUE) ``` ## merge in info about the gene significant in our results load("mkenzieEnrich.PS_HitGenes_0.1.xlsx.rda") #myEnrichList2 myEnrichList3 <- unique(unlist(sapply(myEnrichList2$genes,function(dx) unlist(strsplit(dx,","))))) load("/gpfs/scratch/jrest/csea/Results/Results_PS_MCL1.5.rda") #mimic original list Results2 <- unique(subset(Results,Padj<0.1)$Gene_Symbol) #444 Results2b <- unique(subset(Results,Padj<0.1)$all_Human_Symbols) #s1 <- lapply(myEnrichList3,function(dx) #paste0(grep(dx,subset(Results,Padj<0.1)$Gene_Symbol,value=FALSE),collapse=",")) s2 <- unlist(lapply(myEnrichList3,function(dx) Results[grep(dx,subset(Results,Padj<0.1)$all_Human_Symbols,value=FALSE),"OG.strName"])) s3 <- unique(s2) s4 <- sapply(s3,function(dx) strsplit(dx,".",fixed=TRUE)[[1]][2]) allStrs <- unique(sapply(Results$OG.strName,function(dx) strsplit(dx,".",fixed=TRUE)[[1]][2])) allStrs [!allStrs %in% unique(s4) ] #not found enriched in any "tmh8" "bsa2" "b8" "bn8" table(s4) bsa bsan bsap c3 diso disop e3 e32 e8 en3 en8 esa esan 3 1 3 1 4 2 4 4 2 3 1 4 1 esap g8 gn8 h3 h32 h8 hn3 hn8 msa msa2 msan msap s8 2 1 1 2 5 3 1 1 1 4 5 5 3 sn8 t8 tm tme8 tmn tmp tmp2 tmx8 tn8 2 1 1 2 2 1 5 3 2 # enrichment results **ONLY Results with q-value < 0.05 are shown. Everything else left empty.** Sheets 4&5: Grand rank < thresholds tested: 100 200 300 400 500 600 700 800 900 100 Sheets 1,2,7,8: Grand mean > thresholds tested: 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5 6.0 sheet 1: top all enrich sheet 2: top human enrich sheet 4: top all expression sheet 5: top human expression sheet 7: top all specificity sheet 8: top human specificity # Results # CNV_HitGenes_0.1 ## sheet 1 (top all enrich) ## sheet 2 (top human enrich ) ## sheet 4 (top all expression ) ## sheet 5 (top human expression ) ## sheet 7 (top all specificity ) ## sheet 8 (top human specificity ) # CombinedIntron_Length_HitGenes_0.1 ## sheet 1 (top all enrich) ## sheet 2 (top human enrich ) ## sheet 4 (top all expression ) ## sheet 5 (top human expression ) ## sheet 7 (top all specificity ) ## sheet 8 (top human specificity ) # CpG_HitGenes_0.1 ## sheet 1 (top all enrich) ## sheet 2 (top human enrich ) ## sheet 4 (top all expression ) ## sheet 5 (top human expression ) ## sheet 7 (top all specificity ) ## sheet 8 (top human specificity ) # FirstIntron_Length_HitGenes_0.1 ## sheet 1 (top all enrich) ## sheet 2 (top human enrich ) ## sheet 4 (top all expression ) ## sheet 5 (top human expression ) ## deprecated ### script for printing the results to a formatted table... ``` #output file fc<-"/gpfs/scratch/jrest/csea/output.md" write("# Results",fc,append=FALSE) for(i in ps0){ ps <- read.xlsx(i,startRow=2,colNames=FALSE)$X2 write(paste("# ",i),fc,append=TRUE) #McKenzie data; specify sheet for(j in c(1,2,4,5,7,8)){ spec1b <- NULL spec1b <- read.xlsx("/gpfs/scratch/jrest/csea/file1_41598_2018_27293_MOESM2_ESM.xlsx",sheet=j,startRow=3,colNames=TRUE) write(paste("## sheet ",j),fc,append=TRUE) #Thresholds to test if(j %in% c(1,2,7,8)){ myfolds <- seq(0,6,0.5)} #Grand Means: sheets 1,2,7,8 if(j %in% c(4,5)){ myfolds <- seq(100,1000,100)} #Grand Ranks: sheet 4,5 spec2 <- NULL for(fold in myfolds){ if(j %in% c(1,2,7,8)){ spec2 <- as.data.table(subset(spec1b,grand_mean > fold))} # sheets 1,2,7, 8 if(j %in% c(4,5)){ spec2 <- as.data.table(subset(spec1b,grand_rank < fold))} # sheet 4,5 a <- split(spec2, by="Celltype") gsets2 <- lapply(a,function(dx) dx$gene[dx$gene %in% bg2]) z <- NULL z <- testEnrichment(Goi=ps, gsets=gsets2, background=bg2, minknown = 2, mindiffexp = 2,maxknown = 1000) if(dim(z)[1] >= 1){ z2 <- subset(z,qval<0.05) if(dim(z2)[1] >= 1){ write(paste("### threshold ",fold),fc,append=TRUE) # write.table(z2, fc, append=TRUE) write("```",fc,append=TRUE) gdata::write.fwf(z2, file = fc, append = TRUE, quote = FALSE, sep = "\t", colnames = TRUE) write("```",fc,append=TRUE) } } } #for each threshold } #for each mckenzie sheet j } #for of our gene lists i (protein structure, etc.) ``` # Notes ## Background Set I used the protein structure background for all. This should be fixed; protein structure has a slightly smaller background. ## Only about 70% of McKenzie genes are in our background set. `sum(gsets2[[1]] %in% bg2)` >693 of 1000, for example, in ast gset >others pseudogenes, FISH probe regions, etc. ## structure of gsets `str(gsets2)` ``` #List of 5 # $ ast: chr [1:693] "AQP4" "BMPR1B" "ETNPPL" "GJB6" ... # $ end: chr [1:779] "APOLD1" "CD34" "TGM2" "IFI27" ... # $ mic: chr [1:677] "CCL3" "CCL3L3" "CCL4L1" "CCL4" ... # $ neu: chr [1:767] "SYNPR" "RELN" "CNR1" "GAD2" ... # $ oli: chr [1:698] "UGT8" "PLP1" "ERMN" "CNDP1" ... ``` ## examples of Goi in gsets ``sum(ps$V1 %in% gsets2[[1]])`` >17 of 502 Goi in gsets2 (length 693) , for example ## deprecated Code for heatmap ```bash #1 and 3 and 4 and 5 aren't currently working? #do for i= 2,6:13; tfbs is 10:13 for(i in ps0[c(2,6:13)]){ #i <- ps0[14] ps <- NULL ps <- unique(read.xlsx(i,startRow=2,colNames=FALSE)$X2) print(paste("new result type",i,"of length",length(ps))) #Make a plot for each sheet myEnrichList <- lapply(c(1,2,3,4,5,6,7,8,9),function(j){ print(j) spec1b <- read.xlsx("/gpfs/scratch/jrest/csea/file1_41598_2018_27293_MOESM2_ESM.xlsx",sheet=j,startRow=3,colNames=TRUE) if(j %in% c(1,2,3,7,8,9)){ myfolds <- seq(0,7,0.25)} #Grand Means: sheets 1,2,7,8 if(j %in% c(4,5,6)){ myfolds <- seq(100,999,50)} #Grand Ranks: sheet 4,5 myEnrich <- lapply(myfolds,function(fold){ if(j %in% c(1,2,3,7,8,9)){ spec2 <- as.data.table(subset(spec1b,grand_mean > fold))} # sheets 1,2,7, 8 if(j %in% c(4,5,6)){ spec2 <- as.data.table(subset(spec1b,grand_rank < fold))} # sheet 4,5 a <- NULL a <- split(spec2, by="Celltype") gsets2 <- lapply(a,function(dx) dx$gene[dx$gene %in% allbgs[[i]]]) z <- NULL z <- testEnrichment(Goi=ps, gsets=gsets2, background=allbgs[[i]], minknown = 2, mindiffexp = 2,maxknown = 1000) z2 <- NULL if(dim(z)[1] >0 ){z$thresh <- fold} return(z) }) #lappyly over mfolds myEnrich2 <- NULL myEnrich2 <- (do.call("rbind",myEnrich)) enrichPlot <- NULL enrichPlot <- ggplot(myEnrich2) + geom_tile(data = myEnrich2 %>% filter(qval < .05), aes(x = gsetid,y = factor(thresh, levels = unique(thresh))),fill = "red") + geom_tile(data = myEnrich2 %>% filter(qval < .1 & qval >= .05), aes(x = gsetid,y = factor(thresh, levels = unique(thresh))),fill = "pink") + geom_tile(data = myEnrich2 %>% filter( qval >= .1 & qval < .5), aes(x = gsetid,y = factor(thresh, levels = unique(thresh))),fill = "lightcyan1") + geom_tile(data = myEnrich2 %>% filter( qval >= .5), aes(x = gsetid,y = factor(thresh, levels = unique(thresh))),fill = "grey") + xlab("cell type") + ylab(paste("sheet",j,"threshold")) + theme_light() return(enrichPlot) }) #for each sheet j pdf(paste0(strsplit(i,"/")[[1]][7],".enrich.pdf")) print(ggarrange(myEnrichList[[1]],myEnrichList[[2]],myEnrichList[[3]],myEnrichList[[4]], myEnrichList[[5]],myEnrichList[[6]],myEnrichList[[7]],myEnrichList[[8]],myEnrichList[[9]], ncol = 3, nrow = 3) ) dev.off() } ``` ###### tags: `pgwas`