# McKenzie enrichment
Data from: McKenzie, A.T., Wang, M., Hauberg, M.E. et al. Brain Cell Type Specific Gene Expression and Co-expression Network Architectures. Sci Rep 8, 8868 (2018). https://doi.org/10.1038/s41598-018-27293-5
>In this manuscript, we first systematically evaluate cell type-specific RNA expression patterns identified in five of these RNA-seq studies15,16,17,18,19. The six cell types that we set out to compare are: astrocytes, endothelial cells, microglia, neurons, oligodendrocytes, and oligodendrocyte precursor cells (OPCs). We defined three criteria of ascription of cell type-associated gene expression: specificity, which measures whether a gene is expressed in only one cell type; enrichment, which measures whether a gene tends to have higher expression in one cell type compared to all others; and absolute expression, which measures whether a gene tends to have high expression in a given cell type, irrespective of its expression levels in other cell types. We showed that cell type enrichment patterns in the brain have high overall conservation in pairwise and global comparisons based on all three measures. Next, we interrogated marker genes most specific to each of the cell types, and found that they were significantly associated with text mining results in PubMed (https://www.ncbi.nlm.nih.gov/pubmed) searches, and that their aggregate expression patterns were well-correlated with immunohistochemistry-level data from the same brain samples. As a resource to the community, we release these gene signatures and functions for leveraging them in an R package, BRETIGEA (BRain cEll Type specIfic Gene Expression Analysis), which we validated on a postmortem brain gene expression data set. Finally, we perform network analysis on the scRNA-seq data sets using the multiscale network modeling approach MEGENA, showing that there is conservation of modules both within and across cell types.
>
# code
```bash
library(data.table)
library(triwise)
library(openxlsx)
library(gdata)
library(ggplot2)
library(ggpubr)
library(dplyr)
library(clusterProfiler)
library(org.Hs.eg.db)
library(DOSE)
#our results files
ps0 <- list.files(path = "/gpfs/scratch/jrest/csea/Genelists", full.names = TRUE,include.dirs = FALSE)[-9]
#filter protein structure results --> remove all length normalized.
load("/gpfs/scratch/jrest/csea/Results/Results_PS_MCL1.5.rda")
#mimic original list - to test
Results2 <- unique(subset(Results,Padj<0.1)$Gene_Symbol) #444
Results2b <- unique(subset(Results,Padj<0.1)$all_Human_Symbols)
Results2c <- unique(unlist(sapply(Results2b,function(dx) unlist(strsplit(dx,", "))))) #495
names(Results2c) <- NULL
Results2d <- unique(c(Results2,Results2c)) #503
Results2e <- Results2d[Results2d != "-"] #502
#Now filter
strName <- sapply(Results$OG.strName,function(dx) strsplit(dx,".",fixed=TRUE)[[1]][2])
Results1 <- Results
Results1$strName <- strName
keepStrs <- c("disop" ,"e3" , "e32" , "g8" , "h3" , "e8", "tme8" , "esa" , "h32" , "msa2" , "tm" , "tmh8", "tmx8" , "bsa" , "bsa2" , "bsap" , "c3" , "diso" , "esap" , "h8" , "hn8"
[28] "msa" "msan" "msap" "s8" "sn8" "t8" "tmn" "tmp" "tmp2"
[37] "tn8" "b8" "bn8")
Results2 <- unique(subset(Results,Padj<0.1)$Gene_Symbol) #444
Results2b <- unique(subset(Results,Padj<0.1)$all_Human_Symbols)
Results2c <- unique(unlist(sapply(Results2b,function(dx) unlist(strsplit(dx,", "))))) #495
names(Results2c) <- NULL
Results2d <- unique(c(Results2,Results2c)) #503
Results2e <- Results2d[Results2d != "-"] #502
#allbgs created; skip and load below
#to create background sets, used aggregateV4resultsFullResiduals1.5only.getAllHumanGenes.nowForProteins or equivelant
allbgs <- list()
#CNV background set
load("/gpfs/scratch/jrest/csea/Results/Results_CNV_ALLMCL.rda")
bg <- unique(Results$Gene_Symbol)
bg1 <- unique(Results$all_Human_Symbols)
bg2 <- unique(unlist(sapply(bg1,function(dx) unlist(strsplit(dx,", ")))))
names(bg2) <- NULL
allbgs[[ps0[1]]] <- bg2
#16158
#Cpg
load("/gpfs/scratch/jrest/csea/Results/Results_CPG_MCL1.5.rda")
bg1 <- unique(Results$all_Human_Symbols)
bg2 <- unique(unlist(sapply(bg1,function(dx) unlist(strsplit(dx,", ")))))
names(bg2) <- NULL
allbgs[[ps0[3]]] <- bg2
#12938
#Gene structure
load("/gpfs/scratch/jrest/csea/Results/Results_GS_MCL1.5.rda")
#combined intron length
bg1 <- unique(subset(Results,grepl("CombinedIntron_Length",OG.strName))$all_Human_Symbols)
bg2 <- unique(unlist(sapply(bg1,function(dx) unlist(strsplit(dx,", ")))))
names(bg2) <- NULL
allbgs[[ps0[2]]] <- bg2
#16516
#first intron length
bg1 <- unique(subset(Results,grepl("FirstIntron_Length",OG.strName))$all_Human_Symbols)
bg2 <- unique(unlist(sapply(bg1,function(dx) unlist(strsplit(dx,", ")))))
names(bg2) <- NULL
allbgs[[ps0[4]]] <- bg2
#16516
#FirstPostCDSIntron_Length
bg1 <- unique(subset(Results,grepl("FirstPostCDSIntron_Length",OG.strName))$all_Human_Symbols)
bg2 <- unique(unlist(sapply(bg1,function(dx) unlist(strsplit(dx,", ")))))
names(bg2) <- NULL
allbgs[[ps0[5]]] <- bg2
#16285
#all Gene structure
bg1 <- unique(Results$all_Human_Symbols)
bg2 <- unique(unlist(sapply(bg1,function(dx) unlist(strsplit(dx,", ")))))
names(bg2) <- NULL
allbgs[[ps0[6]]] <- bg2
#16575
#Num_AltTranscripts
bg1 <- unique(subset(Results,grepl("Num_AltTranscripts",OG.strName))$all_Human_Symbols)
bg2 <- unique(unlist(sapply(bg1,function(dx) unlist(strsplit(dx,", ")))))
names(bg2) <- NULL
allbgs[[ps0[7]]] <- bg2
#16309
#Num_Exons
bg1 <- unique(subset(Results,grepl("Num_Exons",OG.strName))$all_Human_Symbols)
bg2 <- unique(unlist(sapply(bg1,function(dx) unlist(strsplit(dx,", ")))))
names(bg2) <- NULL
allbgs[[ps0[8]]] <- bg2
#16551
#PS background set - deprecated
load("/gpfs/scratch/jrest/csea/str_OGv4.ocl1.5All2.rda") #ocl1.5All2
bg <- unique(ocl1.5All2$Gene_Symbol)
bg1 <- unique(c(ocl1.5All2$all_Human_Symbols,ocl1.5All2$Gene_Symbol.x))
bg2 <- unique(unlist(sapply(bg1,function(dx) unlist(strsplit(dx,", ")))))
names(bg2) <- NULL
allbgs[[ps0[9]]] <- bg2
#18892
#PS background set - no logs
load("/gpfs/projects/RestGroup/josh/phenomegenome/structure/Results_PS_nolog_MCL1.5.rda") #Results
bg1 <- unique(Results$all_Human_Symbols)
bg2 <- unique(unlist(sapply(bg1,function(dxx) {
ps3 <- unlist(strsplit(dxx,", ")[[1]])
ps4 <- ps3[!(ps3=="-")]
return(ps4)
}
)))
#18657
allbgs[[ps0[9]]] <- bg2
#18892
load("/gpfs/scratch/jrest/csea/Results/Results_TFBS_MCL1.5_withRemapSupport.rda")
#TFBS background set (with remap) Positive
bg <- unique(subset(tfbsResults, remap_support.y==1 & Slopes > 0)$Gene_Symbol)
bg1 <- unique(subset(tfbsResults, remap_support.y==1 & Slopes > 0)$all_Human_Symbols)
bg2 <- unique(unlist(sapply(bg1,function(dx) unlist(strsplit(dx,", ")))))
names(bg2) <- NULL
allbgs[[ps0[10]]] <- bg2
allbgs[[ps0[11]]] <- bg2
#17474
#TFBS background set (with remap) Negative and Positive
bg1 <- unique(subset(tfbsResults, remap_support.y==0 & Slopes < 0)$all_Human_Symbols)
bg2 <- unlist(sapply(bg1,function(dx) unlist(strsplit(dx,", "))))
bg3 <- unique(subset(tfbsResults, remap_support.y==1 & Slopes > 0)$all_Human_Symbols)
bg4 <- unlist(sapply(bg3,function(dx) unlist(strsplit(dx,", "))))
bg5 <- unique(c(bg2,bg4))
names(bg5) <- NULL
allbgs[[ps0[12]]] <- bg5
allbgs[[ps0[13]]] <- bg5
#17939
#positive TFs background set (NOT the target genes)
bg1 <- unique(subset(tfbsResults, remap_support.y==1 & Slopes > 0)$TF_Name) #458
Results2b <- unique(subset(tfbsResults, remap_support.y==1 & Slopes > 0 & Padj<0.1 )$TF_Name) # 356
#all background set
length(allbgs[[ps0[14]]]) #19399; with ps logs
allbgs[[ps0[14]]] <- NULL
concatbgs <- unique(unlist(allbgs))
allbgs[[ps0[14]]] <- concatbgs
length(allbgs[[ps0[14]]]) #19192; with no ps logs
save(allbgs,file="/gpfs/home/jrest/allbgs.nologs.rda") #/gpfs/home/jrest
load("/gpfs/home/jrest/allbgs.nologs.rda") #load all background set
names(allbgs) <- sapply(names(allbgs),function(dx) strsplit(strsplit(dx,"/",fixed=TRUE)[[1]][7],".xlsx",fixed=TRUE)[[1]][1])
names(allbgs)[[2]] <- "PS_HitGenes_0.1_nolog"
save(allbgs,file="/gpfs/home/jrest/allbgs.nologs.rn.rda") #/gpfs/home/jrest
#our results files
ps0 <- list.files(path = "/gpfs/projects/RestGroup/csea/Genelists", full.names = TRUE,include.dirs = FALSE)[-9]
```
### spec2 - msig cell type expression
#read in csea msig tsv files from Priya
```R
psSimp <- list.files(path = "/gpfs/scratch/jrest/csea/EnrichmentGeneTsvFiles", full.names = TRUE,include.dirs = FALSE)
psSimpTsvs <- lapply(psSimp,function(dx)t(read.delim(dx,header=FALSE,row.names=1)))
psSimpT <- data.table(do.call("rbind",psSimpTsvs))
#(1) Descartes dataset - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7780123/
#(2) Fan dataset - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6028726/
#(3) Manno dataset - https://www.ncbi.nlm.nih.gov/pmc/articles/pmid/27716510/
#(4) Zhong dataset - https://www.nature.com/articles/nature25980
psSimp2 <- lapply(unique(psSimpT$PMID),function(dx){
ps2 <- subset(psSimpT,PMID==dx)[,c(1,20)]
ps5 <- apply(ps2,1,function(dxx) {
ps3 <- unlist(strsplit(unlist(dxx[2]),","))
ps4 <- ps3[!(ps3=="")]
return(data.frame(Celltype=unlist(dxx[1]),gene=ps4))
}
)
return(do.call("rbind",ps5))
})
spec2 <- as.data.table(psSimp2[[1]]) #for datasets 1:4
save(spec2,file="/gpfs/scratch/jrest/csea/MSIG.spec2.rda")
#load("/gpfs/scratch/jrest/csea/MSIG.spec2.rda") #msig tsv files
```
### Not sure how this is different from above - seems the same
#The Excel file contains the statistics from the enrichment test of MGI phenotype gene signatures in the cell type specific marker gene signatures.
```R
psSimp <- list.files(path = "/gpfs/scratch/jrest/csea/EnrichmentGeneTsvFiles", full.names = TRUE,include.dirs = FALSE)
psSimpTsvs <- read.xlsx("/gpfs/scratch/jrest/csea/file3_41598_2018_27293_MOESM4_ESM.xlsx",startRow=3)
psSimpTsvs$phenotype.celltype <- paste(psSimpTsvs$Phenotype,psSimpTsvs$Celltype,sep=".")
#psSimpTsvs$phenotype.celltype <- psSimpTsvs$Celltype
psSimp2 <- lapply(unique(psSimpTsvs$Phenotype.description),function(dx){
ps2 <- subset(psSimpTsvs,Phenotype.description==dx)[,c(11,10)]
ps5 <- apply(ps2,1,function(dxx) {
ps3 <- unlist(strsplit(unlist(dxx[2]),";"))
ps4 <- ps3[!(ps3=="")]
return(data.frame(Celltype=unlist(dxx[1]),gene=ps4))
}
)
return(do.call("rbind",ps5))
})
spec2 <- do.call("rbind",psSimp2)
#spec2 <- as.data.table(psSimp2[[1]]) #for datasets 1:4
```
## code to just return the significance of the cell type enrichments, without plots
```bash
#backgrounds
load("/gpfs/home/jrest/allbgs.nologs.rn.rda") #allbgs
ps0 <- list.files(path = "/gpfs/projects/RestGroup/csea/Genelists", full.names = TRUE,include.dirs = FALSE,pattern = "\\.xlsx$")[-9]
#for(i in ps0[c(2,6:13)]){ #1,3,5,14
#allOurResults1 <- lapply(ps0[c(2,4,6:13)],function(i){ #now includes 4, FirstIntron_Length_HitGenes_0.1, and 14, all
lapply(names(allbgs)[c(4,6:12)],function(i){
#i <- ps0[14]
ps <- NULL
print(i)
ps <- read.xlsx(paste0("/gpfs/projects/RestGroup/csea/Genelists/",i,".xlsx"),startRow=2,colNames=FALSE)$X2
print(paste("new result type",i,"of length",length(ps)))
ps <- unique(ps)
print(paste("unique length",length(ps)))
save(ps,file=paste0("/gpfs/projects/RestGroup/csea/ps/",i,".ps.rda"))
})
### lateral cerebellum
#or, from a results file:
#load("/gpfs/projects/RestGroup/josh/phenomegenome/structure/Results_PS_nolog_MCL1.5.rda") #lateral cerebellum Results
load("/gpfs/projects/RestGroup/csea/lateral_cereb_results/Results_TFBS_MCL1.5_withRemapSupport.rda") #lateral cerebellum Results tfbsResults ## Note different name
Results <- subset(tfbsResults, remap_support.y==1 & Padj<0.1 ) #TFBS_HitGenes_PosNeg_0.1
i <- "TFBS_HitGenes_PosNeg_0.1"
Results <- subset(tfbsResults, remap_support.y==1 & Slopes > 0 & Padj<0.1 ) #TFBS_HitGenes_Pos_0.1
i <- "TFBS_HitGenes_Pos_0.1"
#hippocampus
load("/gpfs/projects/RestGroup/csea/hippo_results/Results_TFBS_Hippocampus_MCL1.5.rda") #Results
tfbsResults <- Results
Results <- subset(tfbsResults, remap_support.y==1 & Padj<0.1 ) #TFBS_HitGenes_PosNeg_0.1
i <- "TFBS_HitGenes_PosNeg_0.1"
load("/gpfs/projects/RestGroup/csea/hippo_results/Results_TFBS_Hippocampus_MCL1.5.rda") #Results
tfbsResults <- Results
Results <- subset(tfbsResults, remap_support.y==1 & Slopes > 0 & Padj<0.1 ) #TFBS_HitGenes_Pos_0.1
i <- "TFBS_HitGenes_Pos_0.1"
load("/gpfs/projects/RestGroup/csea/hippo_results/Results_PS_nolog_hippo_MCL1.5.rda") #hippocampus Results
i <- "PS_HitGenes_0.1_nolog"
load("/gpfs/projects/RestGroup/csea/hippo_results/Results_GS_Hippocampus_MCL1.5.rda") #hippocampus Results
i <- "GS_HitGenes_0.1"
load("/gpfs/projects/RestGroup/csea/hippo_results/Results_CPG_Hippocampus_MCL1.5.rda") #hippocampus Results
i <- "CpG_HitGenes_0.1"
load("/gpfs/projects/RestGroup/csea/hippo_results/Results_TFBS_Hippocampus_MCL1.5.rda") #hippocampus Results
i <- "CpG_HitGenes_0.1"
Results2b <- unique(subset(Results,Padj<0.1)$all_Human_Symbols)
ps <- unique(unlist(sapply(Results2b,function(dxx) {
ps3 <- unlist(strsplit(dxx,", ")[[1]])
ps4 <- ps3[!(ps3=="-")]
return(ps4)
}
)))
#save(ps,file=paste0("/gpfs/projects/RestGroup/csea/ps/PS_HitGenes_0.1_nolog_hippo.ps.rda"))
###end or from a results file
#now do the enrichments
allOurResults1 <- lapply(names(allbgs)[c(2)],function(i){ #,4,6:11
# i <- "/gpfs/scratch/jrest/csea/Genelists/PS_HitGenes_0.1.xlsx"
#Make a plot for each sheet
print(paste("start",i))
load(paste0("/gpfs/projects/RestGroup/csea/ps/",i,".ps.rda"))
myEnrichList <- lapply(c(1,2,3,4,5,6,7,8,9),function(j){
print(j)
spec1b <- read.xlsx("/gpfs/projects/RestGroup/csea/file1_41598_2018_27293_MOESM2_ESM.xlsx",sheet=j,startRow=3,colNames=TRUE)
#if(j %in% c(1,2,3,7,8,9)){
#myfolds <- seq(0,7,0.25)} #Grand Means: sheets 1,2,7,8
#if(j %in% c(4,5,6)){
#myfolds <- seq(100,999,50)} #Grand Ranks: sheet 4,5
#myEnrich <- lapply(myfolds,function(fold){
#if(j %in% c(1,2,3,7,8,9)){
#spec2 <- as.data.table(subset(spec1b,grand_mean > fold))} # sheets 1,2,7, 8
#if(j %in% c(4,5,6)){
#spec2 <- as.data.table(subset(spec1b,grand_rank < fold)) #} # sheet 4,5
spec2 <- as.data.table(spec1b)
#or from msig tsv files
#load("/gpfs/scratch/jrest/csea/MSIG.spec2.rda") #msig tsv files, spec2
a <- NULL
a <- split(spec2, by="Celltype")
gsets2 <- lapply(a,function(dx) dx[1:500,]$gene[dx[1:500,]$gene %in% allbgs[[i]]])
###& gsets2 <- lapply(a,function(dx) dx[1:500,]$gene[dx[1:500,]$gene %in% bg1])
z <- NULL
z <- testEnrichment(Goi=ps, gsets=gsets2, background=allbgs[[i]], minknown = 2, mindiffexp = 2,maxknown = 1000)
###& z <- testEnrichment(Goi=Results2b, gsets=gsets2, background=bg1, minknown = 2, mindiffexp = 2,maxknown = 1000)
z2 <- NULL
#if(dim(z)[1] >0 ){z$thresh <- fold
#z2 <- subset(z,qval<0.05)
#z2 <- subset(z,pval<0.05)
z2 <- z
#if(dim(z2)[1] >0 ){
z2$genes <- apply(z2,1,function(dx){
paste(ps[ps %in% gsets2[[unlist(dx[4])]] ],collapse=",")
})
z2$expgb <- sapply(gsets2[z2$gsetid],function(dx){
paste(dx,collapse=",")
})
#z2$go <- sapply(1:dim(z2)[[1]],function(dx) gost(query = strsplit(z2$genes[dx],split=",",fixed=TRUE)[[1]],
# organism = "hsapiens", ordered_query = FALSE,
# multi_query = FALSE, significant = TRUE,
# measure_underrepresentation = FALSE, evcodes = FALSE,
# custom_bg = strsplit(z2$expgb[dx],split=",",fixed=TRUE)[[1]],
# as_short_link = FALSE))
z2$go <- sapply(1:dim(z2)[[1]],function(dx) unlist(as.data.frame(enrichGO(gene = strsplit(z2$genes[dx],split=",",fixed=TRUE)[[1]],
universe = strsplit(z2$expgb[dx],split=",",fixed=TRUE)[[1]],
OrgDb = org.Hs.eg.db,
ont = "ALL",
keyType = 'SYMBOL',
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05))$ID) )
z2$DO <- sapply(1:dim(z2)[[1]],function(dx) unlist(as.data.frame(enrichDO(gene =
mapIds(org.Hs.eg.db, strsplit(z2$genes[dx],split=",",fixed=TRUE)[[1]], 'ENTREZID', 'SYMBOL'),
universe = mapIds(org.Hs.eg.db, strsplit(z2$expgb[dx],split=",",fixed=TRUE)[[1]], 'ENTREZID', 'SYMBOL'),
ont = "DO"
))$ID) )
# }}
#return(z2)
# }) #lappyly over mfolds
#myEnrich2 <- (do.call("rbind",myEnrich))
#if(dim(myEnrich2)[1] >0 ){
sheetids <- c("top_all_enrich","top_human_enrich","top_mouse_enrich","top_all_expression","top_human_expression","top_mouse_expression","top_all_specificity","top_human_specificity","top_mouse_specificty")
z2$sheet <- sheetids[j] #}
#return(myEnrich2)
return(z2)
}) #for each sheet j
myEnrichList2 <- NULL
myEnrichList2 <- do.call("rbind",myEnrichList)
#if(dim(myEnrichList2)[1]>0){
myEnrichList2$info <- i # strsplit(strsplit(i,"/")[[1]][7],".xlsx")[[1]][1]
#myEnrichList2$info <- "PS_HitGenes_0.1_nologs"
#openxlsx::write.xlsx(myEnrichList2,file="/gpfs/projects/RestGroup/csea/hippcampus.cpg.mkenzie.011823.xlsx",overwrite=TRUE)
print(paste("end",i))
return(myEnrichList2)
})
allOurResults2 <- do.call("rbind",allOurResults1)
#openxlsx::write.xlsx(myEnrichList2,file="/gpfs/projects/RestGroup/csea/McKenzieEnrichALLP.tfbsPos0.1.remap.hippo.xlsx",overwrite=TRUE)
openxlsx::write.xlsx(allOurResults2,file="/gpfs/projects/RestGroup/csea/allMcKenzieEnrichALLP.1216.xlsx",overwrite=TRUE)
save(myEnrichList2,file=paste0("mkenzieEnrich.",strsplit(i,"/")[[1]][7],".rda"))
openxlsx::write.xlsx(myEnrichList2,file=paste0("mkenzieEnrich.",strsplit(i,"/")[[1]][7]),overwrite=TRUE)
}
#read in tsv files from Priya
psSimp <- list.files(path = "/gpfs/projects/RestGroup/csea/EnrichmentGeneTsvFiles", full.names = TRUE,include.dirs = FALSE)
psSimpTsvs <- lapply(psSimp,function(dx)t(read.delim(dx,header=FALSE,row.names=1)))
psSimpT <- data.table(do.call("rbind",psSimpTsvs))
#(1) Descartes dataset - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7780123/
#(2) Fan dataset - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6028726/
#(3) Manno dataset - https://www.ncbi.nlm.nih.gov/pmc/articles/pmid/27716510/
#(4) Zhong dataset - https://www.nature.com/articles/nature25980
psSimp2 <- lapply(unique(psSimpT$PMID),function(dx){
ps2 <- subset(psSimpT,PMID==dx)[,c(1,20)]
ps5 <- apply(ps2,1,function(dxx) {
ps3 <- unlist(strsplit(unlist(dxx[2]),","))
ps4 <- ps3[!(ps3=="")]
return(data.frame(Celltype=unlist(dxx[1]),gene=ps4))
}
)
return(do.call("rbind",ps5))
})
studyNames <- c("Descartes","Fan","Manno","Zhong")
#for(for(i in ps0[c(2,6:13)])){ #for each of OUR results
allOurResults <- lapply(ps0[c(2,6:13)],function(i){
ps <- unique(read.xlsx(i,startRow=2,colNames=FALSE)$X2)
myEnrich <- lapply(1:4,function(q){ #for each of priyas four studies
spec2 <- as.data.table(psSimp2[[q]]) #for datasets 1:4
a <- NULL
a <- split(spec2, by="Celltype")
gsets2 <- lapply(a,function(dx) dx$gene[dx$gene %in% allbgs[[i]]])
z <- NULL
z <- testEnrichment(Goi=ps, gsets=gsets2, background=allbgs[[i]], minknown = 2, mindiffexp = 2,maxknown = 1000)
z2 <- NULL
if(dim(z)[1] >0 ){z$thresh <- studyNames[q]
#z2 <- subset(z,qval<0.05)
#z2 <- subset(z,pval<0.05)
z2 <- z
if(dim(z2)[1] >0 ){
z2$genes <- apply(z2,1,function(dx){
paste(ps[ps %in% gsets2[[unlist(dx[4])]] ],collapse=",")
})
}}
return(z2)
}) #for each of four studies
myEnrich2 <- NULL
myEnrich2 <- (do.call("rbind",myEnrich))
if(dim(myEnrich2)[1] >0 ){
myEnrich2$sheet <- i}
#openxlsx::write.xlsx(myEnrich,file="/gpfs/projects/RestGroup/csea/EnrichmentGeneTsv.TFBS_HitGenes_PosNeg_0.1.hippo.remap.xlsx",overwrite=TRUE)
#myEnrich2$sheet <- "PS_HitGenes_0.1_nologs"
return(myEnrich2)
})
myEnrichList2 <- do.call("rbind",allOurResults)
#openxlsx::write.xlsx(myEnrich2,file="psNoLog.EnrichmentGeneTsvALLRESULTS.xlsx",overwrite=TRUE)
#save(myEnrichList2,file=paste0("mkenzieEnrich.",strsplit(i,"/")[[1]][7],".rda"))
openxlsx::write.xlsx(myEnrichList2,file="EnrichmentGeneTsvALLRESULTS.xlsx",overwrite=TRUE)
###
############
#for mckenzie enrichment of encrichments
allOurResults <- lapply(ps0[c(2,6:13)],function(i){
ps <- unique(read.xlsx(i,startRow=2,colNames=FALSE)$X2)
#myEnrich <- lapply(1:4,function(q){ #for each of priyas four studies
#spec2 <- as.data.table(psSimp2[[q]]) #for datasets 1:4
spec2 <- as.data.table(spec2)
a <- NULL
a <- split(spec2, by="Celltype")
gsets2 <- lapply(a,function(dx) dx$gene[dx$gene %in% allbgs[[i]]])
z <- NULL
z <- testEnrichment(Goi=ps, gsets=gsets2, background=allbgs[[i]], minknown = 2, mindiffexp = 2,maxknown = 1000)
z2 <- NULL
if(dim(z)[1] >0 ){z$thresh <- i # studyNames[q]
#z2 <- subset(z,qval<0.05)
z2 <- subset(z,pval<0.05)
if(dim(z2)[1] >0 ){
z2$genes <- apply(z2,1,function(dx){
paste(ps[ps %in% gsets2[[unlist(dx[4])]] ],collapse=",")
})
}}
return(z2)
})
myEnrichList2 <- do.call("rbind",allOurResults)
#save(myEnrichList2,file=paste0("mkenzieEnrich.",strsplit(i,"/")[[1]][7],".rda"))
openxlsx::write.xlsx(myEnrichList2,file="EnrichmentOfEnrichment.xlsx",overwrite=TRUE)
```
## merge in info about the gene significant in our results
load("mkenzieEnrich.PS_HitGenes_0.1.xlsx.rda") #myEnrichList2
myEnrichList3 <- unique(unlist(sapply(myEnrichList2$genes,function(dx) unlist(strsplit(dx,",")))))
load("/gpfs/scratch/jrest/csea/Results/Results_PS_MCL1.5.rda")
#mimic original list
Results2 <- unique(subset(Results,Padj<0.1)$Gene_Symbol) #444
Results2b <- unique(subset(Results,Padj<0.1)$all_Human_Symbols)
#s1 <- lapply(myEnrichList3,function(dx) #paste0(grep(dx,subset(Results,Padj<0.1)$Gene_Symbol,value=FALSE),collapse=","))
s2 <- unlist(lapply(myEnrichList3,function(dx) Results[grep(dx,subset(Results,Padj<0.1)$all_Human_Symbols,value=FALSE),"OG.strName"]))
s3 <- unique(s2)
s4 <- sapply(s3,function(dx) strsplit(dx,".",fixed=TRUE)[[1]][2])
allStrs <- unique(sapply(Results$OG.strName,function(dx) strsplit(dx,".",fixed=TRUE)[[1]][2]))
allStrs [!allStrs %in% unique(s4) ]
#not found enriched in any
"tmh8" "bsa2" "b8" "bn8"
table(s4)
bsa bsan bsap c3 diso disop e3 e32 e8 en3 en8 esa esan
3 1 3 1 4 2 4 4 2 3 1 4 1
esap g8 gn8 h3 h32 h8 hn3 hn8 msa msa2 msan msap s8
2 1 1 2 5 3 1 1 1 4 5 5 3
sn8 t8 tm tme8 tmn tmp tmp2 tmx8 tn8
2 1 1 2 2 1 5 3 2
# enrichment results
**ONLY Results with q-value < 0.05 are shown. Everything else left empty.**
Sheets 4&5: Grand rank < thresholds tested: 100 200 300 400 500 600 700 800 900 100
Sheets 1,2,7,8: Grand mean > thresholds tested: 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5 6.0
sheet 1: top all enrich
sheet 2: top human enrich
sheet 4: top all expression
sheet 5: top human expression
sheet 7: top all specificity
sheet 8: top human specificity
# Results
# CNV_HitGenes_0.1
## sheet 1 (top all enrich)
## sheet 2 (top human enrich )
## sheet 4 (top all expression )
## sheet 5 (top human expression )
## sheet 7 (top all specificity )
## sheet 8 (top human specificity )
# CombinedIntron_Length_HitGenes_0.1
## sheet 1 (top all enrich)
## sheet 2 (top human enrich )
## sheet 4 (top all expression )
## sheet 5 (top human expression )
## sheet 7 (top all specificity )
## sheet 8 (top human specificity )
# CpG_HitGenes_0.1
## sheet 1 (top all enrich)
## sheet 2 (top human enrich )
## sheet 4 (top all expression )
## sheet 5 (top human expression )
## sheet 7 (top all specificity )
## sheet 8 (top human specificity )
# FirstIntron_Length_HitGenes_0.1
## sheet 1 (top all enrich)
## sheet 2 (top human enrich )
## sheet 4 (top all expression )
## sheet 5 (top human expression )
## deprecated
### script for printing the results to a formatted table...
```
#output file
fc<-"/gpfs/scratch/jrest/csea/output.md"
write("# Results",fc,append=FALSE)
for(i in ps0){
ps <- read.xlsx(i,startRow=2,colNames=FALSE)$X2
write(paste("# ",i),fc,append=TRUE)
#McKenzie data; specify sheet
for(j in c(1,2,4,5,7,8)){
spec1b <- NULL
spec1b <- read.xlsx("/gpfs/scratch/jrest/csea/file1_41598_2018_27293_MOESM2_ESM.xlsx",sheet=j,startRow=3,colNames=TRUE)
write(paste("## sheet ",j),fc,append=TRUE)
#Thresholds to test
if(j %in% c(1,2,7,8)){
myfolds <- seq(0,6,0.5)} #Grand Means: sheets 1,2,7,8
if(j %in% c(4,5)){
myfolds <- seq(100,1000,100)} #Grand Ranks: sheet 4,5
spec2 <- NULL
for(fold in myfolds){
if(j %in% c(1,2,7,8)){
spec2 <- as.data.table(subset(spec1b,grand_mean > fold))} # sheets 1,2,7, 8
if(j %in% c(4,5)){
spec2 <- as.data.table(subset(spec1b,grand_rank < fold))} # sheet 4,5
a <- split(spec2, by="Celltype")
gsets2 <- lapply(a,function(dx) dx$gene[dx$gene %in% bg2])
z <- NULL
z <- testEnrichment(Goi=ps, gsets=gsets2, background=bg2, minknown = 2, mindiffexp = 2,maxknown = 1000)
if(dim(z)[1] >= 1){
z2 <- subset(z,qval<0.05)
if(dim(z2)[1] >= 1){
write(paste("### threshold ",fold),fc,append=TRUE)
# write.table(z2, fc, append=TRUE)
write("```",fc,append=TRUE)
gdata::write.fwf(z2, file = fc, append = TRUE, quote = FALSE, sep = "\t",
colnames = TRUE)
write("```",fc,append=TRUE)
}
}
} #for each threshold
} #for each mckenzie sheet j
} #for of our gene lists i (protein structure, etc.)
```
# Notes
## Background Set
I used the protein structure background for all. This should be fixed; protein structure has a slightly smaller background.
## Only about 70% of McKenzie genes are in our background set.
`sum(gsets2[[1]] %in% bg2)`
>693 of 1000, for example, in ast gset
>others pseudogenes, FISH probe regions, etc.
## structure of gsets
`str(gsets2)`
```
#List of 5
# $ ast: chr [1:693] "AQP4" "BMPR1B" "ETNPPL" "GJB6" ...
# $ end: chr [1:779] "APOLD1" "CD34" "TGM2" "IFI27" ...
# $ mic: chr [1:677] "CCL3" "CCL3L3" "CCL4L1" "CCL4" ...
# $ neu: chr [1:767] "SYNPR" "RELN" "CNR1" "GAD2" ...
# $ oli: chr [1:698] "UGT8" "PLP1" "ERMN" "CNDP1" ...
```
## examples of Goi in gsets
``sum(ps$V1 %in% gsets2[[1]])``
>17 of 502 Goi in gsets2 (length 693) , for example
## deprecated
Code for heatmap
```bash
#1 and 3 and 4 and 5 aren't currently working?
#do for i= 2,6:13; tfbs is 10:13
for(i in ps0[c(2,6:13)]){
#i <- ps0[14]
ps <- NULL
ps <- unique(read.xlsx(i,startRow=2,colNames=FALSE)$X2)
print(paste("new result type",i,"of length",length(ps)))
#Make a plot for each sheet
myEnrichList <- lapply(c(1,2,3,4,5,6,7,8,9),function(j){
print(j)
spec1b <- read.xlsx("/gpfs/scratch/jrest/csea/file1_41598_2018_27293_MOESM2_ESM.xlsx",sheet=j,startRow=3,colNames=TRUE)
if(j %in% c(1,2,3,7,8,9)){
myfolds <- seq(0,7,0.25)} #Grand Means: sheets 1,2,7,8
if(j %in% c(4,5,6)){
myfolds <- seq(100,999,50)} #Grand Ranks: sheet 4,5
myEnrich <- lapply(myfolds,function(fold){
if(j %in% c(1,2,3,7,8,9)){
spec2 <- as.data.table(subset(spec1b,grand_mean > fold))} # sheets 1,2,7, 8
if(j %in% c(4,5,6)){
spec2 <- as.data.table(subset(spec1b,grand_rank < fold))} # sheet 4,5
a <- NULL
a <- split(spec2, by="Celltype")
gsets2 <- lapply(a,function(dx) dx$gene[dx$gene %in% allbgs[[i]]])
z <- NULL
z <- testEnrichment(Goi=ps, gsets=gsets2, background=allbgs[[i]], minknown = 2, mindiffexp = 2,maxknown = 1000)
z2 <- NULL
if(dim(z)[1] >0 ){z$thresh <- fold}
return(z)
}) #lappyly over mfolds
myEnrich2 <- NULL
myEnrich2 <- (do.call("rbind",myEnrich))
enrichPlot <- NULL
enrichPlot <- ggplot(myEnrich2) +
geom_tile(data = myEnrich2 %>% filter(qval < .05), aes(x = gsetid,y = factor(thresh, levels = unique(thresh))),fill = "red") +
geom_tile(data = myEnrich2 %>% filter(qval < .1 & qval >= .05), aes(x = gsetid,y = factor(thresh, levels = unique(thresh))),fill = "pink") +
geom_tile(data = myEnrich2 %>% filter( qval >= .1 & qval < .5), aes(x = gsetid,y = factor(thresh, levels = unique(thresh))),fill = "lightcyan1") +
geom_tile(data = myEnrich2 %>% filter( qval >= .5), aes(x = gsetid,y = factor(thresh, levels = unique(thresh))),fill = "grey") +
xlab("cell type") + ylab(paste("sheet",j,"threshold")) + theme_light()
return(enrichPlot)
}) #for each sheet j
pdf(paste0(strsplit(i,"/")[[1]][7],".enrich.pdf"))
print(ggarrange(myEnrichList[[1]],myEnrichList[[2]],myEnrichList[[3]],myEnrichList[[4]],
myEnrichList[[5]],myEnrichList[[6]],myEnrichList[[7]],myEnrichList[[8]],myEnrichList[[9]], ncol = 3, nrow = 3)
)
dev.off()
}
```
###### tags: `pgwas`