Running xcms analysis with Vincent

--- author: Benjamin Brachi date: 20/05/2020 title: Running xcms analysis with Vincent output: html_document: toc: true toc_float: true theme: yeti --- # analyse xcms: Changement - noise=0 dans CentWaveParam - Dans Camera: `E <- groupCorr(D, calcCiS=T, calcCaS=T, calcIso=T)` - `peaklist=getPeaklist(E)` this was getPeaklist(D) which would have been wrong I imagine. - PeakDensityParam () `sampleGroups = xdata$sample_group` # Analyse Tableau peak après xcms ```{r eval=F} data=readRDS("./data/xsannotated_bourran.rds") d=as.matrix(data[,8:(ncol(data)-3)]) peaks=data[, c(1:8, (ncol(data)-3):ncol(data))] dim(d) dim(peaks) length(unique(peaks$pcgroup)) ##count na in rows... nar=apply(d, 1, function(x)(sum(is.na(x)))) ##count na in cols nac=apply(d, 2, function(x)(sum(is.na(x)))) ##I think NA values should be 0. d[is.na(d)]=0 ##see about intensities per row intmaxr=apply(d, 1,max) hist(intmaxr) hist(intmaxr, breaks=10000, xlim=c(0, 5000)) ``` Je vois pas de raison de mettre un threshold sur intmaxr pour filtrer les pics. ```{r} ##look for peaks with too many 0 nzr=apply(d, 1, function(x, N){sum(x==0)/N}, N=ncol(d)) hist(nzr) ``` Je vois pas de raison de filtré à ce niveau non plus. Now we can filter using the experimental design. let's filter for peaks to be stable in QCs. The expected error in the machine coefficient of variation is 20%. ```{r eval=T, echo=T} samples=readRDS("../../../res/bourran_samples.rds") ##reorder according to data I=intersect(samples$id, colnames(d)) samples=droplevels(samples[match(I, samples$id),]) d=d[,I] all(colnames(d)==samples$id) ##what do we have in terms of samples in the data. table(paste(samples$type)) ##the rows with type NA should be removed. s=(samples$type%in%c("NA", "parents","timeserie")==F) samples <- samples[s,] d <- d[,s] ``` ```{r} ##investigate coefficient of variation in QCs dqc=d[,samples$type=="QC"] ##check that all QC are correct irqc=apply(dqc, 2, sum) hist(irqc) ``` There are QCs that look bad. let's remove them. ```{r} ##remove bad QCs w=names(which(irqc<5e6)) d=d[, colnames(d)%in%w==F] samples=samples[samples$id%in%w==F,] all(samples$id==colnames(d)) ``` ```{r} #resubset to keep only QCs dqc=d[,samples$type=="QC"] rs=rowMeans(dqc) cv=apply(dqc, 1, function(x){sd(x)/mean(x)}) plot(log10(rs), cv) ##add the point for the internal standard. mzstd = 303.0504 winmz = 0.005 w=(peaks$mz >= mzstd - winmz & peaks$mz <= mzstd + winmz & peaks$rt > 450) points(log10(rs)[w], cv[w], pch=16, cex=2, col="gold") cv[w] abline(0.2, 0) dqc=dqc[cv<=0.2,] d=d[cv<=0.2,] peaks=peaks[cv<=0.2,] dim(d) dim(peaks) length(unique(peaks$pcgroup)) ``` We're getting closer. Now we need to keep peaks that are higher in QCs than in blancks ```{r} dbl=d[,samples$type=="blank"] ##are all the blanks, blanks.. ib=apply(dbl, 2,sum) hist(ib) ##no.. there are a few contaminations w=names(which(ib>5e6)) ##remove those. d=d[,colnames(d)%in%w==F] samples=samples[(samples$id%in%w)==F,] all(samples$id==colnames(d)) dbl=d[,samples$type=="blank"] dqc=d[,samples$type=="QC"] ##mesure noise in blanks and compare to qc. Use to filter. ib=apply(dbl, 1, quantile, 0.99) ##quantile à 99% dans les blancs iqc=apply(dqc, 1,mean) w=(iqc>=ib*10) ##signal to noise ratio 100X ##for peaks only, but we can remove them. d=d[w, ] peaks=peaks[w,] ##check we have the same number of peaks and count the number of groups dim(d) dim(peaks) dim(samples) length(unique(peaks$pcgroup)) ``` ```{r} ``` Now we have 101 pseudo-molecules. We can investigate those. ```{r} saveRDS(d, "./data/intensities_filt.rds") saveRDS(peaks, "./data/peaks_filt.rds") saveRDS(samples, "./data/samples_filt.rds") ``` ## Analyses de données ## technical replicability ```{r} d=readRDS("data/intensities_filt.rds") samples=readRDS("data/samples_filt.rds") peaks=readRDS("data/peaks_filt.rds") ##subset dat to keep only techreps samples$techrep[samples$type=="blank"]=NA samples$techrep=as.integer(samples$techrep) u=unique(samples$sample[samples$techrep>1 & is.na(samples$techrep)==F]) drep=d[,samples$sample%in%u] s=droplevels(samples$sample[samples$sample%in%u]) estim_repet=function(trait, s){ repet=c(Fstat=NA, pv=NA) s=as.factor(s) trait=as.numeric(trait) f=formula(paste("trait~ s")) m=lm(f) a=anova(m) as.numeric(a["F value"][1,1])->repet[1] as.numeric(a["Pr(>F)"][1,1])->repet[2] return(repet) } ##estimate replicability for each fragment rep=t(apply(log(drep+1), 1, estim_repet, s=s)) rep=as.data.frame(rep) rep$pv_adj=p.adjust(rep$pv, method="fdr") cols=rep("dodgerblue", nrow(rep)) cols[rep$pv_adj<=0.05]="gold" barplot(rep$Fstat, col=cols, xlab="Pseudo-molecules", ylab="Fstab", border=cols) #hist(rep$Fstat, breaks=100) ##filter avec pv<=0.05 ##filter avec pvalue corrigées <=0.05 s=(rep$pv_adj<=0.05) d=d[s, ] peaks=cbind(peaks, rep) peaks=peaks[s,] saveRDS(d, "./data/intensities_filt_rep.rds") saveRDS(peaks, "./data/peaks_filt_rep.rds") saveRDS(samples, "./data/samples_filt_rep.rds") ``` ### PCA ```{r} library(plyr) d=readRDS("data/intensities_filt_rep.rds") samples=readRDS("data/samples_filt_rep.rds") peaks=readRDS("data/peaks_filt_rep.rds") ##take highest peak per pcgroup as the representative peak. #Intmax = rowSums(d,na.rm = TRUE) funmax=function(x){return(x[which.max(x$Fstat)[1],])} data=data.frame(peak=paste("peak_", peaks$X1, sep=""), pcgroup=paste("group_", peaks$pcgroup, sep=""),Intmax=Intmax,d) res <-ddply(data,"pcgroup",funmax) ## dpm=as.matrix(res[, -1:-3]) row.names(dpm)=res$pcgroup dpm=t(dpm) ppm=peaks[match(res$peak, paste("peak_", peaks$X1, sep="")),] ##preparer pour l'ACP library(FactoMineR) library(factoextra) dpmsq = sqrt(dpm + 0.001) dpmsq = scale(dpmsq) res.pca <- PCA(dpmsq,ncp = 3,scale.unit = FALSE, graph=F) fviz_eig(res.pca) fviz_pca_var(res.pca, col.var = "black") fviz_contrib(res.pca, choice = "var", axes = 1, top = 50) fviz_contrib(res.pca, choice = "var", axes = 2, top = 50) fviz_contrib(res.pca, choice = "var", axes = 3, top = 50) fviz_contrib(res.pca, choice = "var", axes = 1:2, top = 10) fviz_pca_ind(res.pca, geom.ind = c("point"), col.ind = samples$type, # colorer by groups addEllipses = FALSE, # Ellipses de concentration legend.title = "Type", title= "representation des individus en fonction du type d'echantillons" ) ``` The diffence between the blanks and the other samples is well captured by the first axis explaining over 80\% of the variance. The problem is that the cloud of points looking like blanks doesn't only include blanks. There are bad samples left in the data! Let's remove them. ```{r} samples=droplevels(samples) rs=rowSums(dpm) boxplot(rs~samples$type) stripchart(rs~samples$type,add=T, pch=16, col="Dodgerblue" , vertical=T, method="jitter") abline(5e4, 0) ``` Some samples look like blanks, which suggest injection didn't work. We can remove all the blanks and all of these by removing all rows with rs<3e6 ```{r} ## now without the blanks! unique(samples$type) s=(samples$type%in%c("allgeno", "reps", "QC") & rs>=3e6) dpm=dpm[s,] samples=samples[s,] all(samples$id==row.names(dpm)) dpmsq = sqrt(dpm + 0.001) dpmsq = scale(dpmsq) all(samples$id==row.names(dpmsq)) res.pca <- PCA(dpmsq,ncp = 3,scale.unit = FALSE, graph=F) fviz_eig(res.pca) fviz_contrib(res.pca, choice = "var", axes = 1, top = 50) fviz_contrib(res.pca, choice = "var", axes = 2, top = 50) fviz_contrib(res.pca, choice = "var", axes = 3, top = 50) fviz_pca_ind(res.pca, geom.ind = c("point"), col.ind = samples$type, # colorer by groups addEllipses = T, # Ellipses de concentration legend.title = "Type", title= "representation des individus en fonction du type d'echantillons" ) samples$height2=paste(samples$height) samples$height2[samples$type=="QC"]<-"QC" fviz_pca_ind(res.pca, geom.ind = c("point"), col.ind = samples$height2, # colorer by groups addEllipses = FALSE, # Ellipses de concentration legend.title = "Type", title= "representation des individus en fonction du type d'echantillons" ) fviz_pca_ind(res.pca, geom.ind = c("point"), col.ind = samples$height2, # colorer by groups addEllipses = TRUE, # Ellipses de concentration legend.title = "Type", title= "representation des individus en fonction du type d'echantillons" ) ``` ## test de l'effet genotype. ```{r} dpm=readRDS("./data/d_filt_rep_pm.rds") samples=readRDS("./data/samples_filt_rep_pm.rds") ppm=readRDS("./data/peaks_filt_rep_pm.rds") ##garder unique les genotype "allgeno" ## s=(samples$type=="allgeno") ## dpm=dpm[s, ] ## samples=samples[s,] ##identifier les genotype repliqué. tab=table(samples$genotype) u=names(tab[tab>=7]) s=(samples$genotype%in%u & samples$techrep==1) dat=dpm[s,] sam=droplevels(samples[s,]) table(sam$genotype) table(sam$techrep) ``` ``` estim_geno=function(trait, height, genotype){ trait=as.numeric(trait) height=as.factor(height) genotype=as.factor(genotype) f=formula(paste("trait~ height+genotype+height*genotype")) m=lm(f) a=anova(m) return(a) } ##estimate replicability for each fragment res=apply(log(dat+1), 2, estim_geno, height=sam$height, genotype=sam$genotype) library(plyr) effgeno=ldply(res, function(x){data.frame(Fstat=x["F value"]["genotype",], pv=x["Pr(>F)"]["genotype",])}) effgeno$pv_adj=p.adjust(effgeno$pv, method="fdr") cols=rep("dodgerblue", nrow(effgeno)) cols[effgeno$pv_adj<=0.05]="gold" barplot(effgeno$Fstat, col=cols, xlab="Pseudo-molecules", ylab="Fstab", border=cols, names.arg=effgeno[,1])

Syntax	Example	Reference
# Header	Header	基本排版
- Unordered List	Unordered List
1. Ordered List	Ordered List
- [ ] Todo List	Todo List
> Blockquote	Blockquote
Bold font	Bold font
Italics font	Italics font
~~Strikethrough~~	~~Strikethrough~~
19^th^	19^th
H~2~O	H₂O
++Inserted text++	Inserted text
==Marked text==	Marked text
[link text](https:// "title")	Link
![image alt](https:// "title")	Image
`Code`	`Code`	在筆記中貼入程式碼
```javascript var i = 0; ```	`var i = 0;`
:smile:		Emoji list
{%youtube youtube_id %}	Externals
$L^aT_eX$	L^aT_eX
:::info This is a alert area. :::	This is a alert area.